diff options
Diffstat (limited to 'usr')
195 files changed, 28628 insertions, 16272 deletions
diff --git a/usr/src/cmd/cmd-inet/etc/sock2path b/usr/src/cmd/cmd-inet/etc/sock2path index 425d6c8006..aba55bb652 100644 --- a/usr/src/cmd/cmd-inet/etc/sock2path +++ b/usr/src/cmd/cmd-inet/etc/sock2path @@ -1,9 +1,8 @@ # CDDL HEADER START # # The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. @@ -18,39 +17,37 @@ # # CDDL HEADER END # -# Copyright 2004 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" -# # socket configuration information # -# Family Type Protocol Path - 2 2 0 /dev/tcp - 2 2 6 /dev/tcp +# Family Type Protocol Dev|Module + 2 2 0 tcp + 2 2 6 tcp - 26 2 0 /dev/tcp6 - 26 2 6 /dev/tcp6 + 26 2 0 tcp + 26 2 6 tcp - 2 1 0 /dev/udp - 2 1 17 /dev/udp + 2 1 0 udp + 2 1 17 udp - 26 1 0 /dev/udp6 - 26 1 17 /dev/udp6 + 26 1 0 udp + 26 1 17 udp 1 2 0 /dev/ticotsord 1 6 0 /dev/ticotsord 1 1 0 /dev/ticlts - 2 4 0 /dev/rawip - 26 4 0 /dev/rawip6 + 2 4 0 icmp + 26 4 0 icmp - 2 2 132 /dev/sctp - 26 2 132 /dev/sctp6 - 2 6 132 /dev/sctp - 26 6 132 /dev/sctp6 + 2 2 132 socksctp + 26 2 132 socksctp + 2 6 132 socksctp + 26 6 132 socksctp - 24 4 0 /dev/rts + 24 4 0 rts 27 4 2 /dev/keysock 28 2 0 /dev/nca diff --git a/usr/src/cmd/cmd-inet/usr.bin/netstat/unix.c b/usr/src/cmd/cmd-inet/usr.bin/netstat/unix.c index 5e7afa8e3d..175310a9a6 100644 --- a/usr/src/cmd/cmd-inet/usr.bin/netstat/unix.c +++ b/usr/src/cmd/cmd-inet/usr.bin/netstat/unix.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common 
Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2001 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -37,8 +36,6 @@ * contributors. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * code for netstat's -k option * @@ -130,8 +127,8 @@ print_kn(kstat_t *ksp) (void) printf("\nActive UNIX domain sockets\n"); (void) printf("%-8.8s %-10.10s %8.8s %8.8s " - "Local Addr Remote Addr\n", - "Address", "Type", "Vnode", "Conn"); + "Local Addr Remote Addr\n", + "Address", "Type", "Vnode", "Conn"); /* for each sockinfo structure, display what we need: */ for (i = 0; i < ksp->ks_ndata; i++) { @@ -164,13 +161,13 @@ print_kn(kstat_t *ksp) if ((psi->si_state & SS_ISBOUND) && strlen(psi->si_laddr_sun_path) != 0 && psi->si_laddr_soa_len != 0) { - if (psi->si_state & SS_FADDR_NOXLATE) { + if (psi->si_faddr_noxlate) { (void) printf(" (socketpair) "); } else { if (psi->si_laddr_soa_len > - sizeof (psi->si_laddr_family)) + sizeof (psi->si_laddr_family)) (void) printf("%s ", - psi->si_laddr_sun_path); + psi->si_laddr_sun_path); else (void) printf(" "); } @@ -182,13 +179,13 @@ print_kn(kstat_t *ksp) strlen(psi->si_faddr_sun_path) != 0 && psi->si_faddr_soa_len != 0) { - if (psi->si_state & SS_FADDR_NOXLATE) { + if (psi->si_faddr_noxlate) { (void) printf(" (socketpair) "); } else { if (psi->si_faddr_soa_len > - sizeof (psi->si_faddr_family)) + sizeof (psi->si_faddr_family)) (void) printf("%s ", - psi->si_faddr_sun_path); + psi->si_faddr_sun_path); else (void) printf(" "); } diff --git 
a/usr/src/cmd/cmd-inet/usr.sbin/soconfig.c b/usr/src/cmd/cmd-inet/usr.sbin/soconfig.c index 5d3838623f..b5c45f7b6f 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/soconfig.c +++ b/usr/src/cmd/cmd-inet/usr.sbin/soconfig.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright (c) 1991-1996,2001 by Sun Microsystems, Inc. - * All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <stdio.h> #include <sys/stat.h> #include <stdlib.h> @@ -40,12 +37,12 @@ * Usage: * sonconfig -f <file> * Reads input from file. The file is structured as - * <fam> <type> <protocol> <path> + * <fam> <type> <protocol> <path|module> * <fam> <type> <protocol> * with the first line registering and the second line * deregistering. 
* - * soconfig <fam> <type> <protocol> <path> + * soconfig <fam> <type> <protocol> <path|module> * registers * * soconfig <fam> <type> <protocol> @@ -99,9 +96,9 @@ static void usage(void) { fprintf(stderr, gettext( - "Usage: soconfig -f <file>\n" - "\tsoconfig <fam> <type> <protocol> <path>\n" - "\tsoconfig <fam> <type> <protocol>\n")); + "Usage: soconfig -f <file>\n" + "\tsoconfig <fam> <type> <protocol> <path|module>\n" + "\tsoconfig <fam> <type> <protocol>\n")); } /* @@ -131,7 +128,7 @@ parse_file(char *filename) linecount++; strcpy(pline, line); argcount = split_line(pline, argvec, - sizeof (argvec) / sizeof (argvec[0])); + sizeof (argvec) / sizeof (argvec[0])); #ifdef DEBUG { int i; @@ -147,18 +144,18 @@ parse_file(char *filename) break; case 3: numerror += parse_params(argvec[0], argvec[1], - argvec[2], NULL, linecount); + argvec[2], NULL, linecount); break; case 4: numerror += parse_params(argvec[0], argvec[1], - argvec[2], argvec[3], linecount); + argvec[2], argvec[3], linecount); break; default: numerror++; fprintf(stderr, - gettext("Malformed line: <%s>\n"), line); + gettext("Malformed line: <%s>\n"), line); fprintf(stderr, - gettext("\ton line %d\n"), linecount); + gettext("\ton line %d\n"), linecount); break; } } @@ -223,7 +220,7 @@ parse_params(char *famstr, char *typestr, char *protostr, char *path, int line) fprintf(stderr, gettext("Bad family number: %s\n"), famstr); if (line != -1) fprintf(stderr, - gettext("\ton line %d\n"), line); + gettext("\ton line %d\n"), line); else { fprintf(stderr, "\n"); usage(); @@ -234,10 +231,10 @@ parse_params(char *famstr, char *typestr, char *protostr, char *path, int line) type = parse_int(typestr); if (type == -1) { fprintf(stderr, - gettext("Bad socket type number: %s\n"), typestr); + gettext("Bad socket type number: %s\n"), typestr); if (line != -1) fprintf(stderr, - gettext("\ton line %d\n"), line); + gettext("\ton line %d\n"), line); else { fprintf(stderr, "\n"); usage(); @@ -248,10 +245,10 @@ 
parse_params(char *famstr, char *typestr, char *protostr, char *path, int line) protocol = parse_int(protostr); if (protocol == -1) { fprintf(stderr, - gettext("Bad protocol number: %s\n"), protostr); + gettext("Bad protocol number: %s\n"), protostr); if (line != -1) fprintf(stderr, - gettext("\ton line %d\n"), line); + gettext("\ton line %d\n"), line); else { fprintf(stderr, "\n"); usage(); @@ -263,11 +260,12 @@ parse_params(char *famstr, char *typestr, char *protostr, char *path, int line) if (path != NULL) { struct stat stats; - if (stat(path, &stats) == -1) { + if (strncmp(path, "/dev", strlen("/dev")) == 0 && + stat(path, &stats) == -1) { perror(path); if (line != -1) fprintf(stderr, - gettext("\ton line %d\n"), line); + gettext("\ton line %d\n"), line); else { fprintf(stderr, "\n"); usage(); @@ -278,7 +276,7 @@ parse_params(char *famstr, char *typestr, char *protostr, char *path, int line) #ifdef DEBUG printf("not calling sockconfig(%d, %d, %d, %s)\n", - fam, type, protocol, path == NULL ? "(null)" : path); + fam, type, protocol, path == NULL ? "(null)" : path); #else if (_sockconfig(fam, type, protocol, path) == -1) { perror("sockconfig"); diff --git a/usr/src/cmd/mdb/Makefile.common b/usr/src/cmd/mdb/Makefile.common index ed27426b8d..bb341fdc8f 100644 --- a/usr/src/cmd/mdb/Makefile.common +++ b/usr/src/cmd/mdb/Makefile.common @@ -87,6 +87,7 @@ COMMON_MODULES_KVM = \ sdbc \ smbfs \ smbsrv \ + sockfs \ specfs \ sppp \ stmf \ diff --git a/usr/src/cmd/mdb/common/modules/genunix/net.c b/usr/src/cmd/mdb/common/modules/genunix/net.c index c8785ed796..987e3b52a0 100644 --- a/usr/src/cmd/mdb/common/modules/genunix/net.c +++ b/usr/src/cmd/mdb/common/modules/genunix/net.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <mdb/mdb_modapi.h> #include <mdb/mdb_ks.h> #include <mdb/mdb_ctf.h> @@ -50,6 +48,7 @@ #include <inet/arp_impl.h> #include <inet/rawip_impl.h> #include <inet/mi.h> +#include <fs/sockfs/socktpi_impl.h> #define ADDR_V6_WIDTH 23 #define ADDR_V4_WIDTH 15 @@ -248,7 +247,7 @@ sonode_walk_init(mdb_walk_state_t *wsp) } } - wsp->walk_data = mdb_alloc(sizeof (struct sonode), UM_SLEEP); + wsp->walk_data = mdb_alloc(sizeof (struct sotpi_sonode), UM_SLEEP); return (WALK_NEXT); } @@ -256,12 +255,12 @@ int sonode_walk_step(mdb_walk_state_t *wsp) { int status; - struct sonode *sonodep; + struct sotpi_sonode *stp; if (wsp->walk_addr == NULL) return (WALK_DONE); - if (mdb_vread(wsp->walk_data, sizeof (struct sonode), + if (mdb_vread(wsp->walk_data, sizeof (struct sotpi_sonode), wsp->walk_addr) == -1) { mdb_warn("failed to read sonode at %p", wsp->walk_addr); return (WALK_ERR); @@ -270,16 +269,16 @@ sonode_walk_step(mdb_walk_state_t *wsp) status = wsp->walk_callback(wsp->walk_addr, wsp->walk_data, wsp->walk_cbdata); - sonodep = wsp->walk_data; + stp = wsp->walk_data; - wsp->walk_addr = (uintptr_t)sonodep->so_next; + wsp->walk_addr = (uintptr_t)stp->st_info.sti_next_so; return (status); } void sonode_walk_fini(mdb_walk_state_t *wsp) { - mdb_free(wsp->walk_data, sizeof (struct sonode)); + mdb_free(wsp->walk_data, sizeof (struct sotpi_sonode)); } struct mi_walk_data { @@ -517,9 +516,9 @@ sonode(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) mdb_printf(" %4hi", so.so_type); } - mdb_printf(" %5hi %05x %04x %04hx %0?p\n", + mdb_printf(" %5hi %05x %04x %04hx\n", so.so_protocol, so.so_state, so.so_mode, - so.so_flag, so.so_accessvp); + so.so_flag); return (DCMD_OK); } @@ -740,12 +739,13 @@ netstat_udpv6_cb(uintptr_t kaddr, const void *walk_data, void *cb_data) * returns 0 on success, -1 otherwise */ static int -netstat_unix_name_pr(const struct sonode *so, const struct soaddr *soa) +netstat_unix_name_pr(const struct 
sotpi_sonode *st, const struct soaddr *soa) { + const struct sonode *so = &st->st_sonode; const char none[] = " (none)"; if ((so->so_state & SS_ISBOUND) && (soa->soa_len != 0)) { - if (so->so_state & SS_FADDR_NOXLATE) { + if (st->st_info.sti_faddr_noxlate) { mdb_printf("%-14s ", " (socketpair)"); } else { if (soa->soa_len > sizeof (sa_family_t)) { @@ -775,9 +775,11 @@ netstat_unix_name_pr(const struct sonode *so, const struct soaddr *soa) static int netstat_unix_cb(uintptr_t kaddr, const void *walk_data, void *cb_data) { - const struct sonode *so = walk_data; + const struct sotpi_sonode *st = walk_data; + const struct sonode *so = &st->st_sonode; + const struct sotpi_info *sti = &st->st_info; - if (so->so_accessvp == NULL) + if (so->so_count == 0) return (WALK_NEXT); if (so->so_family != AF_UNIX) { @@ -787,7 +789,7 @@ netstat_unix_cb(uintptr_t kaddr, const void *walk_data, void *cb_data) mdb_printf("%-?p ", kaddr); - switch (so->so_serv_type) { + switch (sti->sti_serv_type) { case T_CLTS: mdb_printf("%-10s ", "dgram"); break; @@ -798,27 +800,27 @@ netstat_unix_cb(uintptr_t kaddr, const void *walk_data, void *cb_data) mdb_printf("%-10s ", "stream-ord"); break; default: - mdb_printf("%-10i ", so->so_serv_type); + mdb_printf("%-10i ", sti->sti_serv_type); } if ((so->so_state & SS_ISBOUND) && - (so->so_ux_laddr.soua_magic == SOU_MAGIC_EXPLICIT)) { - mdb_printf("%0?p ", so->so_ux_laddr.soua_vp); + (sti->sti_ux_laddr.soua_magic == SOU_MAGIC_EXPLICIT)) { + mdb_printf("%0?p ", sti->sti_ux_laddr.soua_vp); } else { mdb_printf("%0?p ", NULL); } if ((so->so_state & SS_ISCONNECTED) && - (so->so_ux_faddr.soua_magic == SOU_MAGIC_EXPLICIT)) { - mdb_printf("%0?p ", so->so_ux_faddr.soua_vp); + (sti->sti_ux_faddr.soua_magic == SOU_MAGIC_EXPLICIT)) { + mdb_printf("%0?p ", sti->sti_ux_faddr.soua_vp); } else { mdb_printf("%0?p ", NULL); } - if (netstat_unix_name_pr(so, &so->so_laddr) == -1) + if (netstat_unix_name_pr(st, &sti->sti_laddr) == -1) return (WALK_ERR); - if 
(netstat_unix_name_pr(so, &so->so_faddr) == -1) + if (netstat_unix_name_pr(st, &sti->sti_faddr) == -1) return (WALK_ERR); mdb_printf("%4i\n", so->so_zoneid); diff --git a/usr/src/cmd/mdb/common/modules/genunix/vfs.c b/usr/src/cmd/mdb/common/modules/genunix/vfs.c index 5c5fc3361e..b12cdca0c9 100644 --- a/usr/src/cmd/mdb/common/modules/genunix/vfs.c +++ b/usr/src/cmd/mdb/common/modules/genunix/vfs.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <mdb/mdb_modapi.h> #include <mdb/mdb_ks.h> @@ -47,6 +45,11 @@ #include <sys/socketvar.h> #include <sys/strsubr.h> #include <sys/un.h> +#include <fs/sockfs/socktpi_impl.h> +#include <inet/ipclassifier.h> +#include <inet/ip_if.h> +#include <inet/sctp/sctp_impl.h> +#include <inet/sctp/sctp_addr.h> int vfs_walk_init(mdb_walk_state_t *wsp) @@ -173,7 +176,7 @@ read_fsname(uintptr_t vfsp, char *fsname) #define FSINFO_MNTLEN 56 #endif -/*ARGSUSED*/ +/* ARGSUSED */ int fsinfo(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) { @@ -387,14 +390,14 @@ pfiles_print_addr(struct sockaddr *addr) switch (addr->sa_family) { case AF_INET: - /*LINTED: alignment*/ + /* LINTED: alignment */ s_in = (struct sockaddr_in *)addr; mdb_nhconvert(&port, &s_in->sin_port, sizeof (port)); mdb_printf("AF_INET %I %d ", s_in->sin_addr.s_addr, port); break; case AF_INET6: - /*LINTED: alignment*/ + /* LINTED: alignment */ s_in6 = (struct sockaddr_in6 *)addr; mdb_nhconvert(&port, &s_in6->sin6_port, sizeof (port)); mdb_printf("AF_INET6 %N %d ", &(s_in6->sin6_addr), port); @@ -410,31 +413,39 @@ pfiles_print_addr(struct sockaddr *addr) } } - static int -pfiles_get_sonode(uintptr_t vp, struct sonode *sonode) +pfiles_get_sonode(vnode_t *v_sock, struct sonode *sonode) { - vnode_t v; - struct stdata stream; - - if (mdb_vread(&v, sizeof (v), vp) == -1) { 
- mdb_warn("failed to read socket vnode"); + if (mdb_vread(sonode, sizeof (struct sonode), + (uintptr_t)v_sock->v_data) == -1) { + mdb_warn("failed to read sonode"); return (-1); } - if (mdb_vread(&stream, sizeof (stream), (uintptr_t)v.v_stream) == -1) { + return (0); +} + +static int +pfiles_get_tpi_sonode(vnode_t *v_sock, sotpi_sonode_t *sotpi_sonode) +{ + + struct stdata stream; + + if (mdb_vread(&stream, sizeof (stream), + (uintptr_t)v_sock->v_stream) == -1) { mdb_warn("failed to read stream data"); return (-1); } - if (mdb_vread(&v, sizeof (v), (uintptr_t)stream.sd_vnode) == -1) { + if (mdb_vread(v_sock, sizeof (vnode_t), + (uintptr_t)stream.sd_vnode) == -1) { mdb_warn("failed to read stream vnode"); return (-1); } - if (mdb_vread(sonode, sizeof (struct sonode), - (uintptr_t)v.v_data) == -1) { - mdb_warn("failed to read sonode"); + if (mdb_vread(sotpi_sonode, sizeof (sotpi_sonode_t), + (uintptr_t)v_sock->v_data) == -1) { + mdb_warn("failed to read sotpi_sonode"); return (-1); } @@ -470,16 +481,20 @@ pfiles_dig_pathname(uintptr_t vp, char *path) /* * For sockets, we won't find a path unless we print the path - * associated with the accessvp. + * associated with transport's STREAM device. */ if (v.v_type == VSOCK) { struct sonode sonode; - if (pfiles_get_sonode(vp, &sonode) == -1) { + if (pfiles_get_sonode(&v, &sonode) == -1) { return (-1); } - - vp = (uintptr_t)sonode.so_accessvp; + if (!SOCK_IS_NONSTR(&sonode)) { + struct sockparams *sp = sonode.so_sockparams; + vp = (uintptr_t)sp->sp_sdev_info.sd_vnode; + } else { + vp = NULL; + } } } @@ -531,6 +546,364 @@ struct pfiles_cbdata { int fd; }; +#define list_d2l(a, obj) ((list_node_t *)(((char *)obj) + (a)->list_offset)) +#define list_object(a, node) ((void *)(((char *)node) - (a)->list_offset)) + +/* + * SCTP interface for geting the first source address of a sctp_t. 
+ */ +int +sctp_getsockaddr(sctp_t *sctp, struct sockaddr *addr) +{ + int err = -1; + int i; + int l; + sctp_saddr_ipif_t *pobj; + sctp_saddr_ipif_t obj; + size_t added = 0; + sin6_t *sin6; + sin_t *sin4; + int scanned = 0; + boolean_t skip_lback = B_FALSE; + + addr->sa_family = sctp->sctp_family; + if (sctp->sctp_nsaddrs == 0) + goto done; + + /* + * Skip loopback addresses for non-loopback assoc. + */ + if (sctp->sctp_state >= SCTPS_ESTABLISHED && !sctp->sctp_loopback) { + skip_lback = B_TRUE; + } + + for (i = 0; i < SCTP_IPIF_HASH; i++) { + if (sctp->sctp_saddrs[i].ipif_count == 0) + continue; + + pobj = list_object(&sctp->sctp_saddrs[i].sctp_ipif_list, + sctp->sctp_saddrs[i].sctp_ipif_list.list_head.list_next); + if (mdb_vread(&obj, sizeof (sctp_saddr_ipif_t), + (uintptr_t)pobj) == -1) { + mdb_warn("failed to read sctp_saddr_ipif_t"); + return (err); + } + + for (l = 0; l < sctp->sctp_saddrs[i].ipif_count; l++) { + sctp_ipif_t ipif; + in6_addr_t laddr; + list_node_t *pnode; + list_node_t node; + + if (mdb_vread(&ipif, sizeof (sctp_ipif_t), + (uintptr_t)obj.saddr_ipifp) == -1) { + mdb_warn("failed to read sctp_ipif_t"); + return (err); + } + laddr = ipif.sctp_ipif_saddr; + + scanned++; + if ((ipif.sctp_ipif_state == SCTP_IPIFS_CONDEMNED) || + SCTP_DONT_SRC(&obj) || + (ipif.sctp_ipif_ill->sctp_ill_flags & + PHYI_LOOPBACK) && skip_lback) { + if (scanned >= sctp->sctp_nsaddrs) + goto done; + + /* LINTED: alignment */ + pnode = list_d2l(&sctp->sctp_saddrs[i]. + sctp_ipif_list, pobj); + if (mdb_vread(&node, sizeof (list_node_t), + (uintptr_t)pnode) == -1) { + mdb_warn("failed to read list_node_t"); + return (err); + } + pobj = list_object(&sctp->sctp_saddrs[i]. 
+ sctp_ipif_list, node.list_next); + if (mdb_vread(&obj, sizeof (sctp_saddr_ipif_t), + (uintptr_t)pobj) == -1) { + mdb_warn("failed to read " + "sctp_saddr_ipif_t"); + return (err); + } + continue; + } + + switch (sctp->sctp_family) { + case AF_INET: + /* LINTED: alignment */ + sin4 = (sin_t *)addr; + if ((sctp->sctp_state <= SCTPS_LISTEN) && + sctp->sctp_bound_to_all) { + sin4->sin_addr.s_addr = INADDR_ANY; + sin4->sin_port = sctp->sctp_lport; + } else { + sin4 += added; + sin4->sin_family = AF_INET; + sin4->sin_port = sctp->sctp_lport; + IN6_V4MAPPED_TO_INADDR(&laddr, + &sin4->sin_addr); + } + break; + + case AF_INET6: + /* LINTED: alignment */ + sin6 = (sin6_t *)addr; + if ((sctp->sctp_state <= SCTPS_LISTEN) && + sctp->sctp_bound_to_all) { + bzero(&sin6->sin6_addr, + sizeof (sin6->sin6_addr)); + sin6->sin6_port = sctp->sctp_lport; + } else { + sin6 += added; + sin6->sin6_family = AF_INET6; + sin6->sin6_port = sctp->sctp_lport; + sin6->sin6_addr = laddr; + } + sin6->sin6_flowinfo = sctp->sctp_ip6h->ip6_vcf & + ~IPV6_VERS_AND_FLOW_MASK; + sin6->sin6_scope_id = 0; + sin6->__sin6_src_id = 0; + break; + } + added++; + if (added >= 1) { + err = 0; + goto done; + } + if (scanned >= sctp->sctp_nsaddrs) + goto done; + + /* LINTED: alignment */ + pnode = list_d2l(&sctp->sctp_saddrs[i].sctp_ipif_list, + pobj); + if (mdb_vread(&node, sizeof (list_node_t), + (uintptr_t)pnode) == -1) { + mdb_warn("failed to read list_node_t"); + return (err); + } + pobj = list_object(&sctp->sctp_saddrs[i]. + sctp_ipif_list, node.list_next); + if (mdb_vread(&obj, sizeof (sctp_saddr_ipif_t), + (uintptr_t)pobj) == -1) { + mdb_warn("failed to read sctp_saddr_ipif_t"); + return (err); + } + } + } +done: + return (err); +} + +/* + * SCTP interface for geting the primary peer address of a sctp_t. 
+ */ +static int +sctp_getpeeraddr(sctp_t *sctp, struct sockaddr *addr) +{ + struct sockaddr_in *sin4; + struct sockaddr_in6 *sin6; + sctp_faddr_t sctp_primary; + in6_addr_t faddr; + + if (sctp->sctp_faddrs == NULL) + return (-1); + + addr->sa_family = sctp->sctp_family; + if (mdb_vread(&sctp_primary, sizeof (sctp_faddr_t), + (uintptr_t)sctp->sctp_primary) == -1) { + mdb_warn("failed to read sctp primary faddr"); + return (-1); + } + faddr = sctp_primary.faddr; + + switch (sctp->sctp_family) { + case AF_INET: + /* LINTED: alignment */ + sin4 = (struct sockaddr_in *)addr; + IN6_V4MAPPED_TO_INADDR(&faddr, &sin4->sin_addr); + sin4->sin_port = sctp->sctp_fport; + sin4->sin_family = AF_INET; + break; + + case AF_INET6: + /* LINTED: alignment */ + sin6 = (struct sockaddr_in6 *)addr; + sin6->sin6_addr = faddr; + sin6->sin6_port = sctp->sctp_fport; + sin6->sin6_family = AF_INET6; + sin6->sin6_flowinfo = 0; + sin6->sin6_scope_id = 0; + sin6->__sin6_src_id = 0; + break; + } + + return (0); +} + +static int +tpi_sock_print(sotpi_sonode_t *sotpi_sonode) +{ + if (sotpi_sonode->st_info.sti_laddr_valid == 1) { + struct sockaddr *laddr = + mdb_alloc(sotpi_sonode->st_info.sti_laddr_len, UM_SLEEP | UM_GC); + if (mdb_vread(laddr, sotpi_sonode->st_info.sti_laddr_len, + (uintptr_t)sotpi_sonode->st_info.sti_laddr_sa) == -1) { + mdb_warn("failed to read sotpi_sonode socket addr"); + return (-1); + } + + mdb_printf("socket: "); + pfiles_print_addr(laddr); + } + + if (sotpi_sonode->st_info.sti_faddr_valid == 1) { + struct sockaddr *faddr = + mdb_alloc(sotpi_sonode->st_info.sti_faddr_len, UM_SLEEP | UM_GC); + if (mdb_vread(faddr, sotpi_sonode->st_info.sti_faddr_len, + (uintptr_t)sotpi_sonode->st_info.sti_faddr_sa) == -1) { + mdb_warn("failed to read sotpi_sonode remote addr"); + return (-1); + } + + mdb_printf("remote: "); + pfiles_print_addr(faddr); + } + + return (0); +} + +static int +tcpip_sock_print(struct sonode *socknode) +{ + switch (socknode->so_family) { + case AF_INET: + { + conn_t conn_t; +
in_port_t port; + + if (mdb_vread(&conn_t, sizeof (conn_t), + (uintptr_t)socknode->so_proto_handle) == -1) { + mdb_warn("failed to read conn_t V4"); + return (-1); + } + + mdb_printf("socket: "); + mdb_nhconvert(&port, &conn_t.conn_lport, sizeof (port)); + mdb_printf("AF_INET %I %d ", conn_t.conn_src, port); + + /* + * If this is a listening socket, we don't print + * the remote address. + */ + if (IPCL_IS_TCP(&conn_t) && IPCL_IS_BOUND(&conn_t) == 0 || + IPCL_IS_UDP(&conn_t) && IPCL_IS_CONNECTED(&conn_t)) { + mdb_printf("remote: "); + mdb_nhconvert(&port, &conn_t.conn_fport, sizeof (port)); + mdb_printf("AF_INET %I %d ", conn_t.conn_rem, port); + } + + break; + } + + case AF_INET6: + { + conn_t conn_t; + in_port_t port; + + if (mdb_vread(&conn_t, sizeof (conn_t), + (uintptr_t)socknode->so_proto_handle) == -1) { + mdb_warn("failed to read conn_t V6"); + return (-1); + } + + mdb_printf("socket: "); + mdb_nhconvert(&port, &conn_t.conn_lport, sizeof (port)); + mdb_printf("AF_INET6 %N %d ", &conn_t.conn_srcv6, port); + + /* + * If this is a listening socket, we don't print + * the remote address. + */ + if (IPCL_IS_TCP(&conn_t) && IPCL_IS_BOUND(&conn_t) == 0 || + IPCL_IS_UDP(&conn_t) && IPCL_IS_CONNECTED(&conn_t)) { + mdb_printf("remote: "); + mdb_nhconvert(&port, &conn_t.conn_fport, sizeof (port)); + mdb_printf("AF_INET6 %N %d ", &conn_t.conn_remv6, port); + } + + break; + } + + default: + mdb_printf("AF_?? 
(%d)", socknode->so_family); + break; + } + + return (0); +} + +static int +sctp_sock_print(struct sonode *socknode) +{ + sctp_t sctp_t; + + struct sockaddr *laddr = mdb_alloc(sizeof (sin6_t), UM_SLEEP | UM_GC); + struct sockaddr *faddr = mdb_alloc(sizeof (sin6_t), UM_SLEEP | UM_GC); + + if (mdb_vread(&sctp_t, sizeof (sctp_t), + (uintptr_t)socknode->so_proto_handle) == -1) { + mdb_warn("failed to read sctp_t"); + return (-1); + } + + if (sctp_getsockaddr(&sctp_t, laddr) == 0) { + mdb_printf("socket: "); + pfiles_print_addr(laddr); + } + if (sctp_getpeeraddr(&sctp_t, faddr) == 0) { + mdb_printf("remote: "); + pfiles_print_addr(faddr); + } + + return (0); +} + +/* ARGSUSED */ +static int +sdp_sock_print(struct sonode *socknode) +{ + return (0); +} + +struct sock_print { + int family; + int type; + int pro; + int (*print)(struct sonode *socknode); +} sock_prints[] = { + { 2, 2, 0, tcpip_sock_print }, /* /dev/tcp */ + { 2, 2, 6, tcpip_sock_print }, /* /dev/tcp */ + { 26, 2, 0, tcpip_sock_print }, /* /dev/tcp6 */ + { 26, 2, 6, tcpip_sock_print }, /* /dev/tcp6 */ + { 2, 1, 0, tcpip_sock_print }, /* /dev/udp */ + { 2, 1, 17, tcpip_sock_print }, /* /dev/udp */ + { 26, 1, 0, tcpip_sock_print }, /* /dev/udp6 */ + { 26, 1, 17, tcpip_sock_print }, /* /dev/udp6 */ + { 2, 4, 0, tcpip_sock_print }, /* /dev/rawip */ + { 26, 4, 0, tcpip_sock_print }, /* /dev/rawip6 */ + { 2, 2, 132, sctp_sock_print }, /* /dev/sctp */ + { 26, 2, 132, sctp_sock_print }, /* /dev/sctp6 */ + { 2, 6, 132, sctp_sock_print }, /* /dev/sctp */ + { 26, 6, 132, sctp_sock_print }, /* /dev/sctp6 */ + { 24, 4, 0, tcpip_sock_print }, /* /dev/rts */ + { 2, 2, 257, sdp_sock_print }, /* /dev/sdp */ + { 26, 2, 257, sdp_sock_print }, /* /dev/sdp */ +}; + +#define NUM_SOCK_PRINTS \ + (sizeof (sock_prints) / sizeof (struct sock_print)) + static int pfile_callback(uintptr_t addr, const struct file *f, struct pfiles_cbdata *cb) { @@ -624,40 +997,62 @@ pfile_callback(uintptr_t addr, const struct file *f, struct
pfiles_cbdata *cb) case VSOCK: { - struct sonode sonode; + vnode_t v_sock; + struct sonode so; - if (pfiles_get_sonode(realvpp, &sonode) == -1) + if (mdb_vread(&v_sock, sizeof (v_sock), realvpp) == -1) { + mdb_warn("failed to read socket vnode"); return (DCMD_ERR); + } /* - * If the address is cached in the sonode, use it; otherwise, - * we print nothing. + * Sockets can be non-stream or stream; they have to be dealt + * with differently. */ - if (sonode.so_state & SS_LADDR_VALID) { - struct sockaddr *laddr = - mdb_alloc(sonode.so_laddr_len, UM_SLEEP); - if (mdb_vread(laddr, sonode.so_laddr_len, - (uintptr_t)sonode.so_laddr_sa) == -1) { - mdb_warn("failed to read sonode socket addr"); + if (v_sock.v_stream == NULL) { + if (pfiles_get_sonode(&v_sock, &so) == -1) return (DCMD_ERR); - } - mdb_printf("socket: "); - pfiles_print_addr(laddr); - } + /* Pick the proper methods. */ + for (i = 0; i < NUM_SOCK_PRINTS; i++) { + if ((sock_prints[i].family == so.so_family && + sock_prints[i].type == so.so_type && + sock_prints[i].pro == so.so_protocol) || + (sock_prints[i].family == so.so_family && + sock_prints[i].type == so.so_type && + so.so_type == SOCK_RAW)) { + if ((*sock_prints[i].print)(&so) == -1) + return (DCMD_ERR); + } + } + } else { + sotpi_sonode_t sotpi_sonode; - if (sonode.so_state & SS_FADDR_VALID) { - struct sockaddr *faddr = - mdb_alloc(sonode.so_faddr_len, UM_SLEEP); - if (mdb_vread(faddr, sonode.so_faddr_len, - (uintptr_t)sonode.so_faddr_sa) == -1) { - mdb_warn("failed to read sonode remote addr"); + if (pfiles_get_sonode(&v_sock, &so) == -1) return (DCMD_ERR); + + /* + * If the socket is a fallback socket, read its related + * information separately; otherwise, read it as a whole + * tpi socket.
+ */ + if (so.so_state & SS_FALLBACK_COMP) { + sotpi_sonode.st_sonode = so; + + if (mdb_vread(&(sotpi_sonode.st_info), + sizeof (sotpi_info_t), + (uintptr_t)so.so_priv) == -1) + return (DCMD_ERR); + } else { + if (pfiles_get_tpi_sonode(&v_sock, + &sotpi_sonode) == -1) + return (DCMD_ERR); } - mdb_printf("remote: "); - pfiles_print_addr(faddr); + if (tpi_sock_print(&sotpi_sonode) == -1) + return (DCMD_ERR); } + break; } @@ -691,7 +1086,6 @@ pfile_callback(uintptr_t addr, const struct file *f, struct pfiles_cbdata *cb) break; } - mdb_printf("\n"); return (WALK_NEXT); diff --git a/usr/src/cmd/mdb/common/modules/sockfs/sockfs.c b/usr/src/cmd/mdb/common/modules/sockfs/sockfs.c new file mode 100644 index 0000000000..33b8d20f8a --- /dev/null +++ b/usr/src/cmd/mdb/common/modules/sockfs/sockfs.c @@ -0,0 +1,154 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include <sys/types.h> +#include <sys/stropts.h> +#include <sys/socket.h> +#include <sys/socketvar.h> + +#include <mdb/mdb_modapi.h> +#include <mdb/mdb_ks.h> + +/* + * Look up the symbol name for the given sockparams list and walk + * all the entries. + */ +static boolean_t +sockparams_walk_list(const char *symname, int argc, const mdb_arg_t *argv) +{ + GElf_Sym sym; + + if (mdb_lookup_by_name(symname, &sym)) { + mdb_warn("can't find symbol %s", symname); + return (B_FALSE); + } + + if (mdb_pwalk_dcmd("list", "sockfs`sockparams", argc, argv, + sym.st_value) != 0) { + mdb_warn("can't walk %s", symname); + return (B_FALSE); + } + + return (B_TRUE); +} + +/* + * dcmd to print sockparams info. + * + * If no address is given then the default is to print all sockparams on the + * global list (i.e., installed with soconfig(1)). To also print the ephemeral + * entries the '-e' flag should be used. Only ephemeral entries can be printed + * by specifying the '-E' flag. + */ +static int +sockparams_prt(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) +{ + struct sockparams sp; + + if ((flags & DCMD_ADDRSPEC) == 0) { + uint_t opt_e = 0; + uint_t opt_E = 0; + + /* + * Determine what lists should be printed + */ + if (mdb_getopts(argc, argv, + 'e', MDB_OPT_SETBITS, 1, &opt_e, + 'E', MDB_OPT_SETBITS, 1, &opt_E) != argc) + return (DCMD_USAGE); + + if (!opt_E) { + if (!sockparams_walk_list("sphead", argc, argv)) + return (DCMD_ERR); + } + + if (opt_e || opt_E) { + if (!sockparams_walk_list("sp_ephem_list", argc, argv)) + return (DCMD_ERR); + } + + return (DCMD_OK); + } + + /* + * If we are piping the output, then just print out the address, + * otherwise summarize the sockparams info. 
+ */ + if ((flags & DCMD_PIPE_OUT) != 0) { + mdb_printf("%#lr\n", addr); + return (DCMD_OK); + } + + if (DCMD_HDRSPEC(flags)) { + mdb_printf("%-?s %3s %3s %3s %15s %15s %6s %6s\n", + "ADDR", "FAM", "TYP", "PRO", "STRDEV", "SOCKMOD", "REFS", + "FLGS"); + } + + if (mdb_vread(&sp, sizeof (sp), addr) == -1) { + mdb_warn("failed to read sockparams at %0?p", addr); + return (DCMD_ERR); + } + + mdb_printf("%0?p %3u %3u %3u %15s %15s %6u %#6x\n", + addr, + sp.sp_family, sp.sp_type, sp.sp_protocol, + (sp.sp_sdev_info.sd_devpath != 0) ? + sp.sp_sdev_info.sd_devpath : "-", + sp.sp_smod_name, sp.sp_refcnt, + sp.sp_flags); + + + return (DCMD_OK); +} + +/* + * Help function + */ +void +sockparams_help(void) +{ + mdb_printf("Print sockparams information for a give sockparams ptr.\n" + "Without the address, list available sockparams. Default " + "behavior is to list only entries that were installed by the " + "admin (via soconfig(1M)).\n\n" + "Options:\n" + " -e:\t\tlist ephemeral sockparams\n" + " -E:\t\tonly list ephemeral sockparams\n"); +} + +static const mdb_dcmd_t dcmds[] = { + { "sockparams", "[-eE]", "print sockparams", sockparams_prt, + sockparams_help }, + { NULL } +}; + +static const mdb_modinfo_t modinfo = { MDB_API_VERSION, dcmds, NULL }; + +const mdb_modinfo_t * +_mdb_init(void) +{ + return (&modinfo); +} diff --git a/usr/src/cmd/mdb/intel/amd64/sockfs/Makefile b/usr/src/cmd/mdb/intel/amd64/sockfs/Makefile new file mode 100644 index 0000000000..9808e469f6 --- /dev/null +++ b/usr/src/cmd/mdb/intel/amd64/sockfs/Makefile @@ -0,0 +1,33 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. 
+# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. + +MODULE = sockfs.so +MDBTGT = kvm + +MODSRCS = sockfs.c + +include ../../../../Makefile.cmd +include ../../../../Makefile.cmd.64 +include ../../Makefile.amd64 +include ../../../Makefile.module diff --git a/usr/src/cmd/mdb/intel/ia32/sockfs/Makefile b/usr/src/cmd/mdb/intel/ia32/sockfs/Makefile new file mode 100644 index 0000000000..9b14d2fd04 --- /dev/null +++ b/usr/src/cmd/mdb/intel/ia32/sockfs/Makefile @@ -0,0 +1,32 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+ +MODULE = sockfs.so +MDBTGT = kvm + +MODSRCS = sockfs.c + +include ../../../../Makefile.cmd +include ../../Makefile.ia32 +include ../../../Makefile.module diff --git a/usr/src/cmd/mdb/sparc/v9/sockfs/Makefile b/usr/src/cmd/mdb/sparc/v9/sockfs/Makefile new file mode 100644 index 0000000000..9e65a6282b --- /dev/null +++ b/usr/src/cmd/mdb/sparc/v9/sockfs/Makefile @@ -0,0 +1,33 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+ +MODULE = sockfs.so +MDBTGT = kvm + +MODSRCS = sockfs.c + +include ../../../../Makefile.cmd +include ../../../../Makefile.cmd.64 +include ../../Makefile.sparcv9 +include ../../../Makefile.module diff --git a/usr/src/pkgdefs/SUNWckr/prototype_com b/usr/src/pkgdefs/SUNWckr/prototype_com index 1988298dfe..ead3a7e5e8 100644 --- a/usr/src/pkgdefs/SUNWckr/prototype_com +++ b/usr/src/pkgdefs/SUNWckr/prototype_com @@ -134,6 +134,7 @@ d none kernel/misc 755 root sys d none kernel/sched 755 root sys d none kernel/strmod 755 root sys d none kernel/sys 755 root sys +d none kernel/socketmod 755 root sys d none lib 755 root bin d none lib/svc 0755 root bin d none lib/svc/method 0755 root bin diff --git a/usr/src/pkgdefs/SUNWckr/prototype_i386 b/usr/src/pkgdefs/SUNWckr/prototype_i386 index 57be328034..adc41583bb 100644 --- a/usr/src/pkgdefs/SUNWckr/prototype_i386 +++ b/usr/src/pkgdefs/SUNWckr/prototype_i386 @@ -212,6 +212,7 @@ f none kernel/misc/ipc 755 root sys f none kernel/misc/kbtrans 755 root sys f none kernel/misc/kcf 755 root sys f none kernel/misc/kmdbmod 755 root sys +f none kernel/misc/ksocket 755 root sys f none kernel/misc/mac 755 root sys l none kernel/misc/md5=../../kernel/crypto/md5 f none kernel/misc/net80211 755 root sys @@ -427,6 +428,7 @@ f none kernel/misc/amd64/ipc 755 root sys f none kernel/misc/amd64/kbtrans 755 root sys f none kernel/misc/amd64/kcf 755 root sys f none kernel/misc/amd64/kmdbmod 755 root sys +f none kernel/misc/amd64/ksocket 755 root sys f none kernel/misc/amd64/mac 755 root sys l none kernel/misc/amd64/md5=../../../kernel/crypto/amd64/md5 f none kernel/misc/amd64/net80211 755 root sys @@ -497,3 +499,14 @@ f none kernel/kiconv/amd64/kiconv_ja 755 root sys f none kernel/kiconv/amd64/kiconv_ko 755 root sys f none kernel/kiconv/amd64/kiconv_sc 755 root sys f none kernel/kiconv/amd64/kiconv_tc 755 root sys +l none kernel/socketmod/icmp=../../kernel/drv/icmp +l none kernel/socketmod/rts=../../kernel/drv/rts +l none 
kernel/socketmod/tcp=../../kernel/drv/tcp +l none kernel/socketmod/udp=../../kernel/drv/udp +f none kernel/socketmod/socksctp 755 root sys +d none kernel/socketmod/amd64 755 root sys +l none kernel/socketmod/amd64/icmp=../../../kernel/drv/amd64/icmp +l none kernel/socketmod/amd64/rts=../../../kernel/drv/amd64/rts +l none kernel/socketmod/amd64/tcp=../../../kernel/drv/amd64/tcp +l none kernel/socketmod/amd64/udp=../../../kernel/drv/amd64/udp +f none kernel/socketmod/amd64/socksctp 755 root sys diff --git a/usr/src/pkgdefs/SUNWckr/prototype_sparc b/usr/src/pkgdefs/SUNWckr/prototype_sparc index daccee4e10..e81a86168e 100644 --- a/usr/src/pkgdefs/SUNWckr/prototype_sparc +++ b/usr/src/pkgdefs/SUNWckr/prototype_sparc @@ -199,6 +199,7 @@ f none kernel/misc/sparcv9/idmap 755 root sys f none kernel/misc/sparcv9/ipc 755 root sys f none kernel/misc/sparcv9/kbtrans 755 root sys f none kernel/misc/sparcv9/kcf 755 root sys +f none kernel/misc/sparcv9/ksocket 755 root sys f none kernel/misc/sparcv9/mac 755 root sys l none kernel/misc/sparcv9/md5=../../../kernel/crypto/sparcv9/md5 f none kernel/misc/sparcv9/neti 755 root sys @@ -267,3 +268,9 @@ f none kernel/kiconv/sparcv9/kiconv_ja 755 root sys f none kernel/kiconv/sparcv9/kiconv_ko 755 root sys f none kernel/kiconv/sparcv9/kiconv_sc 755 root sys f none kernel/kiconv/sparcv9/kiconv_tc 755 root sys +d none kernel/socketmod/sparcv9 755 root sys +l none kernel/socketmod/sparcv9/icmp=../../../kernel/drv/sparcv9/icmp +l none kernel/socketmod/sparcv9/rts=../../../kernel/drv/sparcv9/rts +l none kernel/socketmod/sparcv9/tcp=../../../kernel/drv/sparcv9/tcp +l none kernel/socketmod/sparcv9/udp=../../../kernel/drv/sparcv9/udp +f none kernel/socketmod/sparcv9/socksctp 755 root sys diff --git a/usr/src/pkgdefs/SUNWhea/prototype_com b/usr/src/pkgdefs/SUNWhea/prototype_com index c5d0e03053..df95ddfabe 100644 --- a/usr/src/pkgdefs/SUNWhea/prototype_com +++ b/usr/src/pkgdefs/SUNWhea/prototype_com @@ -971,6 +971,7 @@ f none usr/include/sys/kmem.h 
644 root bin f none usr/include/sys/kmem_impl.h 644 root bin f none usr/include/sys/kobj.h 644 root bin f none usr/include/sys/kobj_impl.h 644 root bin +f none usr/include/sys/ksocket.h 644 root bin f none usr/include/sys/kstat.h 644 root bin f none usr/include/sys/kstr.h 644 root bin f none usr/include/sys/ksyms.h 644 root bin @@ -1225,6 +1226,7 @@ f none usr/include/sys/socket.h 644 root bin f none usr/include/sys/socket_impl.h 644 root bin f none usr/include/sys/socketvar.h 644 root bin f none usr/include/sys/sockio.h 644 root bin +f none usr/include/sys/socket_proto.h 644 root bin f none usr/include/sys/sodirect.h 644 root bin f none usr/include/sys/sservice.h 644 root bin f none usr/include/sys/squeue.h 644 root bin diff --git a/usr/src/pkgdefs/SUNWibsdp/postinstall b/usr/src/pkgdefs/SUNWibsdp/postinstall index e320b55507..01b5720227 100644 --- a/usr/src/pkgdefs/SUNWibsdp/postinstall +++ b/usr/src/pkgdefs/SUNWibsdp/postinstall @@ -19,18 +19,15 @@ # # CDDL HEADER END # -# -# ident "%Z%%M% %I% %E% SMI" -# -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # PATH="/usr/bin:/usr/sbin:${PATH}" export PATH -SDP4_SOCK_ENTRY=" 2 2 257 /dev/sdp" -SDP6_SOCK_ENTRY=" 26 2 257 /dev/sdp" +SDP4_SOCK_ENTRY=" 2 2 257 socksdp" +SDP6_SOCK_ENTRY=" 26 2 257 socksdp" if [ "${BASEDIR:=/}" != "/" ] then diff --git a/usr/src/pkgdefs/SUNWibsdp/preremove b/usr/src/pkgdefs/SUNWibsdp/preremove index d0f143d2cf..bf6b2d72ad 100644 --- a/usr/src/pkgdefs/SUNWibsdp/preremove +++ b/usr/src/pkgdefs/SUNWibsdp/preremove @@ -19,18 +19,15 @@ # # CDDL HEADER END # -# -# ident "%Z%%M% %I% %E% SMI" -# -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# PATH="/usr/bin:/usr/sbin:${PATH}" export PATH -SDP4_SOCK_ENTRY=" 2 2 257 /dev/sdp" -SDP6_SOCK_ENTRY=" 26 2 257 /dev/sdp" +SDP4_SOCK_ENTRY=" 2 2 257 socksdp" +SDP6_SOCK_ENTRY=" 26 2 257 socksdp" EXIT=0 diff --git a/usr/src/pkgdefs/SUNWibsdp/prototype_i386 b/usr/src/pkgdefs/SUNWibsdp/prototype_i386 index 2c01d15098..f1a1db9a48 100644 --- a/usr/src/pkgdefs/SUNWibsdp/prototype_i386 +++ b/usr/src/pkgdefs/SUNWibsdp/prototype_i386 @@ -19,11 +19,9 @@ # CDDL HEADER END # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" -# # This required package information file contains a list of package contents. # The 'pkgmk' command uses this file to identify the contents of a package # and their location on the development machine when building the package. @@ -47,3 +45,7 @@ f none kernel/drv/sdp 0755 root sys d none kernel/drv/amd64 0755 root sys f none kernel/drv/amd64/sdp 0755 root sys +d none kernel/socketmod 755 root sys +f none kernel/socketmod/socksdp 755 root sys +d none kernel/socketmod/amd64 755 root sys +f none kernel/socketmod/amd64/socksdp 755 root sys diff --git a/usr/src/pkgdefs/SUNWibsdp/prototype_sparc b/usr/src/pkgdefs/SUNWibsdp/prototype_sparc index 891011aba8..37fa95f27d 100644 --- a/usr/src/pkgdefs/SUNWibsdp/prototype_sparc +++ b/usr/src/pkgdefs/SUNWibsdp/prototype_sparc @@ -19,11 +19,9 @@ # CDDL HEADER END # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" -# # This required package information file contains a list of package contents. # The 'pkgmk' command uses this file to identify the contents of a package # and their location on the development machine when building the package. 
@@ -49,3 +47,6 @@ # d none kernel/drv/sparcv9 0755 root sys f none kernel/drv/sparcv9/sdp 0755 root sys +d none kernel/socketmod 755 root sys +d none kernel/socketmod/sparcv9 755 root sys +f none kernel/socketmod/sparcv9/socksdp 755 root sys diff --git a/usr/src/pkgdefs/SUNWmdb/prototype_i386 b/usr/src/pkgdefs/SUNWmdb/prototype_i386 index 05c255e659..fb1a898f13 100644 --- a/usr/src/pkgdefs/SUNWmdb/prototype_i386 +++ b/usr/src/pkgdefs/SUNWmdb/prototype_i386 @@ -89,6 +89,7 @@ f none usr/lib/mdb/kvm/amd64/sppp.so 555 root sys f none usr/lib/mdb/kvm/amd64/ufs.so 555 root sys f none usr/lib/mdb/kvm/amd64/uhci.so 555 root sys f none usr/lib/mdb/kvm/amd64/usba.so 555 root sys +f none usr/lib/mdb/kvm/amd64/sockfs.so 555 root sys f none usr/lib/mdb/kvm/arp.so 555 root sys f none usr/lib/mdb/kvm/audiosup.so 555 root sys f none usr/lib/mdb/kvm/cpc.so 555 root sys @@ -117,6 +118,7 @@ f none usr/lib/mdb/kvm/s1394.so 555 root sys f none usr/lib/mdb/kvm/scsi_vhci.so 555 root sys f none usr/lib/mdb/kvm/sctp.so 555 root sys f none usr/lib/mdb/kvm/sd.so 555 root sys +f none usr/lib/mdb/kvm/sockfs.so 555 root sys f none usr/lib/mdb/kvm/specfs.so 555 root sys f none usr/lib/mdb/kvm/sppp.so 555 root sys f none usr/lib/mdb/kvm/ufs.so 555 root sys diff --git a/usr/src/pkgdefs/SUNWmdb/prototype_sparc b/usr/src/pkgdefs/SUNWmdb/prototype_sparc index 51f5c49182..eae343b703 100644 --- a/usr/src/pkgdefs/SUNWmdb/prototype_sparc +++ b/usr/src/pkgdefs/SUNWmdb/prototype_sparc @@ -64,6 +64,7 @@ f none usr/lib/mdb/kvm/sparcv9/ptm.so 555 root sys s none usr/lib/mdb/kvm/sparcv9/px.so=intr.so f none usr/lib/mdb/kvm/sparcv9/random.so 555 root sys f none usr/lib/mdb/kvm/sparcv9/sctp.so 555 root sys +f none usr/lib/mdb/kvm/sparcv9/sockfs.so 555 root sys f none usr/lib/mdb/kvm/sparcv9/s1394.so 555 root sys f none usr/lib/mdb/kvm/sparcv9/scsi_vhci.so 555 root sys f none usr/lib/mdb/kvm/sparcv9/specfs.so 555 root sys diff --git a/usr/src/pkgdefs/SUNWmdbr/prototype_i386 
b/usr/src/pkgdefs/SUNWmdbr/prototype_i386 index 237c1da83b..662f4cb1e3 100644 --- a/usr/src/pkgdefs/SUNWmdbr/prototype_i386 +++ b/usr/src/pkgdefs/SUNWmdbr/prototype_i386 @@ -33,9 +33,8 @@ f none kernel/kmdb/amd64/cpu.generic 555 root sys f none kernel/kmdb/amd64/cpu_ms.AuthenticAMD.15 555 root sys f none kernel/kmdb/amd64/crypto 555 root sys f none kernel/kmdb/amd64/genunix 555 root sys -f none kernel/kmdb/amd64/ip 555 root sys f none kernel/kmdb/amd64/hook 555 root sys -f none kernel/kmdb/amd64/neti 555 root sys +f none kernel/kmdb/amd64/ip 555 root sys f none kernel/kmdb/amd64/ipc 555 root sys f none kernel/kmdb/amd64/ipp 555 root sys f none kernel/kmdb/amd64/krtld 555 root sys @@ -46,6 +45,7 @@ f none kernel/kmdb/amd64/md 555 root sys f none kernel/kmdb/amd64/mdb_ds 555 root sys f none kernel/kmdb/amd64/mpt 555 root sys f none kernel/kmdb/amd64/nca 555 root sys +f none kernel/kmdb/amd64/neti 555 root sys f none kernel/kmdb/amd64/nfs 555 root sys f none kernel/kmdb/amd64/ptm 555 root sys f none kernel/kmdb/amd64/random 555 root sys @@ -53,6 +53,7 @@ f none kernel/kmdb/amd64/s1394 555 root sys f none kernel/kmdb/amd64/scsi_vhci 555 root sys f none kernel/kmdb/amd64/sctp 555 root sys f none kernel/kmdb/amd64/sd 555 root sys +f none kernel/kmdb/amd64/sockfs 555 root sys f none kernel/kmdb/amd64/specfs 555 root sys f none kernel/kmdb/amd64/sppp 555 root sys f none kernel/kmdb/amd64/ufs 555 root sys @@ -65,9 +66,8 @@ f none kernel/kmdb/cpu.generic 555 root sys f none kernel/kmdb/cpu_ms.AuthenticAMD.15 555 root sys f none kernel/kmdb/crypto 555 root sys f none kernel/kmdb/genunix 555 root sys -f none kernel/kmdb/ip 555 root sys f none kernel/kmdb/hook 555 root sys -f none kernel/kmdb/neti 555 root sys +f none kernel/kmdb/ip 555 root sys f none kernel/kmdb/ipc 555 root sys f none kernel/kmdb/ipp 555 root sys f none kernel/kmdb/krtld 555 root sys @@ -78,6 +78,7 @@ f none kernel/kmdb/md 555 root sys f none kernel/kmdb/mdb_ds 555 root sys f none kernel/kmdb/mpt 555 root 
sys f none kernel/kmdb/nca 555 root sys +f none kernel/kmdb/neti 555 root sys f none kernel/kmdb/nfs 555 root sys f none kernel/kmdb/ptm 555 root sys f none kernel/kmdb/random 555 root sys @@ -85,6 +86,7 @@ f none kernel/kmdb/s1394 555 root sys f none kernel/kmdb/scsi_vhci 555 root sys f none kernel/kmdb/sctp 555 root sys f none kernel/kmdb/sd 555 root sys +f none kernel/kmdb/sockfs 555 root sys f none kernel/kmdb/specfs 555 root sys f none kernel/kmdb/sppp 555 root sys f none kernel/kmdb/ufs 555 root sys diff --git a/usr/src/pkgdefs/SUNWmdbr/prototype_sparc b/usr/src/pkgdefs/SUNWmdbr/prototype_sparc index b4057c2328..0e3e805552 100644 --- a/usr/src/pkgdefs/SUNWmdbr/prototype_sparc +++ b/usr/src/pkgdefs/SUNWmdbr/prototype_sparc @@ -22,7 +22,6 @@ # Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# !include prototype_com # @@ -32,10 +31,9 @@ f none kernel/kmdb/sparcv9/audiosup 555 root sys f none kernel/kmdb/sparcv9/cpc 555 root sys f none kernel/kmdb/sparcv9/crypto 555 root sys f none kernel/kmdb/sparcv9/genunix 555 root sys +f none kernel/kmdb/sparcv9/hook 555 root sys f none kernel/kmdb/sparcv9/intr 555 root sys f none kernel/kmdb/sparcv9/ip 555 root sys -f none kernel/kmdb/sparcv9/hook 555 root sys -f none kernel/kmdb/sparcv9/neti 555 root sys f none kernel/kmdb/sparcv9/ipc 555 root sys f none kernel/kmdb/sparcv9/ipp 555 root sys f none kernel/kmdb/sparcv9/isp 555 root sys @@ -47,16 +45,18 @@ f none kernel/kmdb/sparcv9/md 555 root sys f none kernel/kmdb/sparcv9/mdb_ds 555 root sys f none kernel/kmdb/sparcv9/mpt 555 root sys f none kernel/kmdb/sparcv9/nca 555 root sys +f none kernel/kmdb/sparcv9/neti 555 root sys f none kernel/kmdb/sparcv9/nfs 555 root sys -s none kernel/kmdb/sparcv9/pcisch=intr s none kernel/kmdb/sparcv9/pcipsy=intr +s none kernel/kmdb/sparcv9/pcisch=intr f none kernel/kmdb/sparcv9/ptm 555 root sys s none kernel/kmdb/sparcv9/px=intr f none kernel/kmdb/sparcv9/random 555 root sys -f none 
kernel/kmdb/sparcv9/sctp 555 root sys f none kernel/kmdb/sparcv9/s1394 555 root sys f none kernel/kmdb/sparcv9/scsi_vhci 555 root sys +f none kernel/kmdb/sparcv9/sctp 555 root sys f none kernel/kmdb/sparcv9/sd 555 root sys +f none kernel/kmdb/sparcv9/sockfs 555 root sys f none kernel/kmdb/sparcv9/specfs 555 root sys f none kernel/kmdb/sparcv9/sppp 555 root sys f none kernel/kmdb/sparcv9/ssd 555 root sys @@ -68,10 +68,10 @@ d none platform/sun4u 755 root sys d none platform/sun4u/kernel 755 root sys d none platform/sun4u/kernel/kmdb 755 root sys d none platform/sun4u/kernel/kmdb/sparcv9 755 root sys +f none platform/sun4u/kernel/kmdb/sparcv9/oplhwd 555 root sys f none platform/sun4u/kernel/kmdb/sparcv9/sgenv 555 root sys f none platform/sun4u/kernel/kmdb/sparcv9/sgsbbc 555 root sys f none platform/sun4u/kernel/kmdb/sparcv9/unix 555 root sys -f none platform/sun4u/kernel/kmdb/sparcv9/oplhwd 555 root sys # d none platform/sun4v 755 root sys d none platform/sun4v/kernel 755 root sys diff --git a/usr/src/pkgdefs/common_files/i.sock2path b/usr/src/pkgdefs/common_files/i.sock2path index 9b1bdedc36..31fcde8e06 100644 --- a/usr/src/pkgdefs/common_files/i.sock2path +++ b/usr/src/pkgdefs/common_files/i.sock2path @@ -3,9 +3,8 @@ # CDDL HEADER START # # The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. @@ -20,10 +19,7 @@ # # CDDL HEADER END # -# -#ident "%Z%%M% %I% %E% SMI" -# -# Copyright 2004 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# @@ -80,6 +76,25 @@ do echo >> $dest grep '/dev/spdsock' $src >> $dest fi + grep "^#" $dest | awk '{ + if ($5=="Path") {print $0 "|Module"} + else {print $0}}' > /tmp/i.$$ + grep -v "^#" $dest | awk '{ + if ($4=="/dev/tcp" || $4=="/dev/tcp6") { + print "\t" $1 "\t" $2 "\t" $3 "\ttcp" + } else if ($4=="/dev/udp" || $4=="/dev/udp6") { + print "\t" $1 "\t" $2 "\t" $3 "\tudp" + } else if ($4=="/dev/rawip" || $4=="/dev/rawip6") { + print "\t" $1 "\t" $2 "\t" $3 "\ticmp" + } else if ($4=="/dev/sctp" || $4=="/dev/sctp6") { + print "\t" $1 "\t" $2 "\t" $3 "\tsocksctp" + } else if ($4=="/dev/rts") { + print "\t" $1 "\t" $2 "\t" $3 "\trts" + } else if ($4=="/dev/sdp" || $4=="/dev/sdp6") { + print "\t" $1 "\t" $2 "\t" $3 "\tsocksdp" + } else {print $0}}' >> /tmp/i.$$ + cp /tmp/i.$$ $dest + rm -f /tmp/i.$$ fi done diff --git a/usr/src/uts/Makefile.targ b/usr/src/uts/Makefile.targ index 86adc21eb2..d9fc918b94 100644 --- a/usr/src/uts/Makefile.targ +++ b/usr/src/uts/Makefile.targ @@ -22,7 +22,6 @@ # Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# # This Makefiles contains the common targets and definitions for # all kernels. 
It is to be included in the Makefiles for specific # implementation architectures and processor architecture dependent @@ -163,6 +162,9 @@ $(ROOT_FS_DIR)/%: $(OBJS_DIR)/% $(ROOT_FS_DIR) FRC $(ROOT_SCHED_DIR)/%: $(OBJS_DIR)/% $(ROOT_SCHED_DIR) FRC $(INS.file) +$(ROOT_SOCK_DIR)/%: $(OBJS_DIR)/% $(ROOT_SOCK_DIR) FRC + $(INS.file) + $(ROOT_STRMOD_DIR)/%: $(OBJS_DIR)/% $(ROOT_STRMOD_DIR) FRC $(INS.file) @@ -388,12 +390,10 @@ $(MODLIST_DEPS): FRC @case $@ in \ *32) \ class=32; \ - relmodule=`dirname $(RELMODULE)`; \ - rellink=`dirname $(RELLINK)`;; \ + relmodule=`dirname $(RELMODULE)`;; \ *64) \ class=64; \ - relmodule=`dirname $(RELMODULE)`/$(SUBDIR64); \ - rellink=`dirname $(RELLINK)`/$(SUBDIR64);; \ + relmodule=`dirname $(RELMODULE)`/$(SUBDIR64);; \ esac; \ if [ -z "$(THISIMPL)" ]; then \ impl=all; \ @@ -426,8 +426,16 @@ $(MODLIST_DEPS): FRC done \ fi; \ if [ -n "$(ROOTLINK)" ]; then \ + rellinks="$(RELLINK)"; \ + for r in $$rellinks; do \ + if [ $$class = 32 ]; then \ + linkdir=`dirname $$r`; \ + else \ + linkdir=`dirname $$r`/$(SUBDIR64); \ + fi; \ echo LINK $$relmodule $$module \ - $$rellink `basename $(RELLINK)` $$impl; \ + $$linkdir `basename $$r` $$impl; \ + done \ fi; \ if [ -n "$(UNIX32_LINK)" ]; then \ echo SYMLINK $(SUBDIR64)/$(UNIX) \ diff --git a/usr/src/uts/Makefile.uts b/usr/src/uts/Makefile.uts index 86b39fc084..0f4718e3da 100644 --- a/usr/src/uts/Makefile.uts +++ b/usr/src/uts/Makefile.uts @@ -419,6 +419,7 @@ ROOT_DTRACE_DIR_32 = $(ROOT_MOD_DIR)/dtrace ROOT_EXEC_DIR_32 = $(ROOT_MOD_DIR)/exec ROOT_FS_DIR_32 = $(ROOT_MOD_DIR)/fs ROOT_SCHED_DIR_32 = $(ROOT_MOD_DIR)/sched +ROOT_SOCK_DIR_32 = $(ROOT_MOD_DIR)/socketmod ROOT_STRMOD_DIR_32 = $(ROOT_MOD_DIR)/strmod ROOT_IPP_DIR_32 = $(ROOT_MOD_DIR)/ipp ROOT_SYS_DIR_32 = $(ROOT_MOD_DIR)/sys @@ -444,6 +445,7 @@ ROOT_DTRACE_DIR_64 = $(ROOT_MOD_DIR)/dtrace/$(SUBDIR64) ROOT_EXEC_DIR_64 = $(ROOT_MOD_DIR)/exec/$(SUBDIR64) ROOT_FS_DIR_64 = $(ROOT_MOD_DIR)/fs/$(SUBDIR64) ROOT_SCHED_DIR_64 = 
$(ROOT_MOD_DIR)/sched/$(SUBDIR64) +ROOT_SOCK_DIR_64 = $(ROOT_MOD_DIR)/socketmod/$(SUBDIR64) ROOT_STRMOD_DIR_64 = $(ROOT_MOD_DIR)/strmod/$(SUBDIR64) ROOT_IPP_DIR_64 = $(ROOT_MOD_DIR)/ipp/$(SUBDIR64) ROOT_SYS_DIR_64 = $(ROOT_MOD_DIR)/sys/$(SUBDIR64) @@ -469,6 +471,7 @@ ROOT_DTRACE_DIR = $(ROOT_DTRACE_DIR_$(CLASS)) ROOT_EXEC_DIR = $(ROOT_EXEC_DIR_$(CLASS)) ROOT_FS_DIR = $(ROOT_FS_DIR_$(CLASS)) ROOT_SCHED_DIR = $(ROOT_SCHED_DIR_$(CLASS)) +ROOT_SOCK_DIR = $(ROOT_SOCK_DIR_$(CLASS)) ROOT_STRMOD_DIR = $(ROOT_STRMOD_DIR_$(CLASS)) ROOT_IPP_DIR = $(ROOT_IPP_DIR_$(CLASS)) ROOT_SYS_DIR = $(ROOT_SYS_DIR_$(CLASS)) @@ -492,7 +495,7 @@ ROOT_MOD_DIRS_32 = $(ROOT_BRAND_DIR_32) $(ROOT_DRV_DIR_32) ROOT_MOD_DIRS_32 += $(ROOT_EXEC_DIR_32) $(ROOT_DTRACE_DIR_32) ROOT_MOD_DIRS_32 += $(ROOT_FS_DIR_32) $(ROOT_SCHED_DIR_32) ROOT_MOD_DIRS_32 += $(ROOT_STRMOD_DIR_32) $(ROOT_SYS_DIR_32) -ROOT_MOD_DIRS_32 += $(ROOT_IPP_DIR_32) +ROOT_MOD_DIRS_32 += $(ROOT_IPP_DIR_32) $(ROOT_SOCK_DIR_32) ROOT_MOD_DIRS_32 += $(ROOT_MISC_DIR_32) $(ROOT_MACH_DIR_32) ROOT_MOD_DIRS_32 += $(ROOT_KGSS_DIR_32) ROOT_MOD_DIRS_32 += $(ROOT_SCSI_VHCI_DIR_32) @@ -508,6 +511,7 @@ USR_DRV_DIR_32 = $(USR_MOD_DIR)/drv USR_EXEC_DIR_32 = $(USR_MOD_DIR)/exec USR_FS_DIR_32 = $(USR_MOD_DIR)/fs USR_SCHED_DIR_32 = $(USR_MOD_DIR)/sched +USR_SOCK_DIR_32 = $(USR_MOD_DIR)/socketmod USR_STRMOD_DIR_32 = $(USR_MOD_DIR)/strmod USR_SYS_DIR_32 = $(USR_MOD_DIR)/sys USR_MISC_DIR_32 = $(USR_MOD_DIR)/misc @@ -521,6 +525,7 @@ USR_DRV_DIR_64 = $(USR_MOD_DIR)/drv/$(SUBDIR64) USR_EXEC_DIR_64 = $(USR_MOD_DIR)/exec/$(SUBDIR64) USR_FS_DIR_64 = $(USR_MOD_DIR)/fs/$(SUBDIR64) USR_SCHED_DIR_64 = $(USR_MOD_DIR)/sched/$(SUBDIR64) +USR_SOCK_DIR_64 = $(USR_MOD_DIR)/socketmod/$(SUBDIR64) USR_STRMOD_DIR_64 = $(USR_MOD_DIR)/strmod/$(SUBDIR64) USR_SYS_DIR_64 = $(USR_MOD_DIR)/sys/$(SUBDIR64) USR_MISC_DIR_64 = $(USR_MOD_DIR)/misc/$(SUBDIR64) @@ -534,6 +539,7 @@ USR_DRV_DIR = $(USR_DRV_DIR_$(CLASS)) USR_EXEC_DIR = $(USR_EXEC_DIR_$(CLASS)) USR_FS_DIR = $(USR_FS_DIR_$(CLASS)) 
USR_SCHED_DIR = $(USR_SCHED_DIR_$(CLASS)) +USR_SOCK_DIR = $(USR_SOCK_DIR_$(CLASS)) USR_STRMOD_DIR = $(USR_STRMOD_DIR_$(CLASS)) USR_SYS_DIR = $(USR_SYS_DIR_$(CLASS)) USR_MISC_DIR = $(USR_MISC_DIR_$(CLASS)) @@ -599,7 +605,8 @@ PARALLEL_KMODS = $(DRV_KMODS) $(EXEC_KMODS) $(FS_KMODS) $(SCHED_KMODS) \ $(MMU_KMODS) $(DACF_KMODS) $(EXPORT_KMODS) $(IPP_KMODS) \ $(CRYPTO_KMODS) $(PCBE_KMODS) \ $(DRV_KMODS_$(CLASS)) $(MISC_KMODS_$(CLASS)) $(MAC_KMODS) \ - $(DEVNAME_KMODS) $(BRAND_KMODS) $(KICONV_KMODS) + $(DEVNAME_KMODS) $(BRAND_KMODS) $(KICONV_KMODS) \ + $(SOCKET_KMODS) KMODS = $(GENUNIX_KMODS) $(PARALLEL_KMODS) @@ -614,7 +621,7 @@ LINT_KMODS = $(DRV_KMODS) $(EXEC_KMODS) $(FS_KMODS) $(SCHED_KMODS) \ $(MACH_KMODS) $(GSS_KMODS) $(DACF_KMODS) $(IPP_KMODS) \ $(CRYPTO_KMODS) $(PCBE_KMODS) $(DEVNAME_KMODS) \ $(DRV_KMODS_$(CLASS)) $(MISC_KMODS_$(CLASS)) $(MAC_KMODS) \ - $(BRAND_KMODS) $(KICONV_KMODS) + $(BRAND_KMODS) $(KICONV_KMODS) $(SOCKET_KMODS) $(CLOSED_BUILD)CLOSED_LINT_KMODS = $(CLOSED_DRV_KMODS) $(CLOSED_TOD_KMODS) \ $(CLOSED_MISC_KMODS) $(CLOSED_DRV_KMODS_$(CLASS)) diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index 564b2cf72e..f0951c280b 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -289,6 +289,7 @@ GENUNIX_OBJS += \ sigsuspend.o \ sigtimedwait.o \ sleepq.o \ + sock_conf.o \ space.o \ sscanf.o \ ssig.o \ @@ -489,7 +490,8 @@ IP_OBJS += igmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o ip6_rts.o \ ip_multi.o ip_ndp.o ip_opt_data.o ip_rts.o ip_srcid.o \ ipddi.o ipdrop.o mi.o nd.o optcom.o snmpcom.o ipsec_loader.o \ spd.o ipclassifier.o inet_common.o ip_squeue.o squeue.o \ - ip_sadb.o ip_ftable.o radix.o ip_dummy.o \ + ip_sadb.o ip_ftable.o proto_set.o radix.o ip_dummy.o \ + ip_helper_stream.o \ $(IP_ICMP_OBJS) \ $(IP_RTS_OBJS) \ $(IP_TCP_OBJS) \ @@ -531,6 +533,10 @@ SCTP6_OBJS += sctp6ddi.o NCA_OBJS += ncaddi.o +SDP_SOCK_MOD_OBJS += sockmod_sdp.o socksdp.o socksdpsubr.o + +SCTP_SOCK_MOD_OBJS += 
sockmod_sctp.o socksctp.o socksctpsubr.o + TUN_OBJS += tun.o ATUN_OBJS += atun.o @@ -1138,10 +1144,10 @@ SHAREFS_OBJS += sharetab.o sharefs_vfsops.o sharefs_vnops.o SPEC_OBJS += specsubr.o specvfsops.o specvnops.o -SOCK_OBJS += socksubr.o sockvfsops.o sockvnops.o \ - socksyscalls.o socktpi.o sockstr.o \ - socksctp.o socksctpsubr.o socksctpvnops.o sockssl.o \ - socksdp.o socksdpsubr.o socksdpvnops.o \ +SOCK_OBJS += socksubr.o sockvfsops.o sockparams.o \ + socksyscalls.o socktpi.o sockstr.o sockssl.o \ + sockcommon_vnops.o sockcommon_subr.o \ + sockcommon_sops.o sockcommon.o socknotify.o \ nl7c.o nl7curi.o nl7chttp.o nl7clogd.o \ nl7cnca.o @@ -1456,6 +1462,8 @@ KGSSD_DERIVED_OBJS = gssd_xdr.o KGSS_DUMMY_OBJS += dmech.o +KSOCKET_OBJS += ksocket.o ksocket_mod.o + CRYPTO= cksumtypes.o decrypt.o encrypt.o encrypt_length.o etypes.o \ nfold.o verify_checksum.o prng.o block_size.o make_checksum.o\ checksum_length.o hmac.o default_state.o mandatory_sumtype.o diff --git a/usr/src/uts/common/Makefile.rules b/usr/src/uts/common/Makefile.rules index 0035b502b9..35fe0895f1 100644 --- a/usr/src/uts/common/Makefile.rules +++ b/usr/src/uts/common/Makefile.rules @@ -481,6 +481,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/inet/nca/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/inet/sockmods/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/inet/vni/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -681,6 +685,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/kbtrans/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/ksocket/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/aggr/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) @@ -1548,6 +1556,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/idmap/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/sockmods/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + 
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/arp/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) @@ -1732,6 +1743,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/kb8042/%.c $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/kbtrans/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) +$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/ksocket/%.c + @($(LHEAD) $(LINT.c) $< $(LTAIL)) + $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/aggr/%.c @($(LHEAD) $(LINT.c) $< $(LTAIL)) diff --git a/usr/src/uts/common/c2/audit_event.c b/usr/src/uts/common/c2/audit_event.c index 723212aa52..92559a3575 100644 --- a/usr/src/uts/common/c2/audit_event.c +++ b/usr/src/uts/common/c2/audit_event.c @@ -72,6 +72,8 @@ #include <sys/tihdr.h> #include <sys/socket.h> #include <sys/socketvar.h> +#include <sys/vfs_opreg.h> +#include <fs/sockfs/sockcommon.h> #include <netinet/in.h> #include <sys/ddi.h> #include <sys/port_impl.h> @@ -3328,7 +3330,6 @@ auf_accept( char so_laddr[sizeof (struct sockaddr_in6)]; char so_faddr[sizeof (struct sockaddr_in6)]; int err; - int len; short so_family, so_type; int add_sock_token = 0; @@ -3374,28 +3375,17 @@ auf_accept( * XXX - what about other socket types for AF_INET (e.g. DGRAM) */ if (so->so_type == SOCK_STREAM) { + socklen_t len; bzero((void *)so_laddr, sizeof (so_laddr)); bzero((void *)so_faddr, sizeof (so_faddr)); - /* - * no local address then need to get it from lower - * levels. only put out record on first read ala - * AUE_WRITE. 
- */ - if (so->so_state & SS_ISBOUND) { - /* only done once on a connection */ - (void) SOP_GETSOCKNAME(so); - (void) SOP_GETPEERNAME(so); - - /* get local and foreign addresses */ - mutex_enter(&so->so_lock); - len = min(so->so_laddr_len, sizeof (so_laddr)); - bcopy(so->so_laddr_sa, so_laddr, len); - len = min(so->so_faddr_len, sizeof (so_faddr)); - bcopy(so->so_faddr_sa, so_faddr, len); - mutex_exit(&so->so_lock); - } + len = sizeof (so_laddr); + (void) socket_getsockname(so, + (struct sockaddr *)so_laddr, &len, CRED()); + len = sizeof (so_faddr); + (void) socket_getpeername(so, + (struct sockaddr *)so_faddr, &len, B_FALSE, CRED()); add_sock_token = 1; } @@ -3434,7 +3424,7 @@ auf_bind(struct t_audit_data *tad, int error, rval_t *rvp) char so_laddr[sizeof (struct sockaddr_in6)]; char so_faddr[sizeof (struct sockaddr_in6)]; int err, fd; - int len; + socklen_t len; short so_family, so_type; int add_sock_token = 0; @@ -3466,17 +3456,10 @@ auf_bind(struct t_audit_data *tad, int error, rval_t *rvp) case AF_INET6: bzero(so_faddr, sizeof (so_faddr)); + len = sizeof (so_faddr); - if (so->so_state & SS_ISBOUND) { - /* only done once on a connection */ - (void) SOP_GETSOCKNAME(so); - } - - mutex_enter(&so->so_lock); - len = min(so->so_laddr_len, sizeof (so_laddr)); - bcopy(so->so_laddr_sa, so_laddr, len); - mutex_exit(&so->so_lock); - + (void) socket_getpeername(so, + (struct sockaddr *)so_faddr, &len, B_FALSE, CRED()); add_sock_token = 1; break; @@ -3517,7 +3500,7 @@ auf_connect(struct t_audit_data *tad, int error, rval_t *rval) char so_laddr[sizeof (struct sockaddr_in6)]; char so_faddr[sizeof (struct sockaddr_in6)]; int err, fd; - int len; + socklen_t len; short so_family, so_type; int add_sock_token = 0; @@ -3539,24 +3522,14 @@ auf_connect(struct t_audit_data *tad, int error, rval_t *rval) switch (so_family) { case AF_INET: case AF_INET6: - /* - * no local address then need to get it from lower - * levels. 
- */ - if (so->so_state & SS_ISBOUND) { - /* only done once on a connection */ - (void) SOP_GETSOCKNAME(so); - (void) SOP_GETPEERNAME(so); - } bzero(so_laddr, sizeof (so_laddr)); bzero(so_faddr, sizeof (so_faddr)); - mutex_enter(&so->so_lock); - len = min(so->so_laddr_len, sizeof (so_laddr)); - bcopy(so->so_laddr_sa, so_laddr, len); + len = sizeof (so_laddr); + (void) socket_getsockname(so, (struct sockaddr *)so_laddr, + &len, CRED()); if (error) { - mutex_exit(&so->so_lock); if (uap->addr == NULL) break; if (uap->len <= 0) @@ -3569,9 +3542,9 @@ auf_connect(struct t_audit_data *tad, int error, rval_t *rval) #endif } else { /* sanity check on length */ - len = min(so->so_faddr_len, sizeof (so_faddr)); - bcopy(so->so_faddr_sa, so_faddr, len); - mutex_exit(&so->so_lock); + len = sizeof (so_faddr); + (void) socket_getpeername(so, + (struct sockaddr *)so_faddr, &len, B_FALSE, CRED()); } add_sock_token = 1; @@ -3614,7 +3587,7 @@ aus_shutdown(struct t_audit_data *tad) char so_laddr[sizeof (struct sockaddr_in6)]; char so_faddr[sizeof (struct sockaddr_in6)]; int err, fd; - int len; + socklen_t len; short so_family, so_type; int add_sock_token = 0; file_t *fp; /* unix domain sockets */ @@ -3641,23 +3614,12 @@ aus_shutdown(struct t_audit_data *tad) bzero(so_laddr, sizeof (so_laddr)); bzero(so_faddr, sizeof (so_faddr)); - if (so->so_state & SS_ISBOUND) { - /* - * no local address then need to get it from lower - * levels. 
- */ - if (so->so_laddr_len == 0) - (void) SOP_GETSOCKNAME(so); - if (so->so_faddr_len == 0) - (void) SOP_GETPEERNAME(so); - } - - mutex_enter(&so->so_lock); - len = min(so->so_laddr_len, sizeof (so_laddr)); - bcopy(so->so_laddr_sa, so_laddr, len); - len = min(so->so_faddr_len, sizeof (so_faddr)); - bcopy(so->so_faddr_sa, so_faddr, len); - mutex_exit(&so->so_lock); + len = sizeof (so_laddr); + (void) socket_getsockname(so, + (struct sockaddr *)so_laddr, &len, CRED()); + len = sizeof (so_faddr); + (void) socket_getpeername(so, + (struct sockaddr *)so_faddr, &len, B_FALSE, CRED()); add_sock_token = 1; @@ -3721,7 +3683,7 @@ auf_setsockopt(struct t_audit_data *tad, int error, rval_t *rval) char so_faddr[sizeof (struct sockaddr_in6)]; char val[AU_BUFSIZE]; int err, fd; - int len; + socklen_t len; short so_family, so_type; int add_sock_token = 0; file_t *fp; /* unix domain sockets */ @@ -3751,24 +3713,16 @@ auf_setsockopt(struct t_audit_data *tad, int error, rval_t *rval) switch (so_family) { case AF_INET: case AF_INET6: - bzero((void *)so_laddr, sizeof (so_laddr)); bzero((void *)so_faddr, sizeof (so_faddr)); - if (so->so_state & SS_ISBOUND) { - if (so->so_laddr_len == 0) - (void) SOP_GETSOCKNAME(so); - if (so->so_faddr_len == 0) - (void) SOP_GETPEERNAME(so); - } - /* get local and foreign addresses */ - mutex_enter(&so->so_lock); - len = min(so->so_laddr_len, sizeof (so_laddr)); - bcopy(so->so_laddr_sa, so_laddr, len); - len = min(so->so_faddr_len, sizeof (so_faddr)); - bcopy(so->so_faddr_sa, so_faddr, len); - mutex_exit(&so->so_lock); + len = sizeof (so_laddr); + (void) socket_getsockname(so, (struct sockaddr *)so_laddr, + &len, CRED()); + len = sizeof (so_faddr); + (void) socket_getpeername(so, (struct sockaddr *)so_faddr, + &len, B_FALSE, CRED()); add_sock_token = 1; @@ -3892,7 +3846,7 @@ auf_recvmsg( int err; char so_laddr[sizeof (struct sockaddr_in6)]; char so_faddr[sizeof (struct sockaddr_in6)]; - int len; + socklen_t len; file_t *fp; /* unix domain sockets */ 
struct f_audit_data *fad; /* unix domain sockets */ short so_family, so_type; @@ -3942,10 +3896,9 @@ auf_recvmsg( bzero((void *)so_faddr, sizeof (so_faddr)); /* get local address */ - mutex_enter(&so->so_lock); - len = min(so->so_laddr_len, sizeof (so_laddr)); - bcopy(so->so_laddr_sa, so_laddr, len); - mutex_exit(&so->so_lock); + len = sizeof (so_laddr); + (void) socket_getsockname(so, + (struct sockaddr *)so_laddr, &len, CRED()); /* get peer address */ STRUCT_INIT(msg, get_udatamodel()); @@ -3995,21 +3948,13 @@ auf_recvmsg( bzero((void *)so_laddr, sizeof (so_laddr)); bzero((void *)so_faddr, sizeof (so_faddr)); - if (so->so_state & SS_ISBOUND) { - - if (so->so_laddr_len == 0) - (void) SOP_GETSOCKNAME(so); - if (so->so_faddr_len == 0) - (void) SOP_GETPEERNAME(so); - - /* get local and foreign addresses */ - mutex_enter(&so->so_lock); - len = min(so->so_laddr_len, sizeof (so_laddr)); - bcopy(so->so_laddr_sa, so_laddr, len); - len = min(so->so_faddr_len, sizeof (so_faddr)); - bcopy(so->so_faddr_sa, so_faddr, len); - mutex_exit(&so->so_lock); - } + /* get local and foreign addresses */ + len = sizeof (so_laddr); + (void) socket_getsockname(so, + (struct sockaddr *)so_laddr, &len, CRED()); + len = sizeof (so_faddr); + (void) socket_getpeername(so, + (struct sockaddr *)so_faddr, &len, B_FALSE, CRED()); add_sock_token = 1; } @@ -4103,7 +4048,7 @@ auf_recvfrom( int fd; short so_family, so_type; int add_sock_token = 0; - int len; + socklen_t len; int err; struct file *fp; struct f_audit_data *fad; /* unix domain sockets */ @@ -4149,10 +4094,9 @@ auf_recvfrom( add_sock_token = 1; /* get local address */ - mutex_enter(&so->so_lock); - len = min(so->so_laddr_len, sizeof (so_laddr)); - bcopy(so->so_laddr_sa, so_laddr, len); - mutex_exit(&so->so_lock); + len = sizeof (so_laddr); + (void) socket_getsockname(so, + (struct sockaddr *)so_laddr, &len, CRED()); /* get peer address */ bzero((void *)so_faddr, sizeof (so_faddr)); @@ -4206,21 +4150,13 @@ auf_recvfrom( bzero((void 
*)so_laddr, sizeof (so_laddr)); bzero((void *)so_faddr, sizeof (so_faddr)); - if (so->so_state & SS_ISBOUND) { - - if (so->so_laddr_len == 0) - (void) SOP_GETSOCKNAME(so); - if (so->so_faddr_len == 0) - (void) SOP_GETPEERNAME(so); - - /* get local and foreign addresses */ - mutex_enter(&so->so_lock); - len = min(so->so_laddr_len, sizeof (so_laddr)); - bcopy(so->so_laddr_sa, so_laddr, len); - len = min(so->so_faddr_len, sizeof (so_faddr)); - bcopy(so->so_faddr_sa, so_faddr, len); - mutex_exit(&so->so_lock); - } + /* get local and foreign addresses */ + len = sizeof (so_laddr); + (void) socket_getsockname(so, + (struct sockaddr *)so_laddr, &len, CRED()); + len = sizeof (so_faddr); + (void) socket_getpeername(so, + (struct sockaddr *)so_faddr, &len, B_FALSE, CRED()); add_sock_token = 1; } @@ -4306,7 +4242,7 @@ auf_sendmsg(struct t_audit_data *tad, int error, rval_t *rval) int fd; short so_family, so_type; int add_sock_token = 0; - int len; + socklen_t len; struct file *fp; struct f_audit_data *fad; caddr_t msg_name; @@ -4351,10 +4287,9 @@ auf_sendmsg(struct t_audit_data *tad, int error, rval_t *rval) bzero((void *)so_faddr, sizeof (so_faddr)); /* get local address */ - mutex_enter(&so->so_lock); - len = min(so->so_laddr_len, sizeof (so_laddr)); - bcopy(so->so_laddr_sa, so_laddr, len); - mutex_exit(&so->so_lock); + len = sizeof (so_laddr); + (void) socket_getsockname(so, + (struct sockaddr *)so_laddr, &len, CRED()); /* get peer address */ STRUCT_INIT(msg, get_udatamodel()); @@ -4405,21 +4340,13 @@ auf_sendmsg(struct t_audit_data *tad, int error, rval_t *rval) bzero((void *)so_laddr, sizeof (so_laddr)); bzero((void *)so_faddr, sizeof (so_faddr)); - if (so->so_state & SS_ISBOUND) { - - if (so->so_laddr_len == 0) - (void) SOP_GETSOCKNAME(so); - if (so->so_faddr_len == 0) - (void) SOP_GETPEERNAME(so); - - /* get local and foreign addresses */ - mutex_enter(&so->so_lock); - len = min(so->so_laddr_len, sizeof (so_laddr)); - bcopy(so->so_laddr_sa, so_laddr, len); - len = 
min(so->so_faddr_len, sizeof (so_faddr)); - bcopy(so->so_faddr_sa, so_faddr, len); - mutex_exit(&so->so_lock); - } + /* get local and foreign addresses */ + len = sizeof (so_laddr); + (void) socket_getsockname(so, + (struct sockaddr *)so_laddr, &len, CRED()); + len = sizeof (so_faddr); + (void) socket_getpeername(so, + (struct sockaddr *)so_faddr, &len, B_FALSE, CRED()); add_sock_token = 1; } @@ -4506,7 +4433,7 @@ auf_sendto(struct t_audit_data *tad, int error, rval_t *rval) socklen_t tolen; int err; int fd; - int len; + socklen_t len; short so_family, so_type; int add_sock_token = 0; struct file *fp; @@ -4556,10 +4483,9 @@ auf_sendto(struct t_audit_data *tad, int error, rval_t *rval) bzero((void *)so_faddr, sizeof (so_faddr)); /* get local address */ - mutex_enter(&so->so_lock); - len = min(so->so_laddr_len, sizeof (so_laddr)); - bcopy(so->so_laddr_sa, so_laddr, len); - mutex_exit(&so->so_lock); + len = sizeof (so_laddr); + (void) socket_getsockname(so, + (struct sockaddr *)so_laddr, &len, CRED()); /* get peer address */ @@ -4610,21 +4536,13 @@ auf_sendto(struct t_audit_data *tad, int error, rval_t *rval) bzero((void *)so_laddr, sizeof (so_laddr)); bzero((void *)so_faddr, sizeof (so_faddr)); - if (so->so_state & SS_ISBOUND) { - - if (so->so_laddr_len == 0) - (void) SOP_GETSOCKNAME(so); - if (so->so_faddr_len == 0) - (void) SOP_GETPEERNAME(so); - - /* get local and foreign addresses */ - mutex_enter(&so->so_lock); - len = min(so->so_laddr_len, sizeof (so_laddr)); - bcopy(so->so_laddr_sa, so_laddr, len); - len = min(so->so_faddr_len, sizeof (so_faddr)); - bcopy(so->so_faddr_sa, so_faddr, len); - mutex_exit(&so->so_lock); - } + /* get local and foreign addresses */ + len = sizeof (so_laddr); + (void) socket_getsockname(so, + (struct sockaddr *)so_laddr, &len, CRED()); + len = sizeof (so_faddr); + (void) socket_getpeername(so, + (struct sockaddr *)so_faddr, &len, B_FALSE, CRED()); add_sock_token = 1; } @@ -5394,7 +5312,7 @@ auf_recv(tad, error, rval) struct 
f_audit_data *fad; int fd; int err; - int len; + socklen_t len; short so_family, so_type; register struct a { long fd; @@ -5457,17 +5375,13 @@ auf_recv(tad, error, rval) bzero((void *)so_laddr, sizeof (so_laddr)); bzero((void *)so_faddr, sizeof (so_faddr)); - /* only done once on a connection */ - (void) SOP_GETSOCKNAME(so); - (void) SOP_GETPEERNAME(so); - /* get local and foreign addresses */ - mutex_enter(&so->so_lock); - len = min(so->so_laddr_len, sizeof (so_laddr)); - bcopy(so->so_laddr_sa, so_laddr, len); - len = min(so->so_faddr_len, sizeof (so_faddr)); - bcopy(so->so_faddr_sa, so_faddr, len); - mutex_exit(&so->so_lock); + len = sizeof (so_laddr); + (void) socket_getsockname(so, + (struct sockaddr *)so_laddr, &len, CRED()); + len = sizeof (so_faddr); + (void) socket_getpeername(so, + (struct sockaddr *)so_faddr, &len, B_FALSE, CRED()); /* * only way to drop out of switch. Note that we @@ -5532,7 +5446,7 @@ auf_send(tad, error, rval) struct f_audit_data *fad; int fd; int err; - int len; + socklen_t len; short so_family, so_type; register struct a { long fd; @@ -5597,17 +5511,13 @@ auf_send(tad, error, rval) bzero((void *)so_laddr, sizeof (so_laddr)); bzero((void *)so_faddr, sizeof (so_faddr)); - /* only done once on a connection */ - (void) SOP_GETSOCKNAME(so); - (void) SOP_GETPEERNAME(so); - /* get local and foreign addresses */ - mutex_enter(&so->so_lock); - len = min(so->so_laddr_len, sizeof (so_laddr)); - bcopy(so->so_laddr_sa, so_laddr, len); - len = min(so->so_faddr_len, sizeof (so_faddr)); - bcopy(so->so_faddr_sa, so_faddr, len); - mutex_exit(&so->so_lock); + len = sizeof (so_laddr); + (void) socket_getsockname(so, + (struct sockaddr *)so_laddr, &len, CRED()); + len = sizeof (so_faddr); + (void) socket_getpeername(so, + (struct sockaddr *)so_faddr, &len, B_FALSE, CRED()); /* * only way to drop out of switch. 
Note that we diff --git a/usr/src/uts/common/fs/smbsrv/smb_negotiate.c b/usr/src/uts/common/fs/smbsrv/smb_negotiate.c index fb3498f545..48f6e53458 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_negotiate.c +++ b/usr/src/uts/common/fs/smbsrv/smb_negotiate.c @@ -293,9 +293,9 @@ smb_com_negotiate(smb_request_t *sr) switch (dialect) { case PC_NETWORK_PROGRAM_1_0: /* core */ - (void) sosetsockopt(sr->session->sock, SOL_SOCKET, SO_RCVBUF, - (const void *)&smb_dos_tcp_rcvbuf, - sizeof (smb_dos_tcp_rcvbuf)); + (void) ksocket_setsockopt(sr->session->sock, SOL_SOCKET, + SO_RCVBUF, (const void *)&smb_dos_tcp_rcvbuf, + sizeof (smb_dos_tcp_rcvbuf), CRED()); rc = smbsr_encode_result(sr, 1, 0, "bww", 1, sel_pos, 0); break; @@ -306,9 +306,9 @@ smb_com_negotiate(smb_request_t *sr) case LANMAN1_0: case LM1_2X002: case DOS_LM1_2X002: - (void) sosetsockopt(sr->session->sock, SOL_SOCKET, SO_RCVBUF, - (const void *)&smb_dos_tcp_rcvbuf, - sizeof (smb_dos_tcp_rcvbuf)); + (void) ksocket_setsockopt(sr->session->sock, SOL_SOCKET, + SO_RCVBUF, (const void *)&smb_dos_tcp_rcvbuf, + sizeof (smb_dos_tcp_rcvbuf), CRED()); sr->smb_flg |= SMB_FLAGS_LOCK_AND_READ_OK; rc = smbsr_encode_result(sr, 13, VAR_BCC, "bwwwwwwlYww2.w#c", @@ -331,9 +331,9 @@ smb_com_negotiate(smb_request_t *sr) case DOS_LANMAN2_1: case LANMAN2_1: - (void) sosetsockopt(sr->session->sock, SOL_SOCKET, SO_RCVBUF, - (const void *)&smb_dos_tcp_rcvbuf, - sizeof (smb_dos_tcp_rcvbuf)); + (void) ksocket_setsockopt(sr->session->sock, SOL_SOCKET, + SO_RCVBUF, (const void *)&smb_dos_tcp_rcvbuf, + sizeof (smb_dos_tcp_rcvbuf), CRED()); sr->smb_flg |= SMB_FLAGS_LOCK_AND_READ_OK; rc = smbsr_encode_result(sr, 13, VAR_BCC, "bwwwwwwlYww2.w#cs", @@ -356,9 +356,9 @@ smb_com_negotiate(smb_request_t *sr) break; case NT_LM_0_12: - (void) sosetsockopt(sr->session->sock, SOL_SOCKET, SO_RCVBUF, - (const void *)&smb_nt_tcp_rcvbuf, - sizeof (smb_nt_tcp_rcvbuf)); + (void) ksocket_setsockopt(sr->session->sock, SOL_SOCKET, + SO_RCVBUF, (const void 
*)&smb_nt_tcp_rcvbuf, + sizeof (smb_nt_tcp_rcvbuf), CRED()); capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_STATUS32 diff --git a/usr/src/uts/common/fs/smbsrv/smb_net.c b/usr/src/uts/common/fs/smbsrv/smb_net.c index 4593cfec6b..ef41d911db 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_net.c +++ b/usr/src/uts/common/fs/smbsrv/smb_net.c @@ -35,6 +35,7 @@ #include <sys/fs/snode.h> #include <sys/fs/dv_node.h> #include <sys/vnode.h> +#include <sys/ksocket.h> #undef mem_free /* XXX Remove this after we convert everything to kmem_alloc */ #include <smbsrv/smb_vops.h> @@ -103,58 +104,19 @@ smb_net_fini(void) * smb_iov_sorecv: Receive data into an iovec from a socket */ -struct sonode * +ksocket_t smb_socreate(int domain, int type, int protocol) { - vnode_t *dvp = NULL; - vnode_t *vp = NULL; - struct snode *csp = NULL; - int err = 0; - major_t maj; - - if ((vp = solookup(domain, type, protocol, NULL, &err)) == NULL) { - - /* - * solookup calls sogetvp if the vp is not found in the cache. - * Since the call to sogetvp is hardwired to use USERSPACE - * and declared static we'll do the work here instead. - */ - err = lookupname(type == SOCK_STREAM ? 
"/dev/tcp" : "/dev/udp", - UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); - if (err) - return (NULL); - - /* Check that it is the correct vnode */ - if (vp->v_type != VCHR) { - VN_RELE(vp); - return (NULL); - } + ksocket_t sock; + int err = 0; - csp = VTOS(VTOS(vp)->s_commonvp); - if (!(csp->s_flag & SDIPSET)) { - char *pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP); - err = ddi_dev_pathname(vp->v_rdev, S_IFCHR, - pathname); - if (err == 0) { - err = devfs_lookupname(pathname, NULLVPP, - &dvp); - } - VN_RELE(vp); - kmem_free(pathname, MAXPATHLEN); - if (err != 0) { - return (NULL); - } - vp = dvp; - } - - maj = getmajor(vp->v_rdev); - if (!STREAMSTAB(maj)) { - VN_RELE(vp); - return (NULL); - } - } + err = ksocket_socket(&sock, domain, type, protocol, KSOCKET_SLEEP, + CRED()); - return (socreate(vp, domain, type, protocol, SOV_DEFAULT, NULL, &err)); + if (err != 0) + return (NULL); + else + return (sock); } /* @@ -165,9 +127,9 @@ smb_socreate(int domain, int type, int protocol) * regain control of a thread stuck in smb_sorecv. */ void -smb_soshutdown(struct sonode *so) +smb_soshutdown(ksocket_t so) { - (void) soshutdown(so, SHUT_RDWR); + (void) ksocket_shutdown(so, SHUT_RDWR, CRED()); } /* @@ -177,82 +139,27 @@ smb_soshutdown(struct sonode *so) * behavior will result. 
*/ void -smb_sodestroy(struct sonode *so) +smb_sodestroy(ksocket_t so) { - vnode_t *vp = SOTOV(so); - - (void) VOP_CLOSE(vp, 0, 1, 0, kcred, NULL); - VN_RELE(vp); + (void) ksocket_close(so, CRED()); } int -smb_sorecv(struct sonode *so, void *msg, size_t len) +smb_sorecv(ksocket_t so, void *msg, size_t len) { - iovec_t iov; + size_t recvd; int err; ASSERT(so != NULL); ASSERT(len != 0); - /* - * Fill in iovec and receive data - */ - iov.iov_base = msg; - iov.iov_len = len; - - if ((err = smb_iov_sorecv(so, &iov, 1, len)) != 0) { + if ((err = ksocket_recv(so, msg, len, MSG_WAITALL, &recvd, + CRED())) != 0) { return (err); } /* Successful receive */ - return (0); -} - -/* - * smb_iov_sorecv - Receives an iovec from a connection - * - * This function gets the data asked for from the socket. It will return - * only when all the requested data has been retrieved or if an error - * occurs. - * - * Returns 0 for success, the socket errno value if sorecvmsg fails, and - * -1 if sorecvmsg returns success but uio_resid != 0 - */ -int -smb_iov_sorecv(struct sonode *so, iovec_t *iop, int iovlen, size_t total_len) -{ - struct msghdr msg; - struct uio uio; - int error; - - ASSERT(iop != NULL); - - /* Initialization of the message header. */ - bzero(&msg, sizeof (msg)); - msg.msg_iov = iop; - msg.msg_flags = MSG_WAITALL; - msg.msg_iovlen = iovlen; - - /* Initialization of the uio structure. */ - bzero(&uio, sizeof (uio)); - uio.uio_iov = iop; - uio.uio_iovcnt = iovlen; - uio.uio_segflg = UIO_SYSSPACE; - uio.uio_resid = total_len; - - if ((error = sorecvmsg(so, &msg, &uio)) == 0) { - /* Received data */ - if (uio.uio_resid == 0) { - /* All requested data received. Success */ - return (0); - } else { - /* Not all data was sent. Failure */ - return (-1); - } - } - - /* Receive failed */ - return (error); + return ((recvd == len) ? 0 : -1); } /* @@ -327,13 +234,12 @@ smb_net_txr_free(smb_txreq_t *txr) * queued and the routine returns immediately. 
*/ int -smb_net_txr_send(struct sonode *so, smb_txlst_t *txl, smb_txreq_t *txr) +smb_net_txr_send(ksocket_t so, smb_txlst_t *txl, smb_txreq_t *txr) { list_t local; int rc = 0; - iovec_t iov; - struct msghdr msg; - struct uio uio; + size_t sent = 0; + size_t len; ASSERT(txl->tl_magic == SMB_TXLST_MAGIC); @@ -355,25 +261,11 @@ smb_net_txr_send(struct sonode *so, smb_txlst_t *txl, smb_txreq_t *txr) ASSERT(txr->tr_magic == SMB_TXREQ_MAGIC); list_remove(&local, txr); - iov.iov_base = (void *)txr->tr_buf; - iov.iov_len = txr->tr_len; - - bzero(&msg, sizeof (msg)); - msg.msg_iov = &iov; - msg.msg_flags = MSG_WAITALL; - msg.msg_iovlen = 1; - - bzero(&uio, sizeof (uio)); - uio.uio_iov = &iov; - uio.uio_iovcnt = 1; - uio.uio_segflg = UIO_SYSSPACE; - uio.uio_resid = txr->tr_len; - - rc = sosendmsg(so, &msg, &uio); - + len = txr->tr_len; + rc = ksocket_send(so, txr->tr_buf, txr->tr_len, + MSG_WAITALL, &sent, CRED()); smb_net_txr_free(txr); - - if ((rc == 0) && (uio.uio_resid == 0)) + if ((rc == 0) && (sent == len)) continue; if (rc == 0) diff --git a/usr/src/uts/common/fs/smbsrv/smb_server.c b/usr/src/uts/common/fs/smbsrv/smb_server.c index eb3f1d82a3..9296f123be 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_server.c +++ b/usr/src/uts/common/fs/smbsrv/smb_server.c @@ -1242,7 +1242,7 @@ smb_server_listen( int pthread_create_error) { int rc; - struct sonode *s_so; + ksocket_t s_so; uint32_t on = 1; smb_session_t *session; @@ -1263,14 +1263,16 @@ smb_server_listen( if (ld->ld_so) { - (void) sosetsockopt(ld->ld_so, SOL_SOCKET, - SO_REUSEADDR, (const void *)&on, sizeof (on)); + (void) ksocket_setsockopt(ld->ld_so, SOL_SOCKET, + SO_REUSEADDR, (const void *)&on, sizeof (on), + CRED()); - rc = sobind(ld->ld_so, (struct sockaddr *)&ld->ld_sin, - sizeof (ld->ld_sin), 0, 0); + rc = ksocket_bind(ld->ld_so, + (struct sockaddr *)&ld->ld_sin, + sizeof (ld->ld_sin), CRED()); if (rc == 0) { - rc = solisten(ld->ld_so, 20); + rc = ksocket_listen(ld->ld_so, 20, CRED()); if (rc < 0) { 
cmn_err(CE_WARN, "Port %d: listen failed", port); @@ -1297,19 +1299,22 @@ smb_server_listen( DTRACE_PROBE1(so__wait__accept, struct sonode *, ld->ld_so); for (;;) { - rc = soaccept(ld->ld_so, 0, &s_so); + rc = ksocket_accept(ld->ld_so, NULL, NULL, &s_so, CRED()); if (rc == 0) { uint32_t txbuf_size = 128*1024; uint32_t on = 1; DTRACE_PROBE1(so__accept, struct sonode *, s_so); - (void) sosetsockopt(s_so, IPPROTO_TCP, TCP_NODELAY, - (const void *)&on, sizeof (on)); - (void) sosetsockopt(s_so, SOL_SOCKET, SO_KEEPALIVE, - (const void *)&on, sizeof (on)); - (void) sosetsockopt(s_so, SOL_SOCKET, SO_SNDBUF, - (const void *)&txbuf_size, sizeof (txbuf_size)); + (void) ksocket_setsockopt(s_so, IPPROTO_TCP, + TCP_NODELAY, (const void *)&on, sizeof (on), + CRED()); + (void) ksocket_setsockopt(s_so, SOL_SOCKET, + SO_KEEPALIVE, (const void *)&on, sizeof (on), + CRED()); + (void) ksocket_setsockopt(s_so, SOL_SOCKET, SO_SNDBUF, + (const void *)&txbuf_size, sizeof (txbuf_size), + CRED()); /* * Create a session for this connection. */ diff --git a/usr/src/uts/common/fs/smbsrv/smb_session.c b/usr/src/uts/common/fs/smbsrv/smb_session.c index f76c6d77d1..571dee63c3 100644 --- a/usr/src/uts/common/fs/smbsrv/smb_session.c +++ b/usr/src/uts/common/fs/smbsrv/smb_session.c @@ -634,11 +634,10 @@ smb_session_message(smb_session_t *session) * Port will be SSN_SRVC_TCP_PORT or SMB_SRVC_TCP_PORT. 
*/ smb_session_t * -smb_session_create(struct sonode *new_so, uint16_t port, smb_server_t *sv) +smb_session_create(ksocket_t new_so, uint16_t port, smb_server_t *sv) { - uint32_t ipaddr; - uint32_t local_ipaddr; struct sockaddr_in sin; + socklen_t slen; smb_session_t *session; session = kmem_cache_alloc(sv->si_cache_session, KM_SLEEP); @@ -670,13 +669,18 @@ smb_session_create(struct sonode *new_so, uint16_t port, smb_server_t *sv) smb_rwx_init(&session->s_lock); if (new_so) { - bcopy(new_so->so_faddr_sa, &sin, new_so->so_faddr_len); - ipaddr = sin.sin_addr.s_addr; - bcopy(new_so->so_laddr_sa, &sin, new_so->so_faddr_len); - local_ipaddr = sin.sin_addr.s_addr; + slen = sizeof (sin); + + (void) ksocket_getsockname(new_so, (struct sockaddr *)&sin, + &slen, CRED()); + session->local_ipaddr = sin.sin_addr.s_addr; + + slen = sizeof (sin); + (void) ksocket_getpeername(new_so, (struct sockaddr *)&sin, + &slen, CRED()); + session->ipaddr = sin.sin_addr.s_addr; + session->s_local_port = port; - session->ipaddr = ipaddr; - session->local_ipaddr = local_ipaddr; session->sock = new_so; } diff --git a/usr/src/uts/common/fs/sockfs/nl7c.c b/usr/src/uts/common/fs/sockfs/nl7c.c index 002d111c3a..fe3619ab6c 100644 --- a/usr/src/uts/common/fs/sockfs/nl7c.c +++ b/usr/src/uts/common/fs/sockfs/nl7c.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * NL7C (Network Layer 7 Cache) as part of SOCKFS provides an in-kernel * gateway cache for the request/response message based L7 protocol HTTP @@ -57,6 +55,7 @@ #include <netinet/in.h> #include <fs/sockfs/nl7c.h> #include <fs/sockfs/nl7curi.h> +#include <fs/sockfs/socktpi.h> #include <inet/nca/ncadoorhdr.h> #include <inet/nca/ncalogd.h> @@ -90,7 +89,7 @@ extern void nl7c_nca_init(void); * * This list is searched at bind(3SOCKET) time when an application doesn't * explicitly set AF_NCA but instead uses AF_INET, if a match is found then - * the underlying socket is marked so_nl7c_flags NL7C_ENABLED. + * the underlying socket is marked sti_nl7c_flags NL7C_ENABLED. */ typedef struct nl7c_addr_s { @@ -121,7 +120,7 @@ nl7c_listener_addr(void *arg, struct sonode *so) if (p->listener == NULL) p->listener = so; - so->so_nl7c_addr = arg; + SOTOTPI(so)->sti_nl7c_addr = arg; } struct sonode * @@ -256,7 +255,7 @@ nl7c_mi_report_addr(mblk_t *mp) int a4 = ip & 0xFF; (void) mi_sprintf(addr, "%d.%d.%d.%d", - a1, a2, a3, a4); + a1, a2, a3, a4); } so = p->listener; (void) mi_mpprintf(mp, "%p %s:%d %d", @@ -398,7 +397,7 @@ ncaportconf_read(void) if (ret != 0) { /* Error of some sort, tell'm about it */ cmn_err(CE_WARN, "%s: read error %d", - portconf, ret); + portconf, ret); break; } if (resid == sizeof (buf)) { @@ -564,7 +563,7 @@ ncakmodconf_read(void) if (ret != 0) { /* Error of some sort, tell'm about it */ cmn_err(CE_WARN, "%s: read error %d", - status, ret); + status, ret); break; } if (resid == sizeof (buf)) { @@ -687,7 +686,7 @@ ncalogdconf_read(void) if (ret != 0) { /* Error of some sort, tell'm about it */ cmn_err(CE_WARN, "%s: read error %d", - ncalogd, ret); + ncalogd, ret); break; } if (resid == sizeof (buf)) { @@ -933,7 +932,8 @@ boolean_t nl7c_process(struct sonode *so, boolean_t nonblocking) { vnode_t *vp = SOTOV(so); - mblk_t *rmp = so->so_nl7c_rcv_mp; + sotpi_info_t *sti = SOTOTPI(so); + mblk_t *rmp = sti->sti_nl7c_rcv_mp; 
clock_t timout; rval_t rval; uchar_t pri; @@ -942,7 +942,7 @@ nl7c_process(struct sonode *so, boolean_t nonblocking) boolean_t more; boolean_t ret = B_FALSE; boolean_t first = B_TRUE; - boolean_t pollin = (so->so_nl7c_flags & NL7C_POLLIN); + boolean_t pollin = (sti->sti_nl7c_flags & NL7C_POLLIN); nl7c_proc_cnt++; @@ -950,7 +950,7 @@ nl7c_process(struct sonode *so, boolean_t nonblocking) error = so_lock_read_intr(so, nonblocking ? FNDELAY|FNONBLOCK : 0); if (error) { /* Couldn't read lock, pass on this socket */ - so->so_nl7c_flags = 0; + sti->sti_nl7c_flags = 0; nl7c_proc_noLRI++; return (B_FALSE); } @@ -958,7 +958,7 @@ nl7c_process(struct sonode *so, boolean_t nonblocking) mutex_exit(&so->so_lock); if (pollin) - so->so_nl7c_flags &= ~NL7C_POLLIN; + sti->sti_nl7c_flags &= ~NL7C_POLLIN; /* Initialize some kstrgetmsg() constants */ pflag = MSG_ANY | MSG_DELAYERROR; @@ -966,7 +966,7 @@ nl7c_process(struct sonode *so, boolean_t nonblocking) if (nonblocking) { /* Non blocking so don't block */ timout = 0; - } else if (so->so_nl7c_flags & NL7C_SOPERSIST) { + } else if (sti->sti_nl7c_flags & NL7C_SOPERSIST) { /* 2nd or more time(s) here so use keep-alive value */ timout = nca_http_keep_alive_timeout; } else { @@ -996,18 +996,18 @@ nl7c_process(struct sonode *so, boolean_t nonblocking) /* Error of some sort */ nl7c_proc_error++; rval.r_v.r_v2 = error; - so->so_nl7c_flags = 0; + sti->sti_nl7c_flags = 0; break; } error = 0; } if (rmp != NULL) { - mblk_t *mp = so->so_nl7c_rcv_mp; + mblk_t *mp = sti->sti_nl7c_rcv_mp; if (mp == NULL) { /* Just new data, common case */ - so->so_nl7c_rcv_mp = rmp; + sti->sti_nl7c_rcv_mp = rmp; } else { /* Add new data to tail */ while (mp->b_cont != NULL) @@ -1015,13 +1015,14 @@ nl7c_process(struct sonode *so, boolean_t nonblocking) mp->b_cont = rmp; } } - if (so->so_nl7c_rcv_mp == NULL) { + if (sti->sti_nl7c_rcv_mp == NULL) { /* No data */ nl7c_proc_nodata++; if (timout > 0 || (first && pollin)) { /* Expected data so EOF */ ret = B_TRUE; - } 
else if (so->so_nl7c_flags & NL7C_SOPERSIST) { + } else if (sti->sti_nl7c_flags & + NL7C_SOPERSIST) { /* Persistent so just checking */ ret = B_FALSE; } @@ -1035,7 +1036,7 @@ nl7c_process(struct sonode *so, boolean_t nonblocking) more = nl7c_parse(so, nonblocking, &ret); - if (ret == B_TRUE && (so->so_nl7c_flags & NL7C_SOPERSIST)) { + if (ret == B_TRUE && (sti->sti_nl7c_flags & NL7C_SOPERSIST)) { /* * Parse complete, cache hit, response on its way, * socket is persistent so try to process the next @@ -1045,7 +1046,7 @@ nl7c_process(struct sonode *so, boolean_t nonblocking) ret = B_FALSE; break; } - if (so->so_nl7c_rcv_mp) { + if (sti->sti_nl7c_rcv_mp) { /* More recv-side data, pipelined */ nl7c_proc_again++; goto again; @@ -1061,10 +1062,10 @@ nl7c_process(struct sonode *so, boolean_t nonblocking) } while (more); - if (so->so_nl7c_rcv_mp) { + if (sti->sti_nl7c_rcv_mp) { nl7c_proc_rcv++; } - so->so_nl7c_rcv_rval = rval.r_vals; + sti->sti_nl7c_rcv_rval = rval.r_vals; /* Renter so_lock, caller called with it enter()ed */ mutex_enter(&so->so_lock); so_unlock_read(so); diff --git a/usr/src/uts/common/fs/sockfs/nl7c.h b/usr/src/uts/common/fs/sockfs/nl7c.h index 68914a3a58..6cd27c5efd 100644 --- a/usr/src/uts/common/fs/sockfs/nl7c.h +++ b/usr/src/uts/common/fs/sockfs/nl7c.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_SOCKFS_NL7C_H #define _SYS_SOCKFS_NL7C_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -39,8 +37,17 @@ extern "C" { #include <sys/socket.h> #include <sys/socketvar.h> + /* - * NL7C (uint64_t)(struct sonode).so_nl7c_flags: + * NCA_DEV NCA device + * + * NCA_INET_DEV TPI device for the INET based transport that NCA will use. 
+ */ +#define NCA_DEV "/dev/nca" +#define NCA_INET_DEV "/dev/tcp" + +/* + * NL7C (uint64_t)(sotpi_info_t).sti_nl7c_flags: */ #define NL7C_ENABLED 0x00000001 /* NL7C enabled socket */ @@ -71,6 +78,10 @@ void nl7c_urifree(struct sonode *); void nl7c_close(struct sonode *); boolean_t nl7c_parse(struct sonode *, boolean_t, boolean_t *); +extern void *nl7c_lookup_addr(void *, t_uscalar_t); +extern void *nl7c_add_addr(void *, t_uscalar_t); +extern void nl7c_listener_addr(void *, struct sonode *); + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/fs/sockfs/nl7chttp.c b/usr/src/uts/common/fs/sockfs/nl7chttp.c index 20f726a4c2..81dd8a99a5 100644 --- a/usr/src/uts/common/fs/sockfs/nl7chttp.c +++ b/usr/src/uts/common/fs/sockfs/nl7chttp.c @@ -19,16 +19,15 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/sysmacros.h> #include <sys/strsubr.h> #include <fs/sockfs/nl7c.h> #include <fs/sockfs/nl7curi.h> +#include <fs/sockfs/socktpi.h> #include <inet/nca/ncadoorhdr.h> #include <inet/nca/ncalogd.h> @@ -578,7 +577,7 @@ http_date2time_t(char *cp, char *ep) leap--; leap = leap / 4 - leap / 100 + leap / 400 - zeroleap; secs = ((((year - 1970) * 365 + dom[month] + day - 1 + leap) * 24 - + hour) * 60 + min) * 60 + sec; + + hour) * 60 + min) * 60 + sec; return (secs); } @@ -1167,7 +1166,7 @@ nl7c_http_cond(uri_desc_t *req, uri_desc_t *res) mblk_t * nl7c_http_persist(struct sonode *so) { - uint64_t flags = so->so_nl7c_flags & NL7C_SCHEMEPRIV; + uint64_t flags = SOTOTPI(so)->sti_nl7c_flags & NL7C_SCHEMEPRIV; mblk_t *mp; if (flags & HTTP_CONN_CL) @@ -1187,6 +1186,7 @@ nl7c_http_persist(struct sonode *so) boolean_t nl7c_http_request(char **cpp, char *ep, uri_desc_t *uri, struct sonode *so) { + sotpi_info_t *sti = SOTOTPI(so); http_t *http = kmem_cache_alloc(http_kmc, KM_SLEEP); char *cp = 
*cpp; char *hp; @@ -1429,20 +1429,20 @@ done: * */ if (persist) - so->so_nl7c_flags |= NL7C_SOPERSIST; + sti->sti_nl7c_flags |= NL7C_SOPERSIST; else - so->so_nl7c_flags &= ~NL7C_SOPERSIST; + sti->sti_nl7c_flags &= ~NL7C_SOPERSIST; if (http->major == 1) { - so->so_nl7c_flags &= ~NL7C_SCHEMEPRIV; + sti->sti_nl7c_flags &= ~NL7C_SCHEMEPRIV; if (http->minor >= 1) { if (! persist) - so->so_nl7c_flags |= HTTP_CONN_CL; + sti->sti_nl7c_flags |= HTTP_CONN_CL; } else { if (persist) - so->so_nl7c_flags |= HTTP_CONN_KA; + sti->sti_nl7c_flags |= HTTP_CONN_KA; else - so->so_nl7c_flags |= HTTP_CONN_CL; + sti->sti_nl7c_flags |= HTTP_CONN_CL; } } /* @@ -1464,6 +1464,7 @@ more: boolean_t nl7c_http_response(char **cpp, char *ep, uri_desc_t *uri, struct sonode *so) { + sotpi_info_t *sti = SOTOTPI(so); http_t *http = uri->scheme; char *cp = *cpp; char *hp; @@ -1753,20 +1754,20 @@ done: /* Set socket persist state */ if (persist) - so->so_nl7c_flags |= NL7C_SOPERSIST; + sti->sti_nl7c_flags |= NL7C_SOPERSIST; else - so->so_nl7c_flags &= ~NL7C_SOPERSIST; + sti->sti_nl7c_flags &= ~NL7C_SOPERSIST; if (http->major == 1) { - so->so_nl7c_flags &= ~NL7C_SCHEMEPRIV; + sti->sti_nl7c_flags &= ~NL7C_SCHEMEPRIV; if (http->minor >= 1) { if (! 
persist) - so->so_nl7c_flags |= HTTP_CONN_CL; + sti->sti_nl7c_flags |= HTTP_CONN_CL; } else { if (persist) - so->so_nl7c_flags |= HTTP_CONN_KA; + sti->sti_nl7c_flags |= HTTP_CONN_KA; else - so->so_nl7c_flags |= HTTP_CONN_CL; + sti->sti_nl7c_flags |= HTTP_CONN_CL; } } diff --git a/usr/src/uts/common/fs/sockfs/nl7curi.c b/usr/src/uts/common/fs/sockfs/nl7curi.c index fb1bf2f000..61f72258fc 100644 --- a/usr/src/uts/common/fs/sockfs/nl7curi.c +++ b/usr/src/uts/common/fs/sockfs/nl7curi.c @@ -33,6 +33,7 @@ #include <sys/sendfile.h> #include <fs/sockfs/nl7c.h> #include <fs/sockfs/nl7curi.h> +#include <fs/sockfs/socktpi_impl.h> #include <inet/common.h> #include <inet/ip.h> @@ -1017,9 +1018,10 @@ next: void nl7c_urifree(struct sonode *so) { - uri_desc_t *uri = (uri_desc_t *)so->so_nl7c_uri; + sotpi_info_t *sti = SOTOTPI(so); + uri_desc_t *uri = (uri_desc_t *)sti->sti_nl7c_uri; - so->so_nl7c_uri = NULL; + sti->sti_nl7c_uri = NULL; if (uri->hash != URI_TEMP) { uri_delete(uri); mutex_enter(&uri->proclock); @@ -1109,7 +1111,8 @@ pass: int nl7c_data(struct sonode *so, uio_t *uio) { - uri_desc_t *uri = (uri_desc_t *)so->so_nl7c_uri; + sotpi_info_t *sti = SOTOTPI(so); + uri_desc_t *uri = (uri_desc_t *)sti->sti_nl7c_uri; iovec_t *iov; int cnt; int sz = uio->uio_resid; @@ -1123,13 +1126,13 @@ nl7c_data(struct sonode *so, uio_t *uio) if (uri == NULL) { /* Socket & NL7C out of sync, disable NL7C */ - so->so_nl7c_flags = 0; + sti->sti_nl7c_flags = 0; nl7c_uri_NULL1++; return (-1); } - if (so->so_nl7c_flags & NL7C_WAITWRITE) { - so->so_nl7c_flags &= ~NL7C_WAITWRITE; + if (sti->sti_nl7c_flags & NL7C_WAITWRITE) { + sti->sti_nl7c_flags &= ~NL7C_WAITWRITE; first = B_TRUE; } else { first = B_FALSE; @@ -1191,9 +1194,9 @@ nl7c_data(struct sonode *so, uio_t *uio) * so close the URI processing for this so. */ nl7c_close(so); - if (! (so->so_nl7c_flags & NL7C_SOPERSIST)) { + if (! 
(sti->sti_nl7c_flags & NL7C_SOPERSIST)) { /* Not a persistent connection */ - so->so_nl7c_flags = 0; + sti->sti_nl7c_flags = 0; } } @@ -1203,7 +1206,7 @@ fail: if (alloc != NULL) { kmem_free(alloc, sz); } - so->so_nl7c_flags = 0; + sti->sti_nl7c_flags = 0; nl7c_urifree(so); return (error); @@ -1275,7 +1278,8 @@ int nl7c_sendfilev(struct sonode *so, u_offset_t *fileoff, sendfilevec_t *sfvp, int sfvc, ssize_t *xfer) { - uri_desc_t *uri = (uri_desc_t *)so->so_nl7c_uri; + sotpi_info_t *sti = SOTOTPI(so); + uri_desc_t *uri = (uri_desc_t *)sti->sti_nl7c_uri; file_t *fp = NULL; vnode_t *vp = NULL; char *data = NULL; @@ -1294,13 +1298,13 @@ nl7c_sendfilev(struct sonode *so, u_offset_t *fileoff, sendfilevec_t *sfvp, if (uri == NULL) { /* Socket & NL7C out of sync, disable NL7C */ - so->so_nl7c_flags = 0; + sti->sti_nl7c_flags = 0; nl7c_uri_NULL2++; return (0); } - if (so->so_nl7c_flags & NL7C_WAITWRITE) - so->so_nl7c_flags &= ~NL7C_WAITWRITE; + if (sti->sti_nl7c_flags & NL7C_WAITWRITE) + sti->sti_nl7c_flags &= ~NL7C_WAITWRITE; while (sfvc-- > 0) { /* @@ -1435,15 +1439,18 @@ nl7c_sendfilev(struct sonode *so, u_offset_t *fileoff, sendfilevec_t *sfvp, * so close the URI processing for this so. */ nl7c_close(so); - if (! (so->so_nl7c_flags & NL7C_SOPERSIST)) { + if (! 
(sti->sti_nl7c_flags & NL7C_SOPERSIST)) { /* Not a persistent connection */ - so->so_nl7c_flags = 0; + sti->sti_nl7c_flags = 0; } } return (0); fail: + if (error == EPIPE) + tsignal(curthread, SIGPIPE); + if (alloc != NULL) kmem_free(data, len); @@ -1457,7 +1464,7 @@ fail: atomic_add_64(&nl7c_uri_bytes, total_count); } - so->so_nl7c_flags = 0; + sti->sti_nl7c_flags = 0; nl7c_urifree(so); return (error); @@ -1472,7 +1479,8 @@ fail: void nl7c_close(struct sonode *so) { - uri_desc_t *uri = (uri_desc_t *)so->so_nl7c_uri; + sotpi_info_t *sti = SOTOTPI(so); + uri_desc_t *uri = (uri_desc_t *)sti->sti_nl7c_uri; if (uri == NULL) { /* @@ -1484,7 +1492,7 @@ nl7c_close(struct sonode *so) } return; } - so->so_nl7c_uri = NULL; + sti->sti_nl7c_uri = NULL; if (uri->hash != URI_TEMP) { mutex_enter(&uri->proclock); uri->proc = NULL; @@ -1679,7 +1687,6 @@ kstrwritempnoqwait(struct vnode *vp, mblk_t *mp) if (error != 0) { if (!(stp->sd_flag & STPLEX) && (stp->sd_wput_opt & SW_SIGPIPE)) { - tsignal(curthread, SIGPIPE); error = EPIPE; } return (error); @@ -1700,7 +1707,7 @@ uri_rd_response(struct sonode *so, boolean_t first) { vnode_t *vp = SOTOV(so); - int max_mblk = (int)((tcp_t *)so->so_priv)->tcp_mss; + int max_mblk = (int)vp->v_stream->sd_maxblk; int wsz; mblk_t *mp, *wmp, *persist; int write_bytes; @@ -1934,8 +1941,9 @@ static char pchars[] = { boolean_t nl7c_parse(struct sonode *so, boolean_t nonblocking, boolean_t *ret) { - char *cp = (char *)so->so_nl7c_rcv_mp->b_rptr; - char *ep = (char *)so->so_nl7c_rcv_mp->b_wptr; + sotpi_info_t *sti = SOTOTPI(so); + char *cp = (char *)sti->sti_nl7c_rcv_mp->b_rptr; + char *ep = (char *)sti->sti_nl7c_rcv_mp->b_wptr; char *get = "GET "; char *post = "POST "; char c; @@ -1945,7 +1953,7 @@ nl7c_parse(struct sonode *so, boolean_t nonblocking, boolean_t *ret) mblk_t *reqmp; uint32_t hv = 0; - if ((reqmp = dupb(so->so_nl7c_rcv_mp)) == NULL) { + if ((reqmp = dupb(sti->sti_nl7c_rcv_mp)) == NULL) { nl7c_uri_pass_dupbfail++; goto pass; } @@ -1965,7 
+1973,7 @@ nl7c_parse(struct sonode *so, boolean_t nonblocking, boolean_t *ret) /* * Set request time to current time. */ - so->so_nl7c_rtime = gethrestime_sec(); + sti->sti_nl7c_rtime = gethrestime_sec(); /* * Parse the Request-Line for the URI. @@ -2043,7 +2051,7 @@ nl7c_parse(struct sonode *so, boolean_t nonblocking, boolean_t *ret) } if (uri->hash == URI_TEMP) { - if (so->so_nl7c_flags & NL7C_SOPERSIST) { + if (sti->sti_nl7c_flags & NL7C_SOPERSIST) { /* Temporary URI so skip hash processing */ nl7c_uri_request++; nl7c_uri_temp++; @@ -2073,10 +2081,10 @@ nl7c_parse(struct sonode *so, boolean_t nonblocking, boolean_t *ret) * We have the response cached, update recv mblk rptr * to reflect the data consumed in parse. */ - mblk_t *mp = so->so_nl7c_rcv_mp; + mblk_t *mp = sti->sti_nl7c_rcv_mp; if (cp == (char *)mp->b_wptr) { - so->so_nl7c_rcv_mp = mp->b_cont; + sti->sti_nl7c_rcv_mp = mp->b_cont; mp->b_cont = NULL; freeb(mp); } else { @@ -2094,12 +2102,12 @@ nl7c_parse(struct sonode *so, boolean_t nonblocking, boolean_t *ret) if (so->so_family == AF_INET) { /* Only support IPv4 addrs */ faddr = ((struct sockaddr_in *) - so->so_faddr_sa) ->sin_addr.s_addr; + sti->sti_faddr_sa) ->sin_addr.s_addr; } else { faddr = 0; } /* XXX need to pass response type, e.g. 200, 304 */ - nl7c_logd_log(ruri, uri, so->so_nl7c_rtime, faddr); + nl7c_logd_log(ruri, uri, sti->sti_nl7c_rtime, faddr); } /* * Release reference on request URI, send the response out @@ -2125,11 +2133,11 @@ temp: * read-side processing is suspended (so the next read() gets * the request data) until a write() is processed by NL7C. * - * Note, so->so_nl7c_uri now owns the REF_INIT() ref. + * Note, sti->sti_nl7c_uri now owns the REF_INIT() ref. 
*/ uri->proc = so; - so->so_nl7c_uri = uri; - so->so_nl7c_flags |= NL7C_WAITWRITE; + sti->sti_nl7c_uri = uri; + sti->sti_nl7c_flags |= NL7C_WAITWRITE; *ret = B_FALSE; return (B_FALSE); @@ -2147,7 +2155,7 @@ pass: if (uri) { REF_RELE(uri); } - so->so_nl7c_flags = 0; + sti->sti_nl7c_flags = 0; *ret = B_FALSE; return (B_FALSE); } diff --git a/usr/src/uts/common/fs/sockfs/sockcommon.c b/usr/src/uts/common/fs/sockfs/sockcommon.c new file mode 100644 index 0000000000..02c3c16df5 --- /dev/null +++ b/usr/src/uts/common/fs/sockfs/sockcommon.c @@ -0,0 +1,1092 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/debug.h> +#include <sys/cmn_err.h> +#include <sys/vfs.h> +#include <sys/policy.h> +#include <sys/modctl.h> + +#include <sys/sunddi.h> + +#include <sys/strsun.h> +#include <sys/stropts.h> +#include <sys/strsubr.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sodirect.h> +#include <sys/uio.h> + +#include <inet/ipclassifier.h> +#include <fs/sockfs/sockcommon.h> +#include <fs/sockfs/nl7c.h> +#include <inet/ip.h> + +extern int xnet_skip_checks, xnet_check_print, xnet_truncate_print; + +static struct kmem_cache *sock_sod_cache; + +/* + * Common socket access functions. + * + * Instead of accessing the sonode switch directly (i.e., SOP_xxx()), + * the socket_xxx() function should be used. + */ + +/* + * Try to create a new sonode of the requested <family, type, protocol>. + */ +/* ARGSUSED */ +struct sonode * +socket_create(int family, int type, int protocol, char *devpath, char *mod, + int flags, int version, struct cred *cr, int *errorp) +{ + struct sonode *so; + struct sockparams *sp = NULL; + + /* + * Look for a sockparams entry that match the given criteria. + * solookup() returns with the entry held. + */ + *errorp = solookup(family, type, protocol, &sp); + if (sp == NULL) { + int kmflags = (flags == SOCKET_SLEEP) ? KM_SLEEP : KM_NOSLEEP; + /* + * There is no matching sockparams entry. An ephemeral entry is + * created if the caller specifies a device or a socket module. 
+ */ + if (devpath != NULL) { + sp = sockparams_hold_ephemeral_bydev(family, type, + protocol, devpath, kmflags, errorp); + } else if (mod != NULL) { + sp = sockparams_hold_ephemeral_bymod(family, type, + protocol, mod, kmflags, errorp); + } else { + return (NULL); + } + + if (sp == NULL) + return (NULL); + } + + ASSERT(sp->sp_smod_info != NULL); + ASSERT(flags == SOCKET_SLEEP || flags == SOCKET_NOSLEEP); + so = sp->sp_smod_info->smod_sock_create_func(sp, family, type, + protocol, version, flags, errorp, cr); + if (so == NULL) { + SOCKPARAMS_DEC_REF(sp); + } else { + if ((*errorp = SOP_INIT(so, NULL, cr, flags)) == 0) { + /* Cannot fail, only bumps so_count */ + (void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL); + } else { + socket_destroy(so); + so = NULL; + } + } + return (so); +} + +struct sonode * +socket_newconn(struct sonode *parent, sock_lower_handle_t lh, + sock_downcalls_t *dc, int flags, int *errorp) +{ + struct sonode *so; + struct sockparams *sp; + struct cred *cr; + + if ((cr = CRED()) == NULL) + cr = kcred; + + sp = parent->so_sockparams; + ASSERT(sp != NULL); + + so = sp->sp_smod_info->smod_sock_create_func(sp, parent->so_family, + parent->so_type, parent->so_protocol, parent->so_version, flags, + errorp, cr); + if (so != NULL) { + SOCKPARAMS_INC_REF(sp); + + so->so_proto_handle = lh; + so->so_downcalls = dc; + /* + * This function may be called in interrupt context, and CRED() + * will be NULL. In this case, pass in kcred. + */ + if ((*errorp = SOP_INIT(so, parent, cr, flags)) == 0) { + /* Cannot fail, only bumps so_count */ + (void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL); + } else { + socket_destroy(so); + so = NULL; + } + } + + return (so); +} + +/* + * Bind local endpoint. + */ +int +socket_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen, + int flags, cred_t *cr) +{ + return (SOP_BIND(so, name, namelen, flags, cr)); +} + +/* + * Turn socket into a listen socket. 
+ */ +int +socket_listen(struct sonode *so, int backlog, cred_t *cr) +{ + if (backlog < 0) { + backlog = 0; + } + + /* + * Use the same qlimit as in BSD. BSD checks the qlimit + * before queuing the next connection implying that a + * listen(sock, 0) allows one connection to be queued. + * BSD also uses 1.5 times the requested backlog. + * + * XNS Issue 4 required a strict interpretation of the backlog. + * This has been waived subsequently for Issue 4 and the change + * incorporated in XNS Issue 5. So we aren't required to do + * anything special for XPG apps. + */ + if (backlog >= (INT_MAX - 1) / 3) + backlog = INT_MAX; + else + backlog = backlog * 3 / 2 + 1; + + return (SOP_LISTEN(so, backlog, cr)); +} + +/* + * Accept incoming connection. + */ +int +socket_accept(struct sonode *lso, int fflag, cred_t *cr, struct sonode **nsop) +{ + return (SOP_ACCEPT(lso, fflag, cr, nsop)); +} + +/* + * Active open. + */ +int +socket_connect(struct sonode *so, const struct sockaddr *name, + socklen_t namelen, int fflag, int flags, cred_t *cr) +{ + int error; + + /* + * Handle a connect to a name parameter of type AF_UNSPEC like a + * connect to a null address. This is the portable method to + * unconnect a socket. + */ + if ((namelen >= sizeof (sa_family_t)) && + (name->sa_family == AF_UNSPEC)) { + name = NULL; + namelen = 0; + } + + error = SOP_CONNECT(so, name, namelen, fflag, flags, cr); + + if (error == EHOSTUNREACH && flags & _SOCONNECT_XPG4_2) { + /* + * X/Open specification contains a requirement that + * ENETUNREACH be returned but does not require + * EHOSTUNREACH. In order to keep the test suite + * happy we mess with the errno here. + */ + error = ENETUNREACH; + } + + return (error); +} + +/* + * Get address of remote node. + */ +int +socket_getpeername(struct sonode *so, struct sockaddr *addr, + socklen_t *addrlen, boolean_t accept, cred_t *cr) +{ + ASSERT(*addrlen > 0); + return (SOP_GETPEERNAME(so, addr, addrlen, accept, cr)); + +} + +/* + * Get local address. 
+ */ +int +socket_getsockname(struct sonode *so, struct sockaddr *addr, + socklen_t *addrlen, cred_t *cr) +{ + return (SOP_GETSOCKNAME(so, addr, addrlen, cr)); + +} + +/* + * Called from shutdown(). + */ +int +socket_shutdown(struct sonode *so, int how, cred_t *cr) +{ + return (SOP_SHUTDOWN(so, how, cr)); +} + +/* + * Get socket options. + */ +/*ARGSUSED*/ +int +socket_getsockopt(struct sonode *so, int level, int option_name, + void *optval, socklen_t *optlenp, int flags, cred_t *cr) +{ + return (SOP_GETSOCKOPT(so, level, option_name, optval, + optlenp, flags, cr)); +} + +/* + * Set socket options + */ +int +socket_setsockopt(struct sonode *so, int level, int option_name, + const void *optval, t_uscalar_t optlen, cred_t *cr) +{ + /* Caller allocates aligned optval, or passes null */ + ASSERT(((uintptr_t)optval & (sizeof (t_scalar_t) - 1)) == 0); + /* If optval is null optlen is 0, and vice-versa */ + ASSERT(optval != NULL || optlen == 0); + ASSERT(optlen != 0 || optval == NULL); + + /* No options should be zero-length */ + if (optlen == 0) + return (EINVAL); + + return (SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr)); +} + +int +socket_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, + cred_t *cr) +{ + int error = 0; + ssize_t orig_resid = uiop->uio_resid; + + /* + * Do not bypass the cache if we are doing a local (AF_UNIX) write. 
+ */ + if (so->so_family == AF_UNIX) + uiop->uio_extflg |= UIO_COPY_CACHED; + else + uiop->uio_extflg &= ~UIO_COPY_CACHED; + + error = SOP_SENDMSG(so, msg, uiop, cr); + switch (error) { + default: + break; + case EINTR: + case ETIME: + case EWOULDBLOCK: + /* We did a partial send */ + if (uiop->uio_resid != orig_resid) + error = 0; + break; + case EPIPE: + if ((so->so_mode & SM_KERNEL) == 0) + tsignal(curthread, SIGPIPE); + break; + } + + return (error); +} + +int +socket_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag, + struct cred *cr, mblk_t **mpp) +{ + int error = 0; + + error = SOP_SENDMBLK(so, msg, fflag, cr, mpp); + if (error == EPIPE) { + tsignal(curthread, SIGPIPE); + } + return (error); +} + +int +socket_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, + cred_t *cr) +{ + int error; + ssize_t orig_resid = uiop->uio_resid; + + /* + * Do not bypass the cache when reading data, as the application + * is likely to access the data shortly. + */ + uiop->uio_extflg |= UIO_COPY_CACHED; + + error = SOP_RECVMSG(so, msg, uiop, cr); + + switch (error) { + case EINTR: + case ETIME: + case EWOULDBLOCK: + /* We did a partial read */ + if (uiop->uio_resid != orig_resid) + error = 0; + break; + default: + break; + } + return (error); +} + +int +socket_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode, + struct cred *cr, int32_t *rvalp) +{ + return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp)); +} + +int +socket_poll(struct sonode *so, short events, int anyyet, short *reventsp, + struct pollhead **phpp) +{ + return (SOP_POLL(so, events, anyyet, reventsp, phpp)); +} + +int +socket_close(struct sonode *so, int flag, struct cred *cr) +{ + return (VOP_CLOSE(SOTOV(so), flag, 1, 0, cr, NULL)); +} + +int +socket_close_internal(struct sonode *so, int flag, cred_t *cr) +{ + ASSERT(so->so_count == 0); + + return (SOP_CLOSE(so, flag, cr)); +} + +void +socket_destroy(struct sonode *so) +{ + vn_invalid(SOTOV(so)); + VN_RELE(SOTOV(so)); +} + +/* ARGSUSED 
*/ +void +socket_destroy_internal(struct sonode *so, cred_t *cr) +{ + struct sockparams *sp = so->so_sockparams; + ASSERT(so->so_count == 0 && sp != NULL); + + sp->sp_smod_info->smod_sock_destroy_func(so); + + SOCKPARAMS_DEC_REF(sp); +} + +/* + * TODO Once the common vnode ops is available, then the vnops argument + * should be removed. + */ +/*ARGSUSED*/ +int +sonode_constructor(void *buf, void *cdrarg, int kmflags) +{ + struct sonode *so = buf; + struct vnode *vp; + + vp = so->so_vnode = vn_alloc(kmflags); + if (vp == NULL) { + return (-1); + } + vp->v_data = so; + vn_setops(vp, socket_vnodeops); + + so->so_priv = NULL; + so->so_oobmsg = NULL; + + so->so_proto_handle = NULL; + + so->so_peercred = NULL; + + so->so_rcv_queued = 0; + so->so_rcv_q_head = NULL; + so->so_rcv_q_last_head = NULL; + so->so_rcv_head = NULL; + so->so_rcv_last_head = NULL; + so->so_rcv_wanted = 0; + so->so_rcv_timer_interval = SOCKET_NO_RCVTIMER; + so->so_rcv_timer_tid = 0; + so->so_rcv_thresh = 0; + + so->so_acceptq_head = NULL; + so->so_acceptq_tail = &so->so_acceptq_head; + so->so_acceptq_next = NULL; + so->so_acceptq_len = 0; + so->so_backlog = 0; + + so->so_snd_qfull = B_FALSE; + + mutex_init(&so->so_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&so->so_acceptq_lock, NULL, MUTEX_DEFAULT, NULL); + rw_init(&so->so_fallback_rwlock, NULL, RW_DEFAULT, NULL); + cv_init(&so->so_state_cv, NULL, CV_DEFAULT, NULL); + cv_init(&so->so_want_cv, NULL, CV_DEFAULT, NULL); + + cv_init(&so->so_acceptq_cv, NULL, CV_DEFAULT, NULL); + cv_init(&so->so_snd_cv, NULL, CV_DEFAULT, NULL); + cv_init(&so->so_rcv_cv, NULL, CV_DEFAULT, NULL); + cv_init(&so->so_copy_cv, NULL, CV_DEFAULT, NULL); + cv_init(&so->so_closing_cv, NULL, CV_DEFAULT, NULL); + + return (0); +} + +/*ARGSUSED*/ +void +sonode_destructor(void *buf, void *cdrarg) +{ + struct sonode *so = buf; + struct vnode *vp = SOTOV(so); + + ASSERT(so->so_priv == NULL); + ASSERT(so->so_peercred == NULL); + + ASSERT(so->so_oobmsg == NULL); + + 
ASSERT(so->so_rcv_q_head == NULL);
+
+	ASSERT(so->so_acceptq_head == NULL);
+	ASSERT(so->so_acceptq_tail == &so->so_acceptq_head);
+	ASSERT(so->so_acceptq_next == NULL);
+
+	ASSERT(vp->v_data == so);
+	ASSERT(vn_matchops(vp, socket_vnodeops));
+
+	vn_free(vp);
+
+	mutex_destroy(&so->so_lock);
+	mutex_destroy(&so->so_acceptq_lock);
+	rw_destroy(&so->so_fallback_rwlock);
+
+	/* Tear down every condition variable set up by sonode_constructor() */
+	cv_destroy(&so->so_state_cv);
+	cv_destroy(&so->so_want_cv);
+	cv_destroy(&so->so_acceptq_cv);
+	cv_destroy(&so->so_snd_cv);
+	cv_destroy(&so->so_rcv_cv);
+	cv_destroy(&so->so_copy_cv);
+	cv_destroy(&so->so_closing_cv);
+}
+
+void
+sonode_init(struct sonode *so, struct sockparams *sp, int family,
+    int type, int protocol, sonodeops_t *sops)
+{
+	vnode_t *vp;
+
+	vp = SOTOV(so);
+
+	so->so_flag = 0;
+
+	so->so_state = 0;
+	so->so_mode = 0;
+
+	so->so_count = 0;
+
+	so->so_family = family;
+	so->so_type = type;
+	so->so_protocol = protocol;
+
+	SOCK_CONNID_INIT(so->so_proto_connid);
+
+	so->so_options = 0;
+	so->so_linger.l_onoff = 0;
+	so->so_linger.l_linger = 0;
+	so->so_sndbuf = 0;
+	so->so_error = 0;
+	so->so_rcvtimeo = 0;
+	so->so_sndtimeo = 0;
+
+	ASSERT(so->so_oobmsg == NULL);
+	so->so_oobmark = 0;
+	so->so_pgrp = 0;
+
+	ASSERT(so->so_peercred == NULL);
+
+	so->so_zoneid = getzoneid();
+
+	so->so_sockparams = sp;
+
+	so->so_ops = sops;
+
+	so->so_proto_handle = NULL;
+
+	so->so_downcalls = NULL;
+
+	so->so_copyflag = 0;
+
+	ASSERT(so->so_acceptq_head == NULL);
+	ASSERT(so->so_acceptq_tail == &so->so_acceptq_head);
+	ASSERT(so->so_acceptq_next == NULL);
+
+	vn_reinit(vp);
+	vp->v_vfsp = rootvfs;
+	vp->v_type = VSOCK;
+	vp->v_rdev = sockdev;
+
+	so->so_rcv_queued = 0;
+	so->so_rcv_q_head = NULL;
+	so->so_rcv_q_last_head = NULL;
+	so->so_rcv_head = NULL;
+	so->so_rcv_last_head = NULL;
+
+	so->so_snd_qfull = B_FALSE;
+	so->so_minpsz = 0;
+
+	so->so_rcv_wakeup = B_FALSE;
+	so->so_snd_wakeup = B_FALSE;
+	so->so_flowctrld = B_FALSE;
+
+	so->so_pollev = 0;
+	bzero(&so->so_poll_list, sizeof (so->so_poll_list));
+	
bzero(&so->so_proto_props, sizeof (struct sock_proto_props)); + + bzero(&(so->so_ksock_callbacks), sizeof (ksocket_callbacks_t)); + so->so_ksock_cb_arg = NULL; + + so->so_max_addr_len = sizeof (struct sockaddr_storage); + + so->so_direct = NULL; + + vn_exists(vp); +} + +void +sonode_fini(struct sonode *so) +{ + mblk_t *mp; + vnode_t *vp; + + ASSERT(so->so_count == 0); + + if (so->so_rcv_timer_tid) { + ASSERT(MUTEX_NOT_HELD(&so->so_lock)); + (void) untimeout(so->so_rcv_timer_tid); + so->so_rcv_timer_tid = 0; + } + + so_acceptq_flush(so); + +#ifdef DEBUG + mutex_enter(&so->so_lock); + ASSERT(so_verify_oobstate(so)); + mutex_exit(&so->so_lock); +#endif /* DEBUG */ + if ((mp = so->so_oobmsg) != NULL) { + freemsg(mp); + so->so_oobmsg = NULL; + so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA| + SS_RCVATMARK); + } + + if (so->so_poll_list.ph_list != NULL) { + pollwakeup(&so->so_poll_list, POLLERR); + pollhead_clean(&so->so_poll_list); + } + + if (so->so_direct != NULL) { + sodirect_t *sodp = so->so_direct; + + ASSERT(sodp->sod_uioafh == NULL); + + so->so_direct = NULL; + kmem_cache_free(sock_sod_cache, sodp); + } + + vp = SOTOV(so); + vn_invalid(vp); + + if (so->so_peercred != NULL) { + crfree(so->so_peercred); + so->so_peercred = NULL; + } +} + +/* + * This function is called at the beginning of recvmsg(). + * + * If I/OAT is enabled on this sonode, initialize the uioa state machine + * with state UIOA_ALLOC. 
+ */
+uio_t *
+sod_rcv_init(struct sonode *so, int flags, struct uio **uiopp)
+{
+	struct uio *suiop;
+	struct uio *uiop;
+	sodirect_t *sodp = so->so_direct;
+
+	/* No sodirect state on this sonode, nothing to set up */
+	if (sodp == NULL)
+		return (NULL);
+
+	suiop = NULL;
+	uiop = *uiopp;
+
+	/* sodp is known to be non-NULL from here on */
+	mutex_enter(sodp->sod_lockp);
+	if (uiop->uio_resid >= uioasync.mincnt &&
+	    (sodp->sod_state & SOD_ENABLED) &&
+	    uioasync.enabled && !(flags & MSG_PEEK) &&
+	    !(so->so_state & SS_CANTRCVMORE)) {
+		/*
+		 * Big enough I/O for uioa min setup and an sodirect socket
+		 * and sodirect enabled and uioa enabled and I/O will be done
+		 * and not EOF so initialize the sodirect_t uioa_t with "uiop".
+		 */
+		if (!uioainit(uiop, &sodp->sod_uioa)) {
+			/*
+			 * Successful uioainit() so the uio_t part of the
+			 * uioa_t will be used for all uio_t work to follow,
+			 * we return the original "uiop" in "suiop".
+			 */
+			suiop = uiop;
+			*uiopp = (uio_t *)&sodp->sod_uioa;
+			/*
+			 * Before returning to the caller the passed in uio_t
+			 * "uiop" will be updated via a call to uioafini()
+			 * below.
+			 *
+			 * Note, the uioa.uioa_state isn't set to UIOA_ENABLED
+			 * here as first we have to uioamove() any currently
+			 * queued M_DATA mblk_t(s) so it will be done later.
+			 */
+		}
+		/*
+		 * In either uioainit() success or not case note the number
+		 * of uio bytes the caller wants for sod framework and/or
+		 * transport (e.g. TCP) strategy.
+		 */
+		sodp->sod_want = uiop->uio_resid;
+	} else if (sodp->sod_state & SOD_ENABLED) {
+		/*
+		 * No uioa but still using sodirect so note the number of
+		 * uio bytes the caller wants for sodirect framework and/or
+		 * transport (e.g. TCP) strategy.
+		 */
+		sodp->sod_want = uiop->uio_resid;
+	}
+	mutex_exit(sodp->sod_lockp);
+
+	return (suiop);
+}
+
+/*
+ * This function is called at the end of recvmsg(), it finalizes all the I/OAT
+ * operations, and resets the uioa state to UIOA_ALLOC.
+ */
+int
+sod_rcv_done(struct sonode *so, struct uio *suiop, struct uio *uiop)
+{
+	int error = 0;
+	sodirect_t *sodp = so->so_direct;
+	mblk_t *mp;
+
+	if (sodp == NULL) {
+		return (0);
+	}
+
+	ASSERT(MUTEX_HELD(sodp->sod_lockp));
+	/* Finish any sodirect and uioa processing */
+	if (suiop != NULL) {
+		/* Finish any uioa_t processing */
+
+		ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
+		error = uioafini(suiop, (uioa_t *)uiop);
+		if ((mp = sodp->sod_uioafh) != NULL) {
+			sodp->sod_uioafh = NULL;
+			sodp->sod_uioaft = NULL;
+			freemsg(mp);
+		}
+	}
+	ASSERT(sodp->sod_uioafh == NULL);
+	if (!(sodp->sod_state & SOD_WAKE_NOT)) {
+		/* Awoke */
+		sodp->sod_state &= SOD_WAKE_CLR;
+		sodp->sod_state |= SOD_WAKE_NOT;
+	}
+	/* Last, clear sod_want value */
+	sodp->sod_want = 0;
+
+	return (error);
+}
+
+/*
+ * Schedule a uioamove() on a mblk. This is usually called from
+ * protocols (e.g. TCP) on an I/OAT enabled sonode.
+ */
+mblk_t *
+sod_uioa_mblk_init(struct sodirect_s *sodp, mblk_t *mp, size_t msg_size)
+{
+	uioa_t *uioap = &sodp->sod_uioa;
+	mblk_t *mp1 = mp;
+	mblk_t *lmp = NULL;
+
+	ASSERT(DB_TYPE(mp) == M_DATA);
+	ASSERT(msg_size == msgdsize(mp));
+
+	/* Caller must have lock held */
+	ASSERT(MUTEX_HELD(sodp->sod_lockp));
+
+	if (uioap->uioa_state & UIOA_ENABLED) {
+		/* Uioa is enabled */
+
+		if (msg_size > uioap->uio_resid) {
+			/*
+			 * There isn't enough uio space for the mblk_t chain
+			 * so disable uioa such that this and any additional
+			 * mblk_t data is handled by the socket and schedule
+			 * the socket for wakeup to finish this uioa.
+ */ + uioap->uioa_state &= UIOA_CLR; + uioap->uioa_state |= UIOA_FINI; + if (sodp->sod_state & SOD_WAKE_NOT) { + sodp->sod_state &= SOD_WAKE_CLR; + sodp->sod_state |= SOD_WAKE_NEED; + } + return (mp); + } + do { + uint32_t len = MBLKL(mp1); + + if (!uioamove(mp1->b_rptr, len, UIO_READ, uioap)) { + /* Scheduled, mark dblk_t as such */ + DB_FLAGS(mp1) |= DBLK_UIOA; + } else { + /* Error, turn off async processing */ + uioap->uioa_state &= UIOA_CLR; + uioap->uioa_state |= UIOA_FINI; + break; + } + lmp = mp1; + } while ((mp1 = mp1->b_cont) != NULL); + + if (mp1 != NULL || uioap->uio_resid == 0) { + /* + * Not all mblk_t(s) uioamoved (error) or all uio + * space has been consumed so schedule the socket + * for wakeup to finish this uio. + */ + sodp->sod_state &= SOD_WAKE_CLR; + sodp->sod_state |= SOD_WAKE_NEED; + + /* Break the mblk chain if neccessary. */ + if (mp1 != NULL && lmp != NULL) { + mp->b_next = mp1; + lmp->b_cont = NULL; + } + } + } + return (mp1); +} + +/* + * This function is called on a mblk that thas been successfully uioamoved(). + */ +void +sod_uioa_mblk_done(sodirect_t *sodp, mblk_t *bp) +{ + if (bp != NULL && (bp->b_datap->db_flags & DBLK_UIOA)) { + /* + * A uioa flaged mblk_t chain, already uio processed, + * add it to the sodirect uioa pending free list. + * + * Note, a b_cont chain headed by a DBLK_UIOA enable + * mblk_t must have all mblk_t(s) DBLK_UIOA enabled. + */ + mblk_t *bpt = sodp->sod_uioaft; + + ASSERT(sodp != NULL); + + /* + * Add first mblk_t of "bp" chain to current sodirect uioa + * free list tail mblk_t, if any, else empty list so new head. + */ + if (bpt == NULL) + sodp->sod_uioafh = bp; + else + bpt->b_cont = bp; + + /* + * Walk mblk_t "bp" chain to find tail and adjust rptr of + * each to reflect that uioamove() has consumed all data. 
+ */ + bpt = bp; + for (;;) { + ASSERT(bpt->b_datap->db_flags & DBLK_UIOA); + + bpt->b_rptr = bpt->b_wptr; + if (bpt->b_cont == NULL) + break; + bpt = bpt->b_cont; + } + /* New sodirect uioa free list tail */ + sodp->sod_uioaft = bpt; + + /* Only dequeue once with data returned per uioa_t */ + if (sodp->sod_uioa.uioa_state & UIOA_ENABLED) { + sodp->sod_uioa.uioa_state &= UIOA_CLR; + sodp->sod_uioa.uioa_state |= UIOA_FINI; + } + } +} + +/* + * When transit from UIOA_INIT state to UIOA_ENABLE state in recvmsg(), call + * this function on a non-STREAMS socket to schedule uioamove() on the data + * that has already queued in this socket. + */ +void +sod_uioa_so_init(struct sonode *so, struct sodirect_s *sodp, struct uio *uiop) +{ + uioa_t *uioap = (uioa_t *)uiop; + mblk_t *lbp; + mblk_t *wbp; + mblk_t *bp; + int len; + int error; + boolean_t in_rcv_q = B_TRUE; + + ASSERT(MUTEX_HELD(sodp->sod_lockp)); + ASSERT(&sodp->sod_uioa == uioap); + + /* + * Walk first b_cont chain in sod_q + * and schedule any M_DATA mblk_t's for uio asynchronous move. 
+	 */
+	bp = so->so_rcv_q_head;
+
+again:
+	/* Walk the chain */
+	lbp = NULL;
+	wbp = bp;
+
+	do {
+		if (bp == NULL)
+			break;
+
+		if (wbp->b_datap->db_type != M_DATA) {
+			/* Not M_DATA, no more uioa */
+			goto nouioa;
+		}
+		if ((len = wbp->b_wptr - wbp->b_rptr) > 0) {
+			/* Have an M_DATA mblk_t with data */
+			if (len > uioap->uio_resid || (so->so_oobmark > 0 &&
+			    len + uioap->uioa_mbytes >= so->so_oobmark)) {
+				/* Not enough uio space, or beyond oobmark */
+				goto nouioa;
+			}
+			ASSERT(!(wbp->b_datap->db_flags & DBLK_UIOA));
+			error = uioamove(wbp->b_rptr, len,
+			    UIO_READ, uioap);
+			if (!error) {
+				/* Scheduled, mark dblk_t as such */
+				wbp->b_datap->db_flags |= DBLK_UIOA;
+			} else {
+				/* Break the mblk chain */
+				goto nouioa;
+			}
+		}
+		/* Save last wbp processed */
+		lbp = wbp;
+	} while ((wbp = wbp->b_cont) != NULL);
+
+	if (in_rcv_q && (bp == NULL || bp->b_next == NULL)) {
+		/*
+		 * We get here only once to process the sonode dump area
+		 * if so_rcv_q_head is NULL or all the mblks have been
+		 * successfully uioamove()ed.
+		 */
+		in_rcv_q = B_FALSE;
+
+		/* move to dump area */
+		bp = so->so_rcv_head;
+		goto again;
+	}
+
+	return;
+
+nouioa:
+	/* No more uioa */
+	uioap->uioa_state &= UIOA_CLR;
+	uioap->uioa_state |= UIOA_FINI;
+
+	/*
+	 * If we processed 1 or more mblk_t(s) then we need to split the
+	 * current mblk_t chain in 2 so that all the uioamove()ed mblk_t(s)
+	 * are in the current chain and the rest are in the following new
+	 * chain.
+	 */
+	if (lbp != NULL) {
+		/* New end of current chain */
+		lbp->b_cont = NULL;
+
+		/* Insert new chain wbp after bp */
+		if ((wbp->b_next = bp->b_next) == NULL) {
+			/*
+			 * No need to grab so_lock, since sod_lockp
+			 * points to so_lock.
+			 */
+			if (in_rcv_q)
+				so->so_rcv_q_last_head = wbp;
+			else
+				so->so_rcv_last_head = wbp;
+		}
+		bp->b_next = wbp;
+		bp->b_next->b_prev = bp->b_prev;
+		bp->b_prev = lbp;
+	}
+}
+
+/*
+ * Initialize sodirect data structures on a socket.
+ */ +void +sod_sock_init(struct sonode *so, struct stdata *stp, sod_enq_func enq_func, + sod_wakeup_func wake_func, kmutex_t *lockp) +{ + sodirect_t *sodp; + + ASSERT(so->so_direct == NULL); + + so->so_state |= SS_SODIRECT; + + sodp = kmem_cache_alloc(sock_sod_cache, KM_SLEEP); + sodp->sod_state = SOD_ENABLED | SOD_WAKE_NOT; + sodp->sod_want = 0; + sodp->sod_q = (stp != NULL) ? RD(stp->sd_wrq) : NULL; + sodp->sod_enqueue = enq_func; + sodp->sod_wakeup = wake_func; + sodp->sod_uioafh = NULL; + sodp->sod_uioaft = NULL; + sodp->sod_lockp = lockp; + /* + * Remainder of the sod_uioa members are left uninitialized + * but will be initialized later by uioainit() before uioa + * is enabled. + */ + sodp->sod_uioa.uioa_state = UIOA_ALLOC; + so->so_direct = sodp; + if (stp != NULL) + stp->sd_sodirect = sodp; +} + +/* + * Init the sodirect kmem cache while sockfs is loading. + */ +void +sod_init() +{ + /* Allocate sodirect_t kmem_cache */ + sock_sod_cache = kmem_cache_create("sock_sod_cache", + sizeof (sodirect_t), 0, NULL, NULL, NULL, NULL, NULL, 0); +} + +ssize_t +sod_uioa_mblk(struct sonode *so, mblk_t *mp) +{ + sodirect_t *sodp = so->so_direct; + + ASSERT(sodp != NULL); + ASSERT(MUTEX_HELD(sodp->sod_lockp)); + + ASSERT(sodp->sod_state & SOD_ENABLED); + ASSERT(sodp->sod_uioa.uioa_state != (UIOA_ALLOC|UIOA_INIT)); + + ASSERT(sodp->sod_uioa.uioa_state & (UIOA_ENABLED|UIOA_FINI)); + + if (mp == NULL && so->so_rcv_q_head != NULL) { + mp = so->so_rcv_q_head; + ASSERT(mp->b_prev != NULL); + mp->b_prev = NULL; + so->so_rcv_q_head = mp->b_next; + if (so->so_rcv_q_head == NULL) { + so->so_rcv_q_last_head = NULL; + } + mp->b_next = NULL; + } + + sod_uioa_mblk_done(sodp, mp); + + if (so->so_rcv_q_head == NULL && so->so_rcv_head != NULL && + DB_TYPE(so->so_rcv_head) == M_DATA && + (DB_FLAGS(so->so_rcv_head) & DBLK_UIOA)) { + /* more arrived */ + ASSERT(so->so_rcv_q_head == NULL); + mp = so->so_rcv_head; + so->so_rcv_head = mp->b_next; + if (so->so_rcv_head == NULL) + 
so->so_rcv_last_head = NULL; + mp->b_prev = mp->b_next = NULL; + sod_uioa_mblk_done(sodp, mp); + } + +#ifdef DEBUG + if (so->so_rcv_q_head != NULL) { + mblk_t *m = so->so_rcv_q_head; + while (m != NULL) { + if (DB_FLAGS(m) & DBLK_UIOA) { + cmn_err(CE_PANIC, "Unexpected I/OAT mblk %p" + " in so_rcv_q_head.\n", (void *)m); + } + m = m->b_next; + } + } + if (so->so_rcv_head != NULL) { + mblk_t *m = so->so_rcv_head; + while (m != NULL) { + if (DB_FLAGS(m) & DBLK_UIOA) { + cmn_err(CE_PANIC, "Unexpected I/OAT mblk %p" + " in so_rcv_head.\n", (void *)m); + } + m = m->b_next; + } + } +#endif + return (sodp->sod_uioa.uioa_mbytes); +} diff --git a/usr/src/uts/common/fs/sockfs/sockcommon.h b/usr/src/uts/common/fs/sockfs/sockcommon.h new file mode 100644 index 0000000000..fb4512c874 --- /dev/null +++ b/usr/src/uts/common/fs/sockfs/sockcommon.h @@ -0,0 +1,246 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SOCKCOMMON_H_ +#define _SOCKCOMMON_H_ + +#pragma ident "@(#)sockcommon.h 1.1 07/06/14 SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/filio.h> +#include <sys/socket_proto.h> + +struct sonode; + +extern kmem_cache_t *socket_cache; + +/* + * Socket access functions + * + * The following functions should only be used by sockfs, and are common + * functions that can be used both by kernel sockets (i.e., no file + * descriptors should ever be expected, or created), and to implement + * the socket system calls. + */ +extern struct sonode *socket_create(int, int, int, char *, char *, int, int, + struct cred *, int *); +extern struct sonode *socket_newconn(struct sonode *, sock_lower_handle_t, + sock_downcalls_t *, int, int *); +extern int socket_bind(struct sonode *, struct sockaddr *, socklen_t, int, + struct cred *); +extern int socket_accept(struct sonode *, int, struct cred *, struct sonode **); +extern int socket_listen(struct sonode *, int, struct cred *); +extern int socket_connect(struct sonode *, const struct sockaddr *, + socklen_t, int, int, struct cred *); +extern int socket_getpeername(struct sonode *, struct sockaddr *, socklen_t *, + boolean_t, struct cred *); +extern int socket_getsockname(struct sonode *, struct sockaddr *, socklen_t *, + struct cred *); +extern int socket_shutdown(struct sonode *, int, struct cred *); +extern int socket_getsockopt(struct sonode *, int, int, void *, socklen_t *, + int, struct cred *); +extern int socket_setsockopt(struct sonode *, int, int, const void *, + socklen_t, struct cred *); +extern int socket_recvmsg(struct sonode *, struct nmsghdr *, struct uio *, + struct cred *); +extern int socket_sendmsg(struct sonode *, struct nmsghdr *, struct uio *, + struct cred *); +extern int socket_sendmblk(struct sonode *, struct nmsghdr *, int, + struct cred *, mblk_t **); +extern int socket_ioctl(struct sonode *, int, intptr_t, int, struct cred *, + int32_t *); +extern int socket_poll(struct 
sonode *, short, int, short *, + struct pollhead **); +extern int socket_close(struct sonode *, int, struct cred *); +extern void socket_destroy(struct sonode *); + +/* + * Cancel the socket push timer. + */ +#define SOCKET_TIMER_CANCEL(so) { \ + timeout_id_t tid; \ + \ + ASSERT(MUTEX_HELD(&(so)->so_lock)); \ + if ((so)->so_rcv_timer_tid != 0) { \ + tid = (so)->so_rcv_timer_tid; \ + (so)->so_rcv_timer_tid = 0; \ + mutex_exit(&(so)->so_lock); \ + \ + (void) untimeout(tid); \ + \ + mutex_enter(&(so)->so_lock); \ + } \ +} + +#define SOCKET_TIMER_START(so) { \ + ASSERT(MUTEX_HELD(&(so)->so_lock)); \ + if ((so)->so_rcv_timer_interval != SOCKET_NO_RCVTIMER) { \ + (so)->so_rcv_timer_tid = timeout(so_timer_callback, \ + (so), MSEC_TO_TICK((so)->so_rcv_timer_interval)); \ + } \ +} + +/* Common sonode ops not support */ +extern int so_listen_notsupp(struct sonode *, int, struct cred *); +extern int so_accept_notsupp(struct sonode *, int, struct cred *, + struct sonode **); +extern int so_getpeername_notsupp(struct sonode *, struct sockaddr *, + socklen_t *, boolean_t, struct cred *); +extern int so_shutdown_notsupp(struct sonode *, int, struct cred *); +extern int so_sendmblk_notsupp(struct sonode *, struct nmsghdr *, + int, struct cred *, mblk_t **); + +/* Common sonode ops */ +extern int so_init(struct sonode *, struct sonode *, struct cred *, int); +extern int so_accept(struct sonode *, int, struct cred *, struct sonode **); +extern int so_bind(struct sonode *, struct sockaddr *, socklen_t, int, + struct cred *); +extern int so_listen(struct sonode *, int, struct cred *); +extern int so_connect(struct sonode *, const struct sockaddr *, + socklen_t, int, int, struct cred *); +extern int so_getsockopt(struct sonode *, int, int, void *, + socklen_t *, int, struct cred *); +extern int so_setsockopt(struct sonode *, int, int, const void *, + socklen_t, struct cred *); +extern int so_getpeername(struct sonode *, struct sockaddr *, + socklen_t *, boolean_t, struct cred *); 
+extern int so_getsockname(struct sonode *, struct sockaddr *, + socklen_t *, struct cred *); +extern int so_ioctl(struct sonode *, int, intptr_t, int, struct cred *, + int32_t *); +extern int so_poll(struct sonode *, short, int, short *, + struct pollhead **); +extern int so_sendmsg(struct sonode *, struct nmsghdr *, struct uio *, + struct cred *); +extern int so_sendmblk(struct sonode *, struct nmsghdr *, int, + struct cred *, mblk_t **); +extern int so_recvmsg(struct sonode *, struct nmsghdr *, struct uio *, + struct cred *); +extern int so_shutdown(struct sonode *, int, struct cred *); +extern int so_close(struct sonode *, int, struct cred *); + +extern int so_tpi_fallback(struct sonode *, struct cred *); + +/* Common upcalls */ +extern sock_upper_handle_t so_newconn(sock_upper_handle_t, + sock_lower_handle_t, sock_downcalls_t *, struct cred *, pid_t, + sock_upcalls_t **); +extern void so_set_prop(sock_upper_handle_t, + struct sock_proto_props *); +extern ssize_t so_queue_msg(sock_upper_handle_t, mblk_t *, size_t, int, + int *, boolean_t *); +extern void so_signal_oob(sock_upper_handle_t, ssize_t); + +extern void so_connected(sock_upper_handle_t, sock_connid_t, struct cred *, + pid_t); +extern int so_disconnected(sock_upper_handle_t, sock_connid_t, int); +extern void so_txq_full(sock_upper_handle_t, boolean_t); +extern void so_opctl(sock_upper_handle_t, sock_opctl_action_t, uintptr_t); +/* Common misc. 
functions */ + + /* accept queue */ +extern int so_acceptq_enqueue(struct sonode *, struct sonode *); +extern int so_acceptq_enqueue_locked(struct sonode *, struct sonode *); +extern int so_acceptq_dequeue(struct sonode *, boolean_t, + struct sonode **); +extern void so_acceptq_flush(struct sonode *); + + /* connect */ +extern int so_wait_connected(struct sonode *, boolean_t, sock_connid_t); + + /* send */ +extern int so_snd_wait_qnotfull(struct sonode *, boolean_t); +extern void so_snd_qfull(struct sonode *so); +extern void so_snd_qnotfull(struct sonode *so); + +extern int socket_chgpgrp(struct sonode *, pid_t); +extern void socket_sendsig(struct sonode *, int); +extern int so_dequeue_msg(struct sonode *, mblk_t **, struct uio *, + rval_t *, int); +extern void so_enqueue_msg(struct sonode *, mblk_t *, size_t); + +extern mblk_t *socopyinuio(uio_t *, ssize_t, size_t, ssize_t, size_t, int *); +extern mblk_t *socopyoutuio(mblk_t *, struct uio *, ssize_t, int *); + +extern boolean_t somsghasdata(mblk_t *); +extern void so_rcv_flush(struct sonode *); +extern int sorecvoob(struct sonode *, struct nmsghdr *, struct uio *, + int, boolean_t); + +extern void so_timer_callback(void *); + +extern struct sonode *socket_sonode_create(struct sockparams *, int, int, int, + int, int, int *, struct cred *); + +extern void socket_sonode_destroy(struct sonode *); +extern int socket_init_common(struct sonode *, struct sonode *, int flags, + struct cred *); +extern int socket_getopt_common(struct sonode *, int, int, void *, socklen_t *); +extern int socket_ioctl_common(struct sonode *, int, intptr_t, int, + struct cred *, int32_t *); +extern int socket_strioc_common(struct sonode *, int, intptr_t, int, + struct cred *, int32_t *); + +extern int so_zcopy_wait(struct sonode *); +extern int so_get_mod_version(struct sockparams *); + +/* Notification functions */ +extern void so_notify_connected(struct sonode *); +extern void so_notify_disconnecting(struct sonode *); +extern void 
so_notify_disconnected(struct sonode *, int); +extern void so_notify_writable(struct sonode *); +extern void so_notify_data(struct sonode *, size_t); +extern void so_notify_oobsig(struct sonode *); +extern void so_notify_oobdata(struct sonode *, boolean_t); +extern void so_notify_eof(struct sonode *); +extern void so_notify_newconn(struct sonode *); +extern void so_notify_shutdown(struct sonode *); +extern void so_notify_error(struct sonode *); + +/* Common sonode functions */ +extern int sonode_constructor(void *, void *, int); +extern void sonode_destructor(void *, void *); +extern void sonode_init(struct sonode *, struct sockparams *, + int, int, int, sonodeops_t *); +extern void sonode_fini(struct sonode *); + +/* + * Event flags to socket_sendsig(). + */ +#define SOCKETSIG_WRITE 0x1 +#define SOCKETSIG_READ 0x2 +#define SOCKETSIG_URG 0x4 + +extern sonodeops_t so_sonodeops; +extern sock_upcalls_t so_upcalls; + +#ifdef __cplusplus +} +#endif +#endif /* _SOCKCOMMON_H_ */ diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c new file mode 100644 index 0000000000..e8fc18552d --- /dev/null +++ b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c @@ -0,0 +1,1696 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "@(#)sockcommon_sops.c 1.1 07/06/14 SMI" + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysmacros.h> +#include <sys/debug.h> +#include <sys/cmn_err.h> + +#include <sys/stropts.h> +#include <sys/socket.h> +#include <sys/socketvar.h> + +#define _SUN_TPI_VERSION 2 +#include <sys/tihdr.h> +#include <sys/sockio.h> +#include <sys/sodirect.h> +#include <sys/kmem_impl.h> + +#include <sys/strsubr.h> +#include <sys/strsun.h> +#include <sys/ddi.h> +#include <netinet/in.h> +#include <inet/ip.h> + +#include <fs/sockfs/sockcommon.h> + +#include <sys/socket_proto.h> + +#include <fs/sockfs/socktpi_impl.h> +#include <sys/tihdr.h> +#include <fs/sockfs/nl7c.h> +#include <inet/kssl/ksslapi.h> + + +extern int xnet_skip_checks; +extern int xnet_check_print; + +static void so_queue_oob(sock_upper_handle_t, mblk_t *, size_t); + + +/*ARGSUSED*/ +int +so_accept_notsupp(struct sonode *lso, int fflag, + struct cred *cr, struct sonode **nsop) +{ + return (EOPNOTSUPP); +} + +/*ARGSUSED*/ +int +so_listen_notsupp(struct sonode *so, int backlog, struct cred *cr) +{ + return (EOPNOTSUPP); +} + +/*ARGSUSED*/ +int +so_getsockname_notsupp(struct sonode *so, struct sockaddr *sa, + socklen_t *len, struct cred *cr) +{ + return (EOPNOTSUPP); +} + +/*ARGSUSED*/ +int +so_getpeername_notsupp(struct sonode *so, struct sockaddr *addr, + socklen_t *addrlen, boolean_t accept, struct cred *cr) +{ + return (EOPNOTSUPP); +} + +/*ARGSUSED*/ +int +so_shutdown_notsupp(struct sonode *so, int how, struct cred *cr) +{ + return (EOPNOTSUPP); +} + +/*ARGSUSED*/ +int +so_sendmblk_notsupp(struct sonode *so, struct msghdr 
*msg, int fflag, + struct cred *cr, mblk_t **mpp) +{ + return (EOPNOTSUPP); +} + +/* + * Generic Socket Ops + */ + +/* ARGSUSED */ +int +so_init(struct sonode *so, struct sonode *pso, struct cred *cr, int flags) +{ + return (socket_init_common(so, pso, flags, cr)); +} + +int +so_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen, + int flags, struct cred *cr) +{ + int error; + + SO_BLOCK_FALLBACK(so, SOP_BIND(so, name, namelen, flags, cr)); + + ASSERT(flags == _SOBIND_XPG4_2 || flags == _SOBIND_SOCKBSD); + + /* X/Open requires this check */ + if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) { + if (xnet_check_print) { + printf("sockfs: X/Open bind state check " + "caused EINVAL\n"); + } + error = EINVAL; + goto done; + } + + /* + * a bind to a NULL address is interpreted as unbind. So just + * do the downcall. + */ + if (name == NULL) + goto dobind; + + switch (so->so_family) { + case AF_INET: + if ((size_t)namelen != sizeof (sin_t)) { + error = name->sa_family != so->so_family ? + EAFNOSUPPORT : EINVAL; + eprintsoline(so, error); + goto done; + } + + if ((flags & _SOBIND_XPG4_2) && + (name->sa_family != so->so_family)) { + /* + * This check has to be made for X/Open + * sockets however application failures have + * been observed when it is applied to + * all sockets. + */ + error = EAFNOSUPPORT; + eprintsoline(so, error); + goto done; + } + /* + * Force a zero sa_family to match so_family. + * + * Some programs like inetd(1M) don't set the + * family field. Other programs leave + * sin_family set to garbage - SunOS 4.X does + * not check the family field on a bind. + * We use the family field that + * was passed in to the socket() call. + */ + name->sa_family = so->so_family; + break; + + case AF_INET6: { +#ifdef DEBUG + sin6_t *sin6 = (sin6_t *)name; +#endif + if ((size_t)namelen != sizeof (sin6_t)) { + error = name->sa_family != so->so_family ? 
+ EAFNOSUPPORT : EINVAL; + eprintsoline(so, error); + goto done; + } + + if (name->sa_family != so->so_family) { + /* + * With IPv6 we require the family to match + * unlike in IPv4. + */ + error = EAFNOSUPPORT; + eprintsoline(so, error); + goto done; + } +#ifdef DEBUG + /* + * Verify that apps don't forget to clear + * sin6_scope_id etc + */ + if (sin6->sin6_scope_id != 0 && + !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) { + zcmn_err(getzoneid(), CE_WARN, + "bind with uninitialized sin6_scope_id " + "(%d) on socket. Pid = %d\n", + (int)sin6->sin6_scope_id, + (int)curproc->p_pid); + } + if (sin6->__sin6_src_id != 0) { + zcmn_err(getzoneid(), CE_WARN, + "bind with uninitialized __sin6_src_id " + "(%d) on socket. Pid = %d\n", + (int)sin6->__sin6_src_id, + (int)curproc->p_pid); + } +#endif /* DEBUG */ + + break; + } + default: + /* Just pass the request to the protocol */ + goto dobind; + } + + /* + * First we check if either NCA or KSSL has been enabled for + * the requested address, and if so, we fall back to TPI. + * If neither of those two services are enabled, then we just + * pass the request to the protocol. + * + * Note that KSSL can only be enabled on a socket if NCA is NOT + * enabled for that socket, hence the else-statement below. + */ + if (nl7c_enabled && ((so->so_family == AF_INET || + so->so_family == AF_INET6) && + nl7c_lookup_addr(name, namelen) != NULL)) { + /* + * NL7C is not supported in non-global zones, + * we enforce this restriction here. 
+ */ + if (so->so_zoneid == GLOBAL_ZONEID) { + /* NCA should be used, so fall back to TPI */ + error = so_tpi_fallback(so, cr); + SO_UNBLOCK_FALLBACK(so); + if (error) + return (error); + else + return (SOP_BIND(so, name, namelen, flags, cr)); + } + } else if (so->so_type == SOCK_STREAM) { + /* Check if KSSL has been configured for this address */ + kssl_ent_t ent; + kssl_endpt_type_t type; + struct T_bind_req bind_req; + mblk_t *mp; + + /* + * TODO: Check with KSSL team if we could add a function call + * that only queries whether KSSL is enabled for the given + * address. + */ + bind_req.PRIM_type = T_BIND_REQ; + bind_req.ADDR_length = namelen; + bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req); + mp = soallocproto2(&bind_req, sizeof (bind_req), + name, namelen, 0, _ALLOC_SLEEP); + + type = kssl_check_proxy(mp, so, &ent); + freemsg(mp); + + if (type != KSSL_NO_PROXY) { + /* + * KSSL has been configured for this address, so + * we must fall back to TPI. + */ + kssl_release_ent(ent, so, type); + error = so_tpi_fallback(so, cr); + SO_UNBLOCK_FALLBACK(so); + if (error) + return (error); + else + return (SOP_BIND(so, name, namelen, flags, cr)); + } + } + +dobind: + error = (*so->so_downcalls->sd_bind) + (so->so_proto_handle, name, namelen, cr); +done: + SO_UNBLOCK_FALLBACK(so); + + return (error); +} + +int +so_listen(struct sonode *so, int backlog, struct cred *cr) +{ + int error = 0; + + ASSERT(MUTEX_NOT_HELD(&so->so_lock)); + SO_BLOCK_FALLBACK(so, SOP_LISTEN(so, backlog, cr)); + + error = (*so->so_downcalls->sd_listen)(so->so_proto_handle, backlog, + cr); + + SO_UNBLOCK_FALLBACK(so); + + return (error); +} + + +int +so_connect(struct sonode *so, const struct sockaddr *name, + socklen_t namelen, int fflag, int flags, struct cred *cr) +{ + int error = 0; + sock_connid_t id; + + ASSERT(MUTEX_NOT_HELD(&so->so_lock)); + SO_BLOCK_FALLBACK(so, SOP_CONNECT(so, name, namelen, fflag, flags, cr)); + + /* + * If there is a pending error, return error + * This can happen if 
a non blocking operation caused an error. + */ + + if (so->so_error != 0) { + mutex_enter(&so->so_lock); + error = sogeterr(so, B_TRUE); + mutex_exit(&so->so_lock); + if (error != 0) + goto done; + } + + error = (*so->so_downcalls->sd_connect)(so->so_proto_handle, + name, namelen, &id, cr); + + if (error == EINPROGRESS) + error = so_wait_connected(so, fflag & (FNONBLOCK|FNDELAY), id); + +done: + SO_UNBLOCK_FALLBACK(so); + return (error); +} + +/*ARGSUSED*/ +int +so_accept(struct sonode *so, int fflag, struct cred *cr, struct sonode **nsop) +{ + int error = 0; + struct sonode *nso; + + *nsop = NULL; + + SO_BLOCK_FALLBACK(so, SOP_ACCEPT(so, fflag, cr, nsop)); + if ((so->so_state & SS_ACCEPTCONN) == 0) { + SO_UNBLOCK_FALLBACK(so); + return ((so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW) ? + EOPNOTSUPP : EINVAL); + } + + if ((error = so_acceptq_dequeue(so, (fflag & (FNONBLOCK|FNDELAY)), + &nso)) == 0) { + ASSERT(nso != NULL); + + /* finish the accept */ + error = (*so->so_downcalls->sd_accept)(so->so_proto_handle, + nso->so_proto_handle, (sock_upper_handle_t)nso, cr); + if (error != 0) { + (void) socket_close(nso, 0, cr); + socket_destroy(nso); + } else { + *nsop = nso; + } + } + + SO_UNBLOCK_FALLBACK(so); + return (error); +} + +int +so_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, + struct cred *cr) +{ + int error, flags; + boolean_t dontblock; + ssize_t orig_resid; + mblk_t *mp; + + SO_BLOCK_FALLBACK(so, SOP_SENDMSG(so, msg, uiop, cr)); + + flags = msg->msg_flags; + error = 0; + dontblock = (flags & MSG_DONTWAIT) || + (uiop->uio_fmode & (FNONBLOCK|FNDELAY)); + + if (!(flags & MSG_XPG4_2) && msg->msg_controllen != 0) { + /* + * Old way of passing fd's is not supported + */ + SO_UNBLOCK_FALLBACK(so); + return (EOPNOTSUPP); + } + + if ((so->so_mode & SM_ATOMIC) && + uiop->uio_resid > so->so_proto_props.sopp_maxpsz && + so->so_proto_props.sopp_maxpsz != -1) { + SO_UNBLOCK_FALLBACK(so); + return (EMSGSIZE); + } + + /* + * For atomic sends we 
will only do one iteration. + */ + do { + if (so->so_state & SS_CANTSENDMORE) { + error = EPIPE; + break; + } + + if (so->so_error != 0) { + mutex_enter(&so->so_lock); + error = sogeterr(so, B_TRUE); + mutex_exit(&so->so_lock); + if (error != 0) + break; + } + + /* + * Send down OOB messages even if the send path is being + * flow controlled (assuming the protocol supports OOB data). + */ + if (flags & MSG_OOB) { + if ((so->so_mode & SM_EXDATA) == 0) { + error = EOPNOTSUPP; + break; + } + } else if (so->so_snd_qfull) { + /* + * Need to wait until the protocol is ready to receive + * more data for transmission. + */ + if ((error = so_snd_wait_qnotfull(so, dontblock)) != 0) + break; + } + + /* + * Time to send data to the protocol. We either copy the + * data into mblks or pass the uio directly to the protocol. + * We decide what to do based on the available down calls. + */ + if (so->so_downcalls->sd_send_uio != NULL) { + error = (*so->so_downcalls->sd_send_uio) + (so->so_proto_handle, uiop, msg, cr); + if (error != 0) + break; + } else { + /* save the resid in case of failure */ + orig_resid = uiop->uio_resid; + + if ((mp = socopyinuio(uiop, + so->so_proto_props.sopp_maxpsz, + so->so_proto_props.sopp_wroff, + so->so_proto_props.sopp_maxblk, + so->so_proto_props.sopp_tail, &error)) == NULL) { + break; + } + ASSERT(uiop->uio_resid >= 0); + + error = (*so->so_downcalls->sd_send) + (so->so_proto_handle, mp, msg, cr); + if (error != 0) { + /* + * The send failed. We do not have to free the + * mblks, because that is the protocol's + * responsibility. However, uio_resid must + * remain accurate, so adjust that here. 
+ */ + uiop->uio_resid = orig_resid; + break; + } + } + } while (uiop->uio_resid > 0); + + SO_UNBLOCK_FALLBACK(so); + + return (error); +} + +int +so_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag, + struct cred *cr, mblk_t **mpp) +{ + int error; + boolean_t dontblock; + size_t size; + mblk_t *mp = *mpp; + + SO_BLOCK_FALLBACK(so, SOP_SENDMBLK(so, msg, fflag, cr, mpp)); + + error = 0; + dontblock = (msg->msg_flags & MSG_DONTWAIT) || + (fflag & (FNONBLOCK|FNDELAY)); + size = msgdsize(mp); + + if (so->so_downcalls->sd_send == NULL) { + SO_UNBLOCK_FALLBACK(so); + return (EOPNOTSUPP); + } + + if ((so->so_mode & SM_ATOMIC) && + size > so->so_proto_props.sopp_maxpsz && + so->so_proto_props.sopp_maxpsz != -1) { + SO_UNBLOCK_FALLBACK(so); + return (EMSGSIZE); + } + + while (mp != NULL) { + mblk_t *nmp, *last_mblk; + size_t mlen; + + if (so->so_state & SS_CANTSENDMORE) { + error = EPIPE; + break; + } + if (so->so_error != 0) { + mutex_enter(&so->so_lock); + error = sogeterr(so, B_TRUE); + mutex_exit(&so->so_lock); + if (error != 0) + break; + } + if (so->so_snd_qfull) { + /* + * Need to wait until the protocol is ready to receive + * more data for transmission. + */ + if ((error = so_snd_wait_qnotfull(so, dontblock)) != 0) + break; + } + + /* + * We only allow so_maxpsz of data to be sent down to + * the protocol at time. + */ + mlen = MBLKL(mp); + nmp = mp->b_cont; + last_mblk = mp; + while (nmp != NULL) { + mlen += MBLKL(nmp); + if (mlen > so->so_proto_props.sopp_maxpsz) { + last_mblk->b_cont = NULL; + break; + } + last_mblk = nmp; + nmp = nmp->b_cont; + } + + error = (*so->so_downcalls->sd_send) + (so->so_proto_handle, mp, msg, cr); + if (error != 0) { + /* + * The send failed. The protocol will free the mblks + * that were sent down. Let the caller deal with the + * rest. 
+ */ + *mpp = nmp; + break; + } + + *mpp = mp = nmp; + } + + SO_UNBLOCK_FALLBACK(so); + + return (error); +} + +int +so_shutdown(struct sonode *so, int how, struct cred *cr) +{ + int error; + + SO_BLOCK_FALLBACK(so, SOP_SHUTDOWN(so, how, cr)); + + /* + * SunOS 4.X has no check for datagram sockets. + * 5.X checks that it is connected (ENOTCONN) + * X/Open requires that we check the connected state. + */ + if (!(so->so_state & SS_ISCONNECTED)) { + if (!xnet_skip_checks) { + error = ENOTCONN; + if (xnet_check_print) { + printf("sockfs: X/Open shutdown check " + "caused ENOTCONN\n"); + } + } + goto done; + } + + error = ((*so->so_downcalls->sd_shutdown)(so->so_proto_handle, + how, cr)); + + /* + * Protocol agreed to shutdown. We need to flush the + * receive buffer if the receive side is being shutdown. + */ + if (error == 0 && how != SHUT_WR) { + mutex_enter(&so->so_lock); + /* wait for active reader to finish */ + (void) so_lock_read(so, 0); + + so_rcv_flush(so); + + so_unlock_read(so); + mutex_exit(&so->so_lock); + } + +done: + SO_UNBLOCK_FALLBACK(so); + return (error); +} + +int +so_getsockname(struct sonode *so, struct sockaddr *addr, + socklen_t *addrlen, struct cred *cr) +{ + int error; + + SO_BLOCK_FALLBACK(so, SOP_GETSOCKNAME(so, addr, addrlen, cr)); + + error = (*so->so_downcalls->sd_getsockname) + (so->so_proto_handle, addr, addrlen, cr); + + SO_UNBLOCK_FALLBACK(so); + return (error); +} + +int +so_getpeername(struct sonode *so, struct sockaddr *addr, + socklen_t *addrlen, boolean_t accept, struct cred *cr) +{ + int error; + + SO_BLOCK_FALLBACK(so, SOP_GETPEERNAME(so, addr, addrlen, accept, cr)); + + if (accept) { + error = (*so->so_downcalls->sd_getpeername) + (so->so_proto_handle, addr, addrlen, cr); + } else if (!(so->so_state & SS_ISCONNECTED)) { + error = ENOTCONN; + } else if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) { + /* Added this check for X/Open */ + error = EINVAL; + if (xnet_check_print) { + printf("sockfs: X/Open getpeername 
check => EINVAL\n"); + } + } else { + error = (*so->so_downcalls->sd_getpeername) + (so->so_proto_handle, addr, addrlen, cr); + } + + SO_UNBLOCK_FALLBACK(so); + return (error); +} + +int +so_getsockopt(struct sonode *so, int level, int option_name, + void *optval, socklen_t *optlenp, int flags, struct cred *cr) +{ + int error = 0; + + ASSERT(MUTEX_NOT_HELD(&so->so_lock)); + SO_BLOCK_FALLBACK(so, + SOP_GETSOCKOPT(so, level, option_name, optval, optlenp, flags, cr)); + + error = socket_getopt_common(so, level, option_name, optval, + optlenp); + if (error < 0) { + error = (*so->so_downcalls->sd_getsockopt) + (so->so_proto_handle, level, option_name, optval, optlenp, + cr); + if (error == ENOPROTOOPT) { + if (level == SOL_SOCKET) { + /* + * If a protocol does not support a particular + * socket option, set can fail (not allowed) + * but get can not fail. This is the previous + * sockfs bahvior. + */ + switch (option_name) { + case SO_LINGER: + if (*optlenp < (t_uscalar_t) + sizeof (struct linger)) { + error = EINVAL; + break; + } + error = 0; + bzero(optval, sizeof (struct linger)); + *optlenp = sizeof (struct linger); + break; + case SO_RCVTIMEO: + case SO_SNDTIMEO: + if (*optlenp < (t_uscalar_t) + sizeof (struct timeval)) { + error = EINVAL; + break; + } + error = 0; + bzero(optval, sizeof (struct timeval)); + *optlenp = sizeof (struct timeval); + break; + case SO_SND_BUFINFO: + if (*optlenp < (t_uscalar_t) + sizeof (struct so_snd_bufinfo)) { + error = EINVAL; + break; + } + error = 0; + bzero(optval, + sizeof (struct so_snd_bufinfo)); + *optlenp = + sizeof (struct so_snd_bufinfo); + break; + case SO_DEBUG: + case SO_REUSEADDR: + case SO_KEEPALIVE: + case SO_DONTROUTE: + case SO_BROADCAST: + case SO_USELOOPBACK: + case SO_OOBINLINE: + case SO_DGRAM_ERRIND: + case SO_SNDBUF: + case SO_RCVBUF: + error = 0; + *((int32_t *)optval) = 0; + *optlenp = sizeof (int32_t); + break; + default: + break; + } + } + } + } + + SO_UNBLOCK_FALLBACK(so); + return (error); +} + +int 
+so_setsockopt(struct sonode *so, int level, int option_name, + const void *optval, socklen_t optlen, struct cred *cr) +{ + int error = 0; + + SO_BLOCK_FALLBACK(so, + SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr)); + + /* X/Open requires this check */ + if (so->so_state & SS_CANTSENDMORE && !xnet_skip_checks) { + SO_UNBLOCK_FALLBACK(so); + if (xnet_check_print) + printf("sockfs: X/Open setsockopt check => EINVAL\n"); + return (EINVAL); + } + + if (level == SOL_SOCKET && + ((option_name == SO_RCVTIMEO) || (option_name == SO_SNDTIMEO))) { + struct timeval *tl = (struct timeval *)optval; + clock_t t_usec; + + if (optlen != (t_uscalar_t)sizeof (struct timeval)) { + SO_UNBLOCK_FALLBACK(so); + return (EINVAL); + } + t_usec = tl->tv_sec * 1000 * 1000 + tl->tv_usec; + mutex_enter(&so->so_lock); + if (option_name == SO_RCVTIMEO) + so->so_rcvtimeo = drv_usectohz(t_usec); + else + so->so_sndtimeo = drv_usectohz(t_usec); + mutex_exit(&so->so_lock); + SO_UNBLOCK_FALLBACK(so); + return (0); + } + error = (*so->so_downcalls->sd_setsockopt) + (so->so_proto_handle, level, option_name, optval, optlen, cr); + + SO_UNBLOCK_FALLBACK(so); + return (error); +} + +int +so_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode, + struct cred *cr, int32_t *rvalp) +{ + int error = 0; + + SO_BLOCK_FALLBACK(so, SOP_IOCTL(so, cmd, arg, mode, cr, rvalp)); + + /* + * If there is a pending error, return error + * This can happen if a non blocking operation caused an error. + */ + if (so->so_error != 0) { + mutex_enter(&so->so_lock); + error = sogeterr(so, B_TRUE); + mutex_exit(&so->so_lock); + if (error != 0) + goto done; + } + + /* + * calling strioc can result in the socket falling back to TPI, + * if that is supported. 
+ */ + if ((error = socket_ioctl_common(so, cmd, arg, mode, cr, rvalp)) < 0 && + (error = socket_strioc_common(so, cmd, arg, mode, cr, rvalp)) < 0) { + error = (*so->so_downcalls->sd_ioctl)(so->so_proto_handle, + cmd, arg, mode, rvalp, cr); + } + +done: + SO_UNBLOCK_FALLBACK(so); + + return (error); +} + +int +so_poll(struct sonode *so, short events, int anyyet, short *reventsp, + struct pollhead **phpp) +{ + int state = so->so_state; + *reventsp = 0; + + if (so->so_error != 0 && + ((POLLIN|POLLRDNORM|POLLOUT) & events) != 0) { + *reventsp = (POLLIN|POLLRDNORM|POLLOUT) & events; + return (0); + } + + /* + * As long as there is buffer to send data, and the socket is + * in a state where it can send data (i.e., connected for + * connection oriented protocols), then turn on POLLOUT events + */ + if (!so->so_snd_qfull && ((so->so_mode & SM_CONNREQUIRED) == 0 || + state & SS_ISCONNECTED)) { + *reventsp |= POLLOUT & events; + } + + /* + * Turn on POLLIN whenever there is data on the receive queue, + * or the socket is in a state where no more data will be received. + * Also, if the socket is accepting connections, flip the bit if + * there is something on the queue. 
+ */ + + /* Pending connections */ + if (so->so_acceptq_len > 0) + *reventsp |= (POLLIN|POLLRDNORM) & events; + + /* Data */ + /* so_downcalls is null for sctp */ + if (so->so_downcalls != NULL && so->so_downcalls->sd_poll != NULL) { + *reventsp |= (*so->so_downcalls->sd_poll) + (so->so_proto_handle, events & SO_PROTO_POLLEV, anyyet, + CRED()) & events; + ASSERT((*reventsp & ~events) == 0); + /* do not recheck events */ + events &= ~SO_PROTO_POLLEV; + } else { + if (SO_HAVE_DATA(so)) + *reventsp |= (POLLIN|POLLRDNORM) & events; + + /* Urgent data */ + if ((state & SS_OOBPEND) != 0) + *reventsp |= (POLLRDBAND) & events; + } + + if (!*reventsp && !anyyet) { + /* Check for read events again, but this time under lock */ + if (events & (POLLIN|POLLRDNORM)) { + mutex_enter(&so->so_lock); + if (SO_HAVE_DATA(so) || so->so_acceptq_len > 0) { + mutex_exit(&so->so_lock); + *reventsp |= (POLLIN|POLLRDNORM) & events; + return (0); + } else { + so->so_pollev |= SO_POLLEV_IN; + mutex_exit(&so->so_lock); + } + } + *phpp = &so->so_poll_list; + } + return (0); +} + +/* + * Generic Upcalls + */ +void +so_connected(sock_upper_handle_t sock_handle, sock_connid_t id, + cred_t *peer_cred, pid_t peer_cpid) +{ + struct sonode *so = (struct sonode *)sock_handle; + + mutex_enter(&so->so_lock); + ASSERT(so->so_proto_handle != NULL); + + if (peer_cred != NULL) { + if (so->so_peercred != NULL) + crfree(so->so_peercred); + crhold(peer_cred); + so->so_peercred = peer_cred; + so->so_cpid = peer_cpid; + } + + so->so_proto_connid = id; + soisconnected(so); + /* + * Wake ones who're waiting for conn to become established. 
+ */ + so_notify_connected(so); +} + +int +so_disconnected(sock_upper_handle_t sock_handle, sock_connid_t id, int error) +{ + struct sonode *so = (struct sonode *)sock_handle; + + mutex_enter(&so->so_lock); + + so->so_proto_connid = id; + soisdisconnected(so, error); + so_notify_disconnected(so, error); + + return (0); +} + +void +so_opctl(sock_upper_handle_t sock_handle, sock_opctl_action_t action, + uintptr_t arg) +{ + struct sonode *so = (struct sonode *)sock_handle; + + switch (action) { + case SOCK_OPCTL_SHUT_SEND: + mutex_enter(&so->so_lock); + socantsendmore(so); + so_notify_disconnecting(so); + break; + case SOCK_OPCTL_SHUT_RECV: { + mutex_enter(&so->so_lock); + socantrcvmore(so); + so_notify_eof(so); + break; + } + case SOCK_OPCTL_ENAB_ACCEPT: + mutex_enter(&so->so_lock); + so->so_state |= SS_ACCEPTCONN; + so->so_backlog = (unsigned int)arg; + mutex_exit(&so->so_lock); + break; + default: + ASSERT(0); + break; + } +} + +void +so_txq_full(sock_upper_handle_t sock_handle, boolean_t qfull) +{ + struct sonode *so = (struct sonode *)sock_handle; + + if (qfull) { + so_snd_qfull(so); + } else { + so_snd_qnotfull(so); + mutex_enter(&so->so_lock); + so_notify_writable(so); + } +} + +sock_upper_handle_t +so_newconn(sock_upper_handle_t parenthandle, + sock_lower_handle_t proto_handle, sock_downcalls_t *sock_downcalls, + struct cred *peer_cred, pid_t peer_cpid, sock_upcalls_t **sock_upcallsp) +{ + struct sonode *so = (struct sonode *)parenthandle; + struct sonode *nso; + int error; + + ASSERT(proto_handle != NULL); + + if ((so->so_state & SS_ACCEPTCONN) == 0 || + so->so_acceptq_len >= so->so_backlog) + return (NULL); + + nso = socket_newconn(so, proto_handle, sock_downcalls, SOCKET_NOSLEEP, + &error); + if (nso == NULL) + return (NULL); + + if (peer_cred != NULL) { + crhold(peer_cred); + nso->so_peercred = peer_cred; + nso->so_cpid = peer_cpid; + } + + (void) so_acceptq_enqueue(so, nso); + mutex_enter(&so->so_lock); + so_notify_newconn(so); + + *sock_upcallsp = 
&so_upcalls; + + return ((sock_upper_handle_t)nso); +} + +void +so_set_prop(sock_upper_handle_t sock_handle, struct sock_proto_props *soppp) +{ + struct sonode *so; + + so = (struct sonode *)sock_handle; + + mutex_enter(&so->so_lock); + + if (soppp->sopp_flags & SOCKOPT_MAXBLK) + so->so_proto_props.sopp_maxblk = soppp->sopp_maxblk; + if (soppp->sopp_flags & SOCKOPT_WROFF) + so->so_proto_props.sopp_wroff = soppp->sopp_wroff; + if (soppp->sopp_flags & SOCKOPT_TAIL) + so->so_proto_props.sopp_tail = soppp->sopp_tail; + if (soppp->sopp_flags & SOCKOPT_RCVHIWAT) + so->so_proto_props.sopp_rxhiwat = soppp->sopp_rxhiwat; + if (soppp->sopp_flags & SOCKOPT_RCVLOWAT) + so->so_proto_props.sopp_rxlowat = soppp->sopp_rxlowat; + if (soppp->sopp_flags & SOCKOPT_MAXPSZ) + so->so_proto_props.sopp_maxpsz = soppp->sopp_maxpsz; + if (soppp->sopp_flags & SOCKOPT_MINPSZ) + so->so_proto_props.sopp_minpsz = soppp->sopp_minpsz; + if (soppp->sopp_flags & SOCKOPT_ZCOPY) { + if (soppp->sopp_zcopyflag & ZCVMSAFE) { + so->so_proto_props.sopp_zcopyflag |= STZCVMSAFE; + so->so_proto_props.sopp_zcopyflag &= ~STZCVMUNSAFE; + } else if (soppp->sopp_zcopyflag & ZCVMUNSAFE) { + so->so_proto_props.sopp_zcopyflag |= STZCVMUNSAFE; + so->so_proto_props.sopp_zcopyflag &= ~STZCVMSAFE; + } + + if (soppp->sopp_zcopyflag & COPYCACHED) { + so->so_proto_props.sopp_zcopyflag |= STRCOPYCACHED; + } + } + if (soppp->sopp_flags & SOCKOPT_OOBINLINE) + so->so_proto_props.sopp_oobinline = soppp->sopp_oobinline; + if (soppp->sopp_flags & SOCKOPT_RCVTIMER) + so->so_proto_props.sopp_rcvtimer = soppp->sopp_rcvtimer; + if (soppp->sopp_flags & SOCKOPT_RCVTHRESH) + so->so_proto_props.sopp_rcvthresh = soppp->sopp_rcvthresh; + if (soppp->sopp_flags & SOCKOPT_MAXADDRLEN) + so->so_proto_props.sopp_maxaddrlen = soppp->sopp_maxaddrlen; + + mutex_exit(&so->so_lock); + +#ifdef DEBUG + soppp->sopp_flags &= ~(SOCKOPT_MAXBLK | SOCKOPT_WROFF | SOCKOPT_TAIL | + SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | SOCKOPT_MAXPSZ | + SOCKOPT_ZCOPY | 
SOCKOPT_OOBINLINE | SOCKOPT_RCVTIMER | + SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ); + ASSERT(soppp->sopp_flags == 0); +#endif +} + +/* ARGSUSED */ +ssize_t +so_queue_msg(sock_upper_handle_t sock_handle, mblk_t *mp, + size_t msg_size, int flags, int *errorp, boolean_t *force_pushp) +{ + struct sonode *so = (struct sonode *)sock_handle; + boolean_t force_push = B_TRUE; + int space_left; + sodirect_t *sodp = so->so_direct; + + ASSERT(errorp != NULL); + *errorp = 0; + if (mp == NULL) { + if (msg_size > 0) { + ASSERT(so->so_downcalls->sd_recv_uio != NULL); + mutex_enter(&so->so_lock); + /* the notify functions will drop the lock */ + if (flags & MSG_OOB) + so_notify_oobdata(so, IS_SO_OOB_INLINE(so)); + else + so_notify_data(so, msg_size); + return (0); + } + /* + * recv space check + */ + mutex_enter(&so->so_lock); + space_left = so->so_rcvbuf - so->so_rcv_queued; + if (space_left <= 0) { + so->so_flowctrld = B_TRUE; + *errorp = ENOSPC; + space_left = -1; + } + goto done_unlock; + } + + ASSERT(mp->b_next == NULL); + ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_PROTO); + ASSERT(msg_size == msgdsize(mp)); + + if (flags & MSG_OOB) { + so_queue_oob(sock_handle, mp, msg_size); + return (0); + } + + if (force_pushp != NULL) + force_push = *force_pushp; + + if (DB_TYPE(mp) == M_PROTO && !__TPI_PRIM_ISALIGNED(mp->b_rptr)) { + /* The read pointer is not aligned correctly for TPI */ + zcmn_err(getzoneid(), CE_WARN, + "sockfs: Unaligned TPI message received. 
rptr = %p\n", + (void *)mp->b_rptr); + freemsg(mp); + /* + * so_direct is optional (sodp is NULL-checked further down in + * this function), so only take sod_lockp when a sodirect_t is + * actually attached to the socket. + */ + if (sodp != NULL) { + mutex_enter(sodp->sod_lockp); + SOD_UIOAFINI(sodp); + mutex_exit(sodp->sod_lockp); + } + + return (so->so_rcvbuf - so->so_rcv_queued); + } + + mutex_enter(&so->so_lock); + if (so->so_state & (SS_FALLBACK_PENDING | SS_FALLBACK_COMP)) { + /* + * NOTE(review): mp is not freed on this path -- presumably the + * caller keeps ownership when EOPNOTSUPP is returned; confirm. + */ + SOD_DISABLE(sodp); + mutex_exit(&so->so_lock); + *errorp = EOPNOTSUPP; + return (-1); + } + if (so->so_state & SS_CANTRCVMORE) { + freemsg(mp); + SOD_DISABLE(sodp); + mutex_exit(&so->so_lock); + return (0); + } + + /* process the mblk via I/OAT if capable */ + if (sodp != NULL && (sodp->sod_state & SOD_ENABLED)) { + if (DB_TYPE(mp) == M_DATA) { + (void) sod_uioa_mblk_init(sodp, mp, msg_size); + } else { + SOD_UIOAFINI(sodp); + } + } + + if (mp->b_next == NULL) { + so_enqueue_msg(so, mp, msg_size); + } else { + do { + mblk_t *nmp; + + if ((nmp = mp->b_next) != NULL) { + mp->b_next = NULL; + } + so_enqueue_msg(so, mp, msgdsize(mp)); + mp = nmp; + } while (mp != NULL); + } + + /* Report remaining receive space; -1 plus ENOSPC means flow controlled */ + space_left = so->so_rcvbuf - so->so_rcv_queued; + if (space_left <= 0) { + so->so_flowctrld = B_TRUE; + *errorp = ENOSPC; + space_left = -1; + } + + if (force_push || so->so_rcv_queued >= so->so_rcv_thresh || + so->so_rcv_queued >= so->so_rcv_wanted || + (sodp != NULL && so->so_rcv_queued >= sodp->sod_want)) { + SOCKET_TIMER_CANCEL(so); + /* + * so_notify_data will release the lock + */ + so_notify_data(so, so->so_rcv_queued); + + if (force_pushp != NULL) + *force_pushp = B_TRUE; + goto done; + } else if (so->so_rcv_timer_tid == 0) { + /* Make sure the recv push timer is running */ + SOCKET_TIMER_START(so); + } + +done_unlock: + mutex_exit(&so->so_lock); +done: + return (space_left); +} + +/* + * Set the offset of where the oob data is relative to the bytes in + * queued. 
Also generate SIGURG + */ +void +so_signal_oob(sock_upper_handle_t sock_handle, ssize_t offset) +{ + struct sonode *so; + + ASSERT(offset >= 0); + so = (struct sonode *)sock_handle; + mutex_enter(&so->so_lock); + SOD_UIOAFINI(so->so_direct); + + /* + * New urgent data on the way so forget about any old + * urgent data. + */ + so->so_state &= ~(SS_HAVEOOBDATA|SS_HADOOBDATA); + + /* + * Record that urgent data is pending. + */ + so->so_state |= SS_OOBPEND; + + if (so->so_oobmsg != NULL) { + dprintso(so, 1, ("sock: discarding old oob\n")); + freemsg(so->so_oobmsg); + so->so_oobmsg = NULL; + } + + /* + * set the offset where the urgent byte is + */ + so->so_oobmark = so->so_rcv_queued + offset; + if (so->so_oobmark == 0) + so->so_state |= SS_RCVATMARK; + else + so->so_state &= ~SS_RCVATMARK; + + so_notify_oobsig(so); +} + +/* + * Queue the OOB byte + */ +static void +so_queue_oob(sock_upper_handle_t sock_handle, mblk_t *mp, size_t len) +{ + struct sonode *so; + + so = (struct sonode *)sock_handle; + mutex_enter(&so->so_lock); + SOD_UIOAFINI(so->so_direct); + + ASSERT(mp != NULL); + if (!IS_SO_OOB_INLINE(so)) { + so->so_oobmsg = mp; + so->so_state |= SS_HAVEOOBDATA; + } else { + so_enqueue_msg(so, mp, len); + } + + so_notify_oobdata(so, IS_SO_OOB_INLINE(so)); +} + +int +so_close(struct sonode *so, int flag, struct cred *cr) +{ + int error; + + error = (*so->so_downcalls->sd_close)(so->so_proto_handle, flag, cr); + + /* + * At this point there will be no more upcalls from the protocol + */ + mutex_enter(&so->so_lock); + so_rcv_flush(so); + mutex_exit(&so->so_lock); + + return (error); +} + +void +so_zcopy_notify(sock_upper_handle_t sock_handle) +{ + struct sonode *so = (struct sonode *)sock_handle; + + mutex_enter(&so->so_lock); + so->so_copyflag |= STZCNOTIFY; + cv_broadcast(&so->so_copy_cv); + mutex_exit(&so->so_lock); +} + +void +so_set_error(sock_upper_handle_t sock_handle, int error) +{ + struct sonode *so = (struct sonode *)sock_handle; + + 
mutex_enter(&so->so_lock); + + soseterror(so, error); + + so_notify_error(so); +} + +/* + * so_recvmsg - read data from the socket + * + * There are two ways of obtaining data; either we ask the protocol to + * copy directly into the supplied buffer, or we copy data from the + * sonode's receive queue. The decision which one to use depends on + * whether the protocol has a sd_recv_uio down call. + */ +int +so_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, + struct cred *cr) +{ + rval_t rval; + int flags = 0; + t_uscalar_t controllen, namelen; + int error = 0; + int ret; + mblk_t *mctlp = NULL; + union T_primitives *tpr; + void *control; + ssize_t saved_resid; + struct uio *suiop; + + SO_BLOCK_FALLBACK(so, SOP_RECVMSG(so, msg, uiop, cr)); + + if ((so->so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 && + (so->so_mode & SM_CONNREQUIRED)) { + SO_UNBLOCK_FALLBACK(so); + return (ENOTCONN); + } + + if (msg->msg_flags & MSG_PEEK) + msg->msg_flags &= ~MSG_WAITALL; + + if (so->so_mode & SM_ATOMIC) + msg->msg_flags |= MSG_TRUNC; + + if (msg->msg_flags & MSG_OOB) { + if ((so->so_mode & SM_EXDATA) == 0) { + error = EOPNOTSUPP; + } else if (so->so_downcalls->sd_recv_uio != NULL) { + error = (*so->so_downcalls->sd_recv_uio) + (so->so_proto_handle, uiop, msg, cr); + } else { + error = sorecvoob(so, msg, uiop, msg->msg_flags, + IS_SO_OOB_INLINE(so)); + } + SO_UNBLOCK_FALLBACK(so); + return (error); + } + + /* + * If the protocol has the recv down call, then pass the request + * down. + */ + if (so->so_downcalls->sd_recv_uio != NULL) { + error = (*so->so_downcalls->sd_recv_uio) + (so->so_proto_handle, uiop, msg, cr); + SO_UNBLOCK_FALLBACK(so); + return (error); + } + + /* + * Reading data from the socket buffer + */ + flags = msg->msg_flags; + msg->msg_flags = 0; + + /* + * Set msg_controllen and msg_namelen to zero here to make it + * simpler in the cases that no control or name is returned. 
+ */ + controllen = msg->msg_controllen; + namelen = msg->msg_namelen; + msg->msg_controllen = 0; + msg->msg_namelen = 0; + + mutex_enter(&so->so_lock); + /* Set SOREADLOCKED */ + error = so_lock_read_intr(so, + uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0)); + mutex_exit(&so->so_lock); + if (error) { + SO_UNBLOCK_FALLBACK(so); + return (error); + } + + suiop = sod_rcv_init(so, flags, &uiop); +retry: + saved_resid = uiop->uio_resid; + error = so_dequeue_msg(so, &mctlp, uiop, &rval, flags); + if (error != 0) { + goto out; + } + /* + * For datagrams the MOREDATA flag is used to set MSG_TRUNC. + * For non-datagrams MOREDATA is used to set MSG_EOR. + */ + ASSERT(!(rval.r_val1 & MORECTL)); + if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC)) + msg->msg_flags |= MSG_TRUNC; + if (mctlp == NULL) { + dprintso(so, 1, ("so_recvmsg: got M_DATA\n")); + + mutex_enter(&so->so_lock); + /* Set MSG_EOR based on MOREDATA */ + if (!(rval.r_val1 & MOREDATA)) { + if (so->so_state & SS_SAVEDEOR) { + msg->msg_flags |= MSG_EOR; + so->so_state &= ~SS_SAVEDEOR; + } + } + /* + * If some data was received (i.e. not EOF) and the + * read/recv* has not been satisfied wait for some more. + */ + if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) && + uiop->uio_resid != saved_resid && uiop->uio_resid > 0) { + mutex_exit(&so->so_lock); + goto retry; + } + + goto out_locked; + } + /* strsock_proto has already verified length and alignment */ + tpr = (union T_primitives *)mctlp->b_rptr; + dprintso(so, 1, ("so_recvmsg: type %d\n", tpr->type)); + switch (tpr->type) { + case T_DATA_IND: { + /* + * Set msg_flags to MSG_EOR based on + * MORE_flag and MOREDATA. + */ + mutex_enter(&so->so_lock); + so->so_state &= ~SS_SAVEDEOR; + if (!(tpr->data_ind.MORE_flag & 1)) { + if (!(rval.r_val1 & MOREDATA)) + msg->msg_flags |= MSG_EOR; + else + so->so_state |= SS_SAVEDEOR; + } + freemsg(mctlp); + /* + * If some data was received (i.e. 
not EOF) and the + * read/recv* has not been satisfied wait for some more. + */ + if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) && + uiop->uio_resid != saved_resid && uiop->uio_resid > 0) { + mutex_exit(&so->so_lock); + goto retry; + } + goto out_locked; + } + case T_UNITDATA_IND: { + void *addr; + t_uscalar_t addrlen; + void *abuf; + t_uscalar_t optlen; + void *opt; + + if (namelen != 0) { + /* Caller wants source address */ + addrlen = tpr->unitdata_ind.SRC_length; + addr = sogetoff(mctlp, tpr->unitdata_ind.SRC_offset, + addrlen, 1); + if (addr == NULL) { + freemsg(mctlp); + error = EPROTO; + eprintsoline(so, error); + goto out; + } + ASSERT(so->so_family != AF_UNIX); + } + optlen = tpr->unitdata_ind.OPT_length; + if (optlen != 0) { + t_uscalar_t ncontrollen; + + /* + * Extract any source address option. + * Determine how large cmsg buffer is needed. + */ + opt = sogetoff(mctlp, tpr->unitdata_ind.OPT_offset, + optlen, __TPI_ALIGN_SIZE); + + if (opt == NULL) { + freemsg(mctlp); + error = EPROTO; + eprintsoline(so, error); + goto out; + } + if (so->so_family == AF_UNIX) + so_getopt_srcaddr(opt, optlen, &addr, &addrlen); + ncontrollen = so_cmsglen(mctlp, opt, optlen, + !(flags & MSG_XPG4_2)); + if (controllen != 0) + controllen = ncontrollen; + else if (ncontrollen != 0) + msg->msg_flags |= MSG_CTRUNC; + } else { + controllen = 0; + } + + if (namelen != 0) { + /* + * Return address to caller. + * Caller handles truncation if length + * exceeds msg_namelen. + * NOTE: AF_UNIX NUL termination is ensured by + * the sender's copyin_name(). + */ + abuf = kmem_alloc(addrlen, KM_SLEEP); + + bcopy(addr, abuf, addrlen); + msg->msg_name = abuf; + msg->msg_namelen = addrlen; + } + + if (controllen != 0) { + /* + * Return control msg to caller. + * Caller handles truncation if length + * exceeds msg_controllen. 
+ */ + control = kmem_zalloc(controllen, KM_SLEEP); + + error = so_opt2cmsg(mctlp, opt, optlen, + !(flags & MSG_XPG4_2), control, controllen); + if (error) { + freemsg(mctlp); + if (msg->msg_namelen != 0) + kmem_free(msg->msg_name, + msg->msg_namelen); + kmem_free(control, controllen); + eprintsoline(so, error); + goto out; + } + msg->msg_control = control; + msg->msg_controllen = controllen; + } + + freemsg(mctlp); + goto out; + } + case T_OPTDATA_IND: { + struct T_optdata_req *tdr; + void *opt; + t_uscalar_t optlen; + + tdr = (struct T_optdata_req *)mctlp->b_rptr; + optlen = tdr->OPT_length; + if (optlen != 0) { + t_uscalar_t ncontrollen; + /* + * Determine how large cmsg buffer is needed. + */ + opt = sogetoff(mctlp, + tpr->optdata_ind.OPT_offset, optlen, + __TPI_ALIGN_SIZE); + + if (opt == NULL) { + freemsg(mctlp); + error = EPROTO; + eprintsoline(so, error); + goto out; + } + + ncontrollen = so_cmsglen(mctlp, opt, optlen, + !(flags & MSG_XPG4_2)); + if (controllen != 0) + controllen = ncontrollen; + else if (ncontrollen != 0) + msg->msg_flags |= MSG_CTRUNC; + } else { + controllen = 0; + } + + if (controllen != 0) { + /* + * Return control msg to caller. + * Caller handles truncation if length + * exceeds msg_controllen. + */ + control = kmem_zalloc(controllen, KM_SLEEP); + + error = so_opt2cmsg(mctlp, opt, optlen, + !(flags & MSG_XPG4_2), control, controllen); + if (error) { + freemsg(mctlp); + kmem_free(control, controllen); + eprintsoline(so, error); + goto out; + } + msg->msg_control = control; + msg->msg_controllen = controllen; + } + + /* + * Set msg_flags to MSG_EOR based on + * DATA_flag and MOREDATA. + */ + mutex_enter(&so->so_lock); + so->so_state &= ~SS_SAVEDEOR; + if (!(tpr->data_ind.MORE_flag & 1)) { + if (!(rval.r_val1 & MOREDATA)) + msg->msg_flags |= MSG_EOR; + else + so->so_state |= SS_SAVEDEOR; + } + freemsg(mctlp); + /* + * If some data was received (i.e. not EOF) and the + * read/recv* has not been satisfied wait for some more. 
+ * Not possible to wait if control info was received. + */ + if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) && + controllen == 0 && + uiop->uio_resid != saved_resid && uiop->uio_resid > 0) { + mutex_exit(&so->so_lock); + goto retry; + } + goto out_locked; + } + default: + cmn_err(CE_CONT, "so_recvmsg bad type %x \n", + tpr->type); + freemsg(mctlp); + error = EPROTO; + ASSERT(0); + } +out: + mutex_enter(&so->so_lock); +out_locked: + /* The sod_lockp pointers to the sonode so_lock */ + ret = sod_rcv_done(so, suiop, uiop); + if (ret != 0 && error == 0) + error = ret; + + so_unlock_read(so); /* Clear SOREADLOCKED */ + mutex_exit(&so->so_lock); + + SO_UNBLOCK_FALLBACK(so); + + return (error); +} + +sonodeops_t so_sonodeops = { + so_init, /* sop_init */ + so_accept, /* sop_accept */ + so_bind, /* sop_bind */ + so_listen, /* sop_listen */ + so_connect, /* sop_connect */ + so_recvmsg, /* sop_recvmsg */ + so_sendmsg, /* sop_sendmsg */ + so_sendmblk, /* sop_sendmblk */ + so_getpeername, /* sop_getpeername */ + so_getsockname, /* sop_getsockname */ + so_shutdown, /* sop_shutdown */ + so_getsockopt, /* sop_getsockopt */ + so_setsockopt, /* sop_setsockopt */ + so_ioctl, /* sop_ioctl */ + so_poll, /* sop_poll */ + so_close, /* sop_close */ +}; + +sock_upcalls_t so_upcalls = { + so_newconn, + so_connected, + so_disconnected, + so_opctl, + so_queue_msg, + so_set_prop, + so_txq_full, + so_signal_oob, + so_zcopy_notify, + so_set_error +}; diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c new file mode 100644 index 0000000000..c1cfa6bf5f --- /dev/null +++ b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c @@ -0,0 +1,1970 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/signal.h> +#include <sys/cmn_err.h> + +#include <sys/stropts.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sockio.h> +#include <sys/sodirect.h> +#include <sys/strsubr.h> +#include <sys/strsun.h> +#include <sys/atomic.h> + +#include <fs/sockfs/sockcommon.h> +#include <fs/sockfs/socktpi.h> +#include <sys/ddi.h> +#include <inet/ip.h> +#include <sys/time.h> +#include <sys/cmn_err.h> + +#ifdef SOCK_TEST +extern int do_useracc; +extern clock_t sock_test_timelimit; +#endif /* SOCK_TEST */ + +#define MBLK_PULL_LEN 64 +uint32_t so_mblk_pull_len = MBLK_PULL_LEN; + +#ifdef DEBUG +boolean_t so_debug_length = B_FALSE; +static boolean_t so_check_length(sonode_t *so); +#endif + +int +so_acceptq_enqueue_locked(struct sonode *so, struct sonode *nso) +{ + ASSERT(MUTEX_HELD(&so->so_acceptq_lock)); + ASSERT(nso->so_acceptq_next == NULL); + + *so->so_acceptq_tail = nso; + so->so_acceptq_tail = &nso->so_acceptq_next; + so->so_acceptq_len++; + + /* Wakeup a single consumer */ + cv_signal(&so->so_acceptq_cv); + + return (so->so_acceptq_len); +} + +/* + * int so_acceptq_enqueue(struct sonode *so, struct sonode *nso) + * + * Enqueue an incoming connection on a listening socket. 
+ * + * Arguments: + * so - listening socket + * nso - new connection + * + * Returns: + * Number of queued connections, including the new connection + */ +int +so_acceptq_enqueue(struct sonode *so, struct sonode *nso) +{ + int conns; + + mutex_enter(&so->so_acceptq_lock); + conns = so_acceptq_enqueue_locked(so, nso); + mutex_exit(&so->so_acceptq_lock); + + return (conns); +} + +static int +so_acceptq_dequeue_locked(struct sonode *so, boolean_t dontblock, + struct sonode **nsop) +{ + struct sonode *nso = NULL; + + *nsop = NULL; + ASSERT(MUTEX_HELD(&so->so_acceptq_lock)); + while ((nso = so->so_acceptq_head) == NULL) { + /* + * No need to check so_error here, because it is not + * possible for a listening socket to be reset or otherwise + * disconnected. + * + * So now we just need check if it's ok to wait. + */ + if (dontblock) + return (EWOULDBLOCK); + if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING)) + return (EINTR); + + if (cv_wait_sig_swap(&so->so_acceptq_cv, + &so->so_acceptq_lock) == 0) + return (EINTR); + } + + ASSERT(nso != NULL); + so->so_acceptq_head = nso->so_acceptq_next; + nso->so_acceptq_next = NULL; + + if (so->so_acceptq_head == NULL) { + ASSERT(so->so_acceptq_tail == &nso->so_acceptq_next); + so->so_acceptq_tail = &so->so_acceptq_head; + } + ASSERT(so->so_acceptq_len > 0); + --so->so_acceptq_len; + + *nsop = nso; + + return (0); +} + +/* + * int so_acceptq_dequeue(struct sonode *, boolean_t, struct sonode **) + * + * Pulls a connection off of the accept queue. + * + * Arguments: + * so - listening socket + * dontblock - indicate whether it's ok to sleep if there are no + * connections on the queue + * nsop - Value-return argument + * + * Return values: + * 0 when a connection is successfully dequeued, in which case nsop + * is set to point to the new connection. Upon failure a non-zero + * value is returned, and the value of nsop is set to NULL. 
+ * + * Note: + * so_acceptq_dequeue() may return prematurely if the socket is falling + * back to TPI. + */ +int +so_acceptq_dequeue(struct sonode *so, boolean_t dontblock, + struct sonode **nsop) +{ + int error; + + mutex_enter(&so->so_acceptq_lock); + error = so_acceptq_dequeue_locked(so, dontblock, nsop); + mutex_exit(&so->so_acceptq_lock); + + return (error); +} + +/* + * void so_acceptq_flush(struct sonode *so) + * + * Removes all pending connections from a listening socket, and + * frees the associated resources. + * + * Arguments: + * so - listening socket + * + * Return values: + * None. + * + * Note: + * The caller has to ensure that no calls to so_acceptq_enqueue() or + * so_acceptq_dequeue() occur while the accept queue is being flushed. + * So either the socket needs to be in a state where no operations + * would come in, or so_lock needs to be obtained. + * + * NOTE(review): so_acceptq_dequeue() serializes on so_acceptq_lock, not + * so_lock -- confirm which lock the sentence above is meant to name. + */ +void +so_acceptq_flush(struct sonode *so) +{ + struct sonode *nso; + + nso = so->so_acceptq_head; + + while (nso != NULL) { + struct sonode *nnso = NULL; + + nnso = nso->so_acceptq_next; + nso->so_acceptq_next = NULL; + /* + * Since the socket is on the accept queue, there can + * only be one reference. We drop the reference and + * just blow off the socket. + */ + ASSERT(nso->so_count == 1); + nso->so_count--; + socket_destroy(nso); + nso = nnso; + } + + so->so_acceptq_head = NULL; + so->so_acceptq_tail = &so->so_acceptq_head; + so->so_acceptq_len = 0; +} + +/* + * Variant of so_wait_connected() for callers that already hold so_lock + * (asserted on entry). + */ +int +so_wait_connected_locked(struct sonode *so, boolean_t nonblock, + sock_connid_t id) +{ + ASSERT(MUTEX_HELD(&so->so_lock)); + + /* + * The protocol has notified us that a connection attempt is being + * made, so before we wait for a notification to arrive we must + * clear out any errors associated with earlier connection attempts. 
+ */ + if (so->so_error != 0 && SOCK_CONNID_LT(so->so_proto_connid, id)) + so->so_error = 0; + + while (SOCK_CONNID_LT(so->so_proto_connid, id)) { + if (nonblock) + return (EINPROGRESS); + + if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING)) + return (EINTR); + + if (cv_wait_sig_swap(&so->so_state_cv, &so->so_lock) == 0) + return (EINTR); + } + + if (so->so_error != 0) + return (sogeterr(so, B_TRUE)); + /* + * Under normal circumstances, so_error should contain an error + * in case the connect failed. However, it is possible for another + * thread to come in and consume the error, so generate a sensible + * error in that case. + */ + if ((so->so_state & SS_ISCONNECTED) == 0) + return (ECONNREFUSED); + + return (0); +} + +/* + * int so_wait_connected(struct sonode *so, boolean_t nonblock, + * sock_connid_t id) + * + * Wait until the socket is connected or an error has occurred. + * + * Arguments: + * so - socket + * nonblock - if set, return EINPROGRESS instead of sleeping when the + * connection has not yet been established + * id - generation number that was returned by the protocol + * when the operation was started + * + * Returns: + * 0 if the connection attempt was successful, or an error indicating why + * the connection attempt failed. EINPROGRESS is returned when nonblock + * is set and the attempt is still pending; EINTR when the wait is + * interrupted or the socket is closing or falling back. + */ +int +so_wait_connected(struct sonode *so, boolean_t nonblock, sock_connid_t id) +{ + int error; + + mutex_enter(&so->so_lock); + error = so_wait_connected_locked(so, nonblock, id); + mutex_exit(&so->so_lock); + + return (error); +} + +/* + * Wait, with so_lock held, until so_snd_qfull is cleared. + * + * Returns 0 once the send queue is no longer full, EPIPE if the socket + * cannot send any more (SS_CANTSENDMORE), EWOULDBLOCK if dontblock is set, + * EINTR if the socket is closing / falling back or the wait is interrupted, + * and ETIME if the so_sndtimeo timed wait expires. + */ +int +so_snd_wait_qnotfull_locked(struct sonode *so, boolean_t dontblock) +{ + int error; + + ASSERT(MUTEX_HELD(&so->so_lock)); + while (so->so_snd_qfull) { + if (so->so_state & SS_CANTSENDMORE) + return (EPIPE); + if (dontblock) + return (EWOULDBLOCK); + + if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING)) + return (EINTR); + + if (so->so_sndtimeo == 0) { + /* + * Zero means disable timeout. 
+ */ + error = cv_wait_sig(&so->so_snd_cv, &so->so_lock); + } else { + clock_t now; + + time_to_wait(&now, so->so_sndtimeo); + error = cv_timedwait_sig(&so->so_snd_cv, &so->so_lock, + now); + } + if (error == 0) + return (EINTR); + else if (error == -1) + return (ETIME); + } + return (0); +} + +/* + * int so_snd_wait_qnotfull(struct sonode *so, boolean_t dontblock) + * + * Wait for the transport to notify us about send buffers becoming + * available. Takes so_lock, and sets so_snd_wakeup for the duration of + * the wait when the send queue is currently marked full. + */ +int +so_snd_wait_qnotfull(struct sonode *so, boolean_t dontblock) +{ + int error = 0; + + mutex_enter(&so->so_lock); + if (so->so_snd_qfull) { + so->so_snd_wakeup = B_TRUE; + error = so_snd_wait_qnotfull_locked(so, dontblock); + so->so_snd_wakeup = B_FALSE; + } + mutex_exit(&so->so_lock); + + return (error); +} + +/* + * Mark the send queue as full (sets so_snd_qfull under so_lock). + */ +void +so_snd_qfull(struct sonode *so) +{ + mutex_enter(&so->so_lock); + so->so_snd_qfull = B_TRUE; + mutex_exit(&so->so_lock); +} + +/* + * Clear so_snd_qfull and wake every thread waiting on so_snd_cv. + */ +void +so_snd_qnotfull(struct sonode *so) +{ + mutex_enter(&so->so_lock); + so->so_snd_qfull = B_FALSE; + /* wake up everyone waiting for buffers */ + cv_broadcast(&so->so_snd_cv); + mutex_exit(&so->so_lock); +} + +/* + * Change the process/process group to which SIGIO is sent. + * The caller must hold so_lock. Returns 0, or the error from kill(). + */ +int +socket_chgpgrp(struct sonode *so, pid_t pid) +{ + int error; + + ASSERT(MUTEX_HELD(&so->so_lock)); + if (pid != 0) { + /* + * Permissions check by sending signal 0. + * Note that when kill fails it does a + * set_errno causing the system call to fail. + */ + error = kill(pid, 0); + if (error != 0) { + return (error); + } + } + so->so_pgrp = pid; + return (0); +} + + +/* + * Generate a SIGIO, for 'writable' events include siginfo structure, + * for read events just send the signal. 
+ */ +/*ARGSUSED*/ +static void +socket_sigproc(proc_t *proc, int event) +{ + k_siginfo_t info; + + ASSERT(event & (SOCKETSIG_WRITE | SOCKETSIG_READ | SOCKETSIG_URG)); + + if (event & SOCKETSIG_WRITE) { + info.si_signo = SIGPOLL; + info.si_code = POLL_OUT; + info.si_errno = 0; + info.si_fd = 0; + info.si_band = 0; + sigaddq(proc, NULL, &info, KM_NOSLEEP); + } + if (event & SOCKETSIG_READ) { + sigtoproc(proc, NULL, SIGPOLL); + } + if (event & SOCKETSIG_URG) { + sigtoproc(proc, NULL, SIGURG); + } +} + +void +socket_sendsig(struct sonode *so, int event) +{ + proc_t *proc; + + ASSERT(MUTEX_HELD(&so->so_lock)); + + if (so->so_pgrp == 0 || (!(so->so_state & SS_ASYNC) && + event != SOCKETSIG_URG)) { + return; + } + + dprint(3, ("sending sig %d to %d\n", event, so->so_pgrp)); + + if (so->so_pgrp > 0) { + /* + * XXX This unfortunately still generates + * a signal when a fd is closed but + * the proc is active. + */ + mutex_enter(&pidlock); + proc = prfind(so->so_pgrp); + if (proc == NULL) { + mutex_exit(&pidlock); + return; + } + mutex_enter(&proc->p_lock); + mutex_exit(&pidlock); + socket_sigproc(proc, event); + mutex_exit(&proc->p_lock); + } else { + /* + * Send to process group. Hold pidlock across + * calls to socket_sigproc(). + */ + pid_t pgrp = -so->so_pgrp; + + mutex_enter(&pidlock); + proc = pgfind(pgrp); + while (proc != NULL) { + mutex_enter(&proc->p_lock); + socket_sigproc(proc, event); + mutex_exit(&proc->p_lock); + proc = proc->p_pglink; + } + mutex_exit(&pidlock); + } +} + +#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) +/* + * Copy userdata into a new mblk_t chain. + * + * At most iosize bytes (INFPSZ or anything above uio_resid means uio_resid) + * are copied from uiop into mblks of at most maxblk data bytes each + * (INFPSZ means a single block). Every mblk is allocated with wroff bytes + * of headroom before the data and tail_len bytes of room after it. + * + * Returns the head of the chain with *errorp set to 0. If allocb() fails, + * *errorp is ENOMEM and whatever part of the chain was already built + * (possibly NULL) is returned. If uiomove() fails, the chain is freed, + * *errorp holds the uiomove() error and NULL is returned. + */ +mblk_t * +socopyinuio(uio_t *uiop, ssize_t iosize, size_t wroff, ssize_t maxblk, + size_t tail_len, int *errorp) +{ + mblk_t *head = NULL, **tail = &head; + + ASSERT(iosize == INFPSZ || iosize > 0); + + if (iosize == INFPSZ || iosize > uiop->uio_resid) + iosize = uiop->uio_resid; + + if (maxblk == INFPSZ) + maxblk = iosize; + + /* Nothing to do in these cases, so we're done */ + if (iosize < 0 || maxblk < 0 || (maxblk == 0 && iosize > 0)) + goto done; + + /* + * We will enter the loop below if iosize is 0; it will allocate an + * empty message block and call uiomove(9F) which will just return. + * We could avoid that with an extra check but would only slow + * down the much more likely case where iosize is larger than 0. + */ + do { + ssize_t blocksize; + mblk_t *mp; + + blocksize = MIN(iosize, maxblk); + ASSERT(blocksize >= 0); + if ((mp = allocb(wroff + blocksize + tail_len, + BPRI_MED)) == NULL) { + *errorp = ENOMEM; + return (head); + } + mp->b_rptr += wroff; + mp->b_wptr = mp->b_rptr + blocksize; + + *tail = mp; + tail = &mp->b_cont; + + /* uiomove(9F) either returns 0 or EFAULT */ + if ((*errorp = uiomove(mp->b_rptr, (size_t)blocksize, + UIO_WRITE, uiop)) != 0) { + ASSERT(*errorp != ENOMEM); + freemsg(head); + return (NULL); + } + + iosize -= blocksize; + } while (iosize > 0); + +done: + *errorp = 0; + return (head); +} + +/* + * Copy data from the mblk chain mp out to uiop. + * + * At most max_read bytes are copied (INFPSZ or anything above uio_resid + * means uio_resid). Fully consumed mblks are freed as the copy advances. + * Returns what is left of the chain (NULL if all of it was consumed) with + * *errorp set to 0. If uiomove() fails, the chain is freed, *errorp holds + * the error and NULL is returned. + */ +mblk_t * +socopyoutuio(mblk_t *mp, struct uio *uiop, ssize_t max_read, int *errorp) +{ + int error; + ptrdiff_t n; + mblk_t *nmp; + + ASSERT(mp->b_wptr >= mp->b_rptr); + + /* + * max_read is the offset of the oobmark and read cannot go past + * the oobmark. 
+ */ + if (max_read == INFPSZ || max_read > uiop->uio_resid) + max_read = uiop->uio_resid; + + do { + if ((n = MIN(max_read, MBLKL(mp))) != 0) { + ASSERT(n > 0); + + error = uiomove(mp->b_rptr, n, UIO_READ, uiop); + if (error != 0) { + freemsg(mp); + *errorp = error; + return (NULL); + } + } + + mp->b_rptr += n; + max_read -= n; + while (mp != NULL && (mp->b_rptr >= mp->b_wptr)) { + /* + * get rid of zero length mblks + */ + nmp = mp; + mp = mp->b_cont; + freeb(nmp); + } + } while (mp != NULL && max_read > 0); + + *errorp = 0; + return (mp); +} + +static void +so_prepend_msg(struct sonode *so, mblk_t *mp, mblk_t *last_tail) +{ + ASSERT(last_tail != NULL); + mp->b_next = so->so_rcv_q_head; + mp->b_prev = last_tail; + ASSERT(!(DB_FLAGS(mp) & DBLK_UIOA)); + + if (so->so_rcv_q_head == NULL) { + ASSERT(so->so_rcv_q_last_head == NULL); + so->so_rcv_q_last_head = mp; +#ifdef DEBUG + } else { + ASSERT(!(DB_FLAGS(so->so_rcv_q_head) & DBLK_UIOA)); +#endif + } + so->so_rcv_q_head = mp; + +#ifdef DEBUG + if (so_debug_length) { + mutex_enter(&so->so_lock); + ASSERT(so_check_length(so)); + mutex_exit(&so->so_lock); + } +#endif +} + +static void +process_new_message(struct sonode *so, mblk_t *mp_head, mblk_t *mp_last_head) +{ + ASSERT(mp_head->b_prev != NULL); + if (so->so_rcv_q_head == NULL) { + so->so_rcv_q_head = mp_head; + so->so_rcv_q_last_head = mp_last_head; + ASSERT(so->so_rcv_q_last_head->b_prev != NULL); + } else { + boolean_t flag_equal = ((DB_FLAGS(mp_head) & DBLK_UIOA) == + (DB_FLAGS(so->so_rcv_q_last_head) & DBLK_UIOA)); + + if (mp_head->b_next == NULL && + DB_TYPE(mp_head) == M_DATA && + DB_TYPE(so->so_rcv_q_last_head) == M_DATA && flag_equal) { + so->so_rcv_q_last_head->b_prev->b_cont = mp_head; + so->so_rcv_q_last_head->b_prev = mp_head->b_prev; + mp_head->b_prev = NULL; + } else if (flag_equal && (DB_FLAGS(mp_head) & DBLK_UIOA)) { + /* + * Append to last_head if more than one mblks, and both + * mp_head and last_head are I/OAT mblks. 
+ */ + ASSERT(mp_head->b_next != NULL); + so->so_rcv_q_last_head->b_prev->b_cont = mp_head; + so->so_rcv_q_last_head->b_prev = mp_head->b_prev; + mp_head->b_prev = NULL; + + so->so_rcv_q_last_head->b_next = mp_head->b_next; + mp_head->b_next = NULL; + so->so_rcv_q_last_head = mp_last_head; + } else { +#ifdef DEBUG + { + mblk_t *tmp_mblk; + tmp_mblk = mp_head; + while (tmp_mblk != NULL) { + ASSERT(tmp_mblk->b_prev != NULL); + tmp_mblk = tmp_mblk->b_next; + } + } +#endif + so->so_rcv_q_last_head->b_next = mp_head; + so->so_rcv_q_last_head = mp_last_head; + } + } +} + +/* + * Dequeue received data for a reader. + * + * Messages queued on so_rcv_head are first moved over to the processing + * list at so_rcv_q_head. Leading non-M_DATA mblks of the first message + * are handed back through *mctlp and M_DATA is copied out to uiop. + * rvalp->r_val1 reports MOREDATA when data blocks of the message remain + * after the copy. Returns 0 or an errno value. + */ +int +so_dequeue_msg(struct sonode *so, mblk_t **mctlp, struct uio *uiop, + rval_t *rvalp, int flags) +{ + mblk_t *mp, *nmp; + mblk_t *savemp, *savemptail; + mblk_t *new_msg_head; + mblk_t *new_msg_last_head; + mblk_t *last_tail; + boolean_t partial_read; + boolean_t reset_atmark = B_FALSE; + int more = 0; + int error; + ssize_t oobmark; + sodirect_t *sodp = so->so_direct; + + partial_read = B_FALSE; + *mctlp = NULL; +again: + mutex_enter(&so->so_lock); +again1: +#ifdef DEBUG + if (so_debug_length) { + ASSERT(so_check_length(so)); + } +#endif + /* + * First move messages from the dump area to processing area + */ + if (sodp != NULL) { + /* No need to grab sod_lockp since it points to so_lock */ + if (sodp->sod_state & SOD_ENABLED) { + ASSERT(sodp->sod_lockp == &so->so_lock); + + if (sodp->sod_uioa.uioa_state & UIOA_ALLOC) { + /* nothing to uioamove */ + sodp = NULL; + } else if (sodp->sod_uioa.uioa_state & UIOA_INIT) { + sodp->sod_uioa.uioa_state &= UIOA_CLR; + sodp->sod_uioa.uioa_state |= UIOA_ENABLED; + /* + * try to uioamove() the data that + * has already been queued. 
+ */ + sod_uioa_so_init(so, sodp, uiop); + } + } else { + sodp = NULL; + } + } + new_msg_head = so->so_rcv_head; + new_msg_last_head = so->so_rcv_last_head; + so->so_rcv_head = NULL; + so->so_rcv_last_head = NULL; + oobmark = so->so_oobmark; + /* + * We can release the lock as there can only be one reader + */ + mutex_exit(&so->so_lock); + + if (so->so_state & SS_RCVATMARK) { + reset_atmark = B_TRUE; + } + if (new_msg_head != NULL) { + process_new_message(so, new_msg_head, new_msg_last_head); + } + savemp = savemptail = NULL; + rvalp->r_val1 = 0; + error = 0; + mp = so->so_rcv_q_head; + + if (mp != NULL && + (so->so_rcv_timer_tid == 0 || + so->so_rcv_queued >= so->so_rcv_thresh)) { + partial_read = B_FALSE; + + if (flags & MSG_PEEK) { + if ((nmp = dupmsg(mp)) == NULL && + (nmp = copymsg(mp)) == NULL) { + size_t size = msgsize(mp); + + error = strwaitbuf(size, BPRI_HI); + if (error) { + return (error); + } + goto again; + } + mp = nmp; + } else { + ASSERT(mp->b_prev != NULL); + last_tail = mp->b_prev; + mp->b_prev = NULL; + so->so_rcv_q_head = mp->b_next; + if (so->so_rcv_q_head == NULL) { + so->so_rcv_q_last_head = NULL; + } + mp->b_next = NULL; + } + + ASSERT(mctlp != NULL); + /* + * First process PROTO or PCPROTO blocks, if any. + */ + if (DB_TYPE(mp) != M_DATA) { + *mctlp = mp; + savemp = mp; + savemptail = mp; + ASSERT(DB_TYPE(mp) == M_PROTO || + DB_TYPE(mp) == M_PCPROTO); + while (mp->b_cont != NULL && + DB_TYPE(mp->b_cont) != M_DATA) { + ASSERT(DB_TYPE(mp->b_cont) == M_PROTO || + DB_TYPE(mp->b_cont) == M_PCPROTO); + mp = mp->b_cont; + savemptail = mp; + } + mp = savemptail->b_cont; + savemptail->b_cont = NULL; + } + + ASSERT(DB_TYPE(mp) == M_DATA); + /* + * Now process DATA blocks, if any. Note that for sodirect + * enabled socket, uio_resid can be 0. 
+ */ + if (uiop->uio_resid >= 0) { + ssize_t copied = 0; + + if (sodp != NULL && (DB_FLAGS(mp) & DBLK_UIOA)) { + mutex_enter(sodp->sod_lockp); + ASSERT(uiop == (uio_t *)&sodp->sod_uioa); + copied = sod_uioa_mblk(so, mp); + if (copied > 0) + partial_read = B_TRUE; + mutex_exit(sodp->sod_lockp); + /* mark this mblk as processed */ + mp = NULL; + } else { + ssize_t oldresid = uiop->uio_resid; + + if (MBLKL(mp) < so_mblk_pull_len) { + if (pullupmsg(mp, -1) == 1) { + last_tail = mp; + } + } + /* + * Can not read beyond the oobmark + */ + mp = socopyoutuio(mp, uiop, + oobmark == 0 ? INFPSZ : oobmark, &error); + if (error != 0) { + freemsg(*mctlp); + *mctlp = NULL; + more = 0; + goto done; + } + ASSERT(oldresid >= uiop->uio_resid); + copied = oldresid - uiop->uio_resid; + if (oldresid > uiop->uio_resid) + partial_read = B_TRUE; + } + ASSERT(copied >= 0); + if (copied > 0 && !(flags & MSG_PEEK)) { + mutex_enter(&so->so_lock); + so->so_rcv_queued -= copied; + ASSERT(so->so_oobmark >= 0); + if (so->so_oobmark > 0) { + so->so_oobmark -= copied; + ASSERT(so->so_oobmark >= 0); + if (so->so_oobmark == 0) { + ASSERT(so->so_state & + SS_OOBPEND); + so->so_oobmark = 0; + so->so_state |= SS_RCVATMARK; + } + } + if (so->so_flowctrld && so->so_rcv_queued < + so->so_rcvlowat) { + so->so_flowctrld = B_FALSE; + mutex_exit(&so->so_lock); + /* + * open up flow control + */ + (*so->so_downcalls->sd_clr_flowctrl) + (so->so_proto_handle); + } else { + mutex_exit(&so->so_lock); + } + } + } + if (mp != NULL) { /* more data blocks in msg */ + more |= MOREDATA; + if ((flags & (MSG_PEEK|MSG_TRUNC))) { + if (flags & MSG_TRUNC) { + mutex_enter(&so->so_lock); + so->so_rcv_queued -= msgdsize(mp); + mutex_exit(&so->so_lock); + } + freemsg(mp); + } else if (partial_read && !somsghasdata(mp)) { + /* + * Avoid queuing a zero-length tail part of + * a message. partial_read == 1 indicates that + * we read some of the message. 
+ */ + freemsg(mp); + more &= ~MOREDATA; + } else { + if (savemp != NULL && + (flags & MSG_DUPCTRL)) { + mblk_t *nmp; + /* + * There should only be non data mblks + */ + ASSERT(DB_TYPE(savemp) != M_DATA && + DB_TYPE(savemptail) != M_DATA); +try_again: + if ((nmp = dupmsg(savemp)) == NULL && + (nmp = copymsg(savemp)) == NULL) { + + size_t size = msgsize(savemp); + + error = strwaitbuf(size, + BPRI_HI); + if (error != 0) { + /* + * In case we + * cannot copy + * control data + * free the remaining + * data. + */ + freemsg(mp); + goto done; + } + goto try_again; + } + + ASSERT(nmp != NULL); + ASSERT(DB_TYPE(nmp) != M_DATA); + savemptail->b_cont = mp; + *mctlp = nmp; + mp = savemp; + } + /* + * putback mp + */ + so_prepend_msg(so, mp, last_tail); + } + } + + /* fast check so_rcv_head if there is more data */ + if (partial_read && !(so->so_state & SS_RCVATMARK) && + *mctlp == NULL && uiop->uio_resid > 0 && + !(flags & MSG_PEEK) && so->so_rcv_head != NULL) { + goto again; + } + } else if (!partial_read) { + mutex_enter(&so->so_lock); + if (so->so_error != 0) { + error = sogeterr(so, !(flags & MSG_PEEK)); + mutex_exit(&so->so_lock); + return (error); + } + /* + * No pending data. Return right away for nonblocking + * socket, otherwise sleep waiting for data. + */ + if (!(so->so_state & SS_CANTRCVMORE)) { + if ((uiop->uio_fmode & (FNDELAY|FNONBLOCK)) || + (flags & MSG_DONTWAIT)) { + error = EWOULDBLOCK; + } else { + if (so->so_state & (SS_CLOSING | + SS_FALLBACK_PENDING)) { + mutex_exit(&so->so_lock); + error = EINTR; + goto done; + } + + if (so->so_rcv_head != NULL) { + goto again1; + } + so->so_rcv_wakeup = B_TRUE; + so->so_rcv_wanted = uiop->uio_resid; + if (so->so_rcvtimeo == 0) { + /* + * Zero means disable timeout. 
+ */ + error = cv_wait_sig(&so->so_rcv_cv, + &so->so_lock); + } else { + clock_t now; + time_to_wait(&now, so->so_rcvtimeo); + error = cv_timedwait_sig(&so->so_rcv_cv, + &so->so_lock, now); + } + so->so_rcv_wakeup = B_FALSE; + so->so_rcv_wanted = 0; + + if (error == 0) { + error = EINTR; + } else if (error == -1) { + error = ETIME; + } else { + goto again1; + } + } + } + mutex_exit(&so->so_lock); + } + if (reset_atmark && partial_read && !(flags & MSG_PEEK)) { + /* + * We are passed the mark, update state + * 4.3BSD and 4.4BSD clears the mark when peeking across it. + * The draft Posix socket spec states that the mark should + * not be cleared when peeking. We follow the latter. + */ + mutex_enter(&so->so_lock); + ASSERT(so_verify_oobstate(so)); + so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK); + freemsg(so->so_oobmsg); + so->so_oobmsg = NULL; + ASSERT(so_verify_oobstate(so)); + mutex_exit(&so->so_lock); + } + ASSERT(so->so_rcv_wakeup == B_FALSE); +done: + if (sodp != NULL) { + mutex_enter(sodp->sod_lockp); + if ((sodp->sod_state & SOD_ENABLED) && + (sodp->sod_uioa.uioa_state & UIOA_ENABLED)) { + SOD_UIOAFINI(sodp); + if (sodp->sod_uioa.uioa_mbytes > 0) { + ASSERT(so->so_rcv_q_head != NULL || + so->so_rcv_head != NULL); + so->so_rcv_queued -= sod_uioa_mblk(so, NULL); + if (error == EWOULDBLOCK) + error = 0; + } + } + mutex_exit(sodp->sod_lockp); + } +#ifdef DEBUG + if (so_debug_length) { + mutex_enter(&so->so_lock); + ASSERT(so_check_length(so)); + mutex_exit(&so->so_lock); + } +#endif + rvalp->r_val1 = more; + return (error); +} + +void +so_enqueue_msg(struct sonode *so, mblk_t *mp, size_t msg_size) +{ + ASSERT(MUTEX_HELD(&so->so_lock)); + +#ifdef DEBUG + if (so_debug_length) { + ASSERT(so_check_length(so)); + } +#endif + so->so_rcv_queued += msg_size; + + if (so->so_rcv_head == NULL) { + ASSERT(so->so_rcv_last_head == NULL); + so->so_rcv_head = mp; + so->so_rcv_last_head = mp; + } else if ((DB_TYPE(mp) == M_DATA && + DB_TYPE(so->so_rcv_last_head) == 
M_DATA) && + ((DB_FLAGS(mp) & DBLK_UIOA) == + (DB_FLAGS(so->so_rcv_last_head) & DBLK_UIOA))) { + /* Added to the end */ + ASSERT(so->so_rcv_last_head != NULL); + ASSERT(so->so_rcv_last_head->b_prev != NULL); + so->so_rcv_last_head->b_prev->b_cont = mp; + } else { + /* Start a new end */ + so->so_rcv_last_head->b_next = mp; + so->so_rcv_last_head = mp; + } + while (mp->b_cont != NULL) + mp = mp->b_cont; + + so->so_rcv_last_head->b_prev = mp; +#ifdef DEBUG + if (so_debug_length) { + ASSERT(so_check_length(so)); + } +#endif +} + +/* + * Return B_TRUE if there is data in the message, B_FALSE otherwise. + */ +boolean_t +somsghasdata(mblk_t *mp) +{ + for (; mp; mp = mp->b_cont) + if (mp->b_datap->db_type == M_DATA) { + ASSERT(mp->b_wptr >= mp->b_rptr); + if (mp->b_wptr > mp->b_rptr) + return (B_TRUE); + } + return (B_FALSE); +} + +/* + * Flush the read side of sockfs. + * + * The caller must be sure that a reader is not already active when the + * buffer is being flushed. + */ +void +so_rcv_flush(struct sonode *so) +{ + mblk_t *mp; + + ASSERT(MUTEX_HELD(&so->so_lock)); + + if (so->so_oobmsg != NULL) { + freemsg(so->so_oobmsg); + so->so_oobmsg = NULL; + so->so_oobmark = 0; + so->so_state &= + ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA|SS_RCVATMARK); + } + + /* + * Free messages sitting in the send and recv queue + */ + while (so->so_rcv_q_head != NULL) { + mp = so->so_rcv_q_head; + so->so_rcv_q_head = mp->b_next; + mp->b_next = mp->b_prev = NULL; + freemsg(mp); + } + while (so->so_rcv_head != NULL) { + mp = so->so_rcv_head; + so->so_rcv_head = mp->b_next; + mp->b_next = mp->b_prev = NULL; + freemsg(mp); + } + so->so_rcv_queued = 0; + so->so_rcv_q_head = NULL; + so->so_rcv_q_last_head = NULL; + so->so_rcv_head = NULL; + so->so_rcv_last_head = NULL; +} + +/* + * Handle recv* calls that set MSG_OOB or MSG_OOB together with MSG_PEEK. 
+ */
+int
+sorecvoob(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, int flags,
+    boolean_t oob_inline)
+{
+	mblk_t *mp, *nmp;
+	int error;
+
+	dprintso(so, 1, ("sorecvoob(%p, %p, 0x%x)\n", (void *)so, (void *)msg,
+	    flags));
+
+	if (msg != NULL) {
+		/*
+		 * There is never any oob data with addresses or control since
+		 * the T_EXDATA_IND does not carry any options.
+		 */
+		msg->msg_controllen = 0;
+		msg->msg_namelen = 0;
+		msg->msg_flags = 0;
+	}
+	/*
+	 * Validate the OOB state under so_lock.  EINVAL is returned when
+	 * oob_inline is set, or unless SS_OOBPEND is set while SS_HADOOBDATA
+	 * is clear.  EWOULDBLOCK is returned while SS_HAVEOOBDATA is not set.
+	 */
+	mutex_enter(&so->so_lock);
+	ASSERT(so_verify_oobstate(so));
+	if (oob_inline ||
+	    (so->so_state & (SS_OOBPEND|SS_HADOOBDATA)) != SS_OOBPEND) {
+		dprintso(so, 1, ("sorecvoob: inline or data consumed\n"));
+		mutex_exit(&so->so_lock);
+		return (EINVAL);
+	}
+	if (!(so->so_state & SS_HAVEOOBDATA)) {
+		dprintso(so, 1, ("sorecvoob: no data yet\n"));
+		mutex_exit(&so->so_lock);
+		return (EWOULDBLOCK);
+	}
+	ASSERT(so->so_oobmsg != NULL);
+	mp = so->so_oobmsg;
+	if (flags & MSG_PEEK) {
+		/*
+		 * Since recv* can not return ENOBUFS we can not use dupmsg.
+		 * Instead we revert to the consolidation private
+		 * allocb_wait plus bcopy.  The flattened copy replaces mp
+		 * below, so so_oobmsg itself is left in place when peeking.
+		 */
+		mblk_t *mp1;
+
+		mp1 = allocb_wait(msgdsize(mp), BPRI_MED, STR_NOSIG, NULL);
+		ASSERT(mp1);
+
+		while (mp != NULL) {
+			ssize_t size;
+
+			size = MBLKL(mp);
+			bcopy(mp->b_rptr, mp1->b_wptr, size);
+			mp1->b_wptr += size;
+			ASSERT(mp1->b_wptr <= mp1->b_datap->db_lim);
+			mp = mp->b_cont;
+		}
+		mp = mp1;
+	} else {
+		/*
+		 * Update the state indicating that the data has been consumed.
+		 * Keep SS_OOBPEND set until data is consumed past the mark.
+		 */
+		so->so_oobmsg = NULL;
+		so->so_state ^= SS_HAVEOOBDATA|SS_HADOOBDATA;	/* HAVE -> HAD */
+	}
+	ASSERT(so_verify_oobstate(so));
+	mutex_exit(&so->so_lock);
+	/*
+	 * Copy the message out one block at a time until it is exhausted,
+	 * the caller's buffer is full, or uiomove fails.  mp (the original
+	 * message when consuming, the private copy when peeking) is freed
+	 * afterwards in every case.
+	 */
+	error = 0;
+	nmp = mp;
+	while (nmp != NULL && uiop->uio_resid > 0) {
+		ssize_t n = MBLKL(nmp);
+
+		n = MIN(n, uiop->uio_resid);
+		if (n > 0)
+			error = uiomove(nmp->b_rptr, n,
+			    UIO_READ, uiop);
+		if (error)
+			break;
+		nmp = nmp->b_cont;
+	}
+	ASSERT(mp->b_next == NULL && mp->b_prev == NULL);
+	freemsg(mp);
+	return (error);
+}
+
+/*
+ * Allocate and initialize a sonode.
+ *
+ * Returns the new sonode, or NULL with *errorp set to EINVAL when the
+ * upcall/downcall versions of the socket module do not match
+ * SOCK_UC_VERSION/SOCK_DC_VERSION, or to ENOMEM when the cache allocation
+ * fails.  SOCKET_NOSLEEP in sflags selects a KM_NOSLEEP allocation, and a
+ * version of SOV_DEFAULT is replaced by so_default_version.
+ */
+/* ARGSUSED */
+struct sonode *
+socket_sonode_create(struct sockparams *sp, int family, int type,
+    int protocol, int version, int sflags, int *errorp, struct cred *cr)
+{
+	sonode_t *so;
+	int kmflags;
+
+	/*
+	 * Choose the right set of sonodeops based on the upcall and
+	 * down call version that the protocol has provided
+	 */
+	if (SOCK_UC_VERSION != sp->sp_smod_info->smod_uc_version ||
+	    SOCK_DC_VERSION != sp->sp_smod_info->smod_dc_version) {
+		/*
+		 * mismatch
+		 */
+#ifdef DEBUG
+		cmn_err(CE_CONT, "protocol and socket module version mismatch");
+#endif
+		*errorp = EINVAL;
+		return (NULL);
+	}
+
+	kmflags = (sflags & SOCKET_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
+
+	so = kmem_cache_alloc(socket_cache, kmflags);
+	if (so == NULL) {
+		*errorp = ENOMEM;
+		return (NULL);
+	}
+
+	sonode_init(so, sp, family, type, protocol, &so_sonodeops);
+
+	if (version == SOV_DEFAULT)
+		version = so_default_version;
+
+	so->so_version = (short)version;
+
+	/*
+	 * Set the default protocol properties: the receive high and low
+	 * water marks, and INFPSZ for maxpsz and maxblk.
+	 * if a protocol desires it can change the value later
+	 */
+	so->so_proto_props.sopp_rxhiwat = SOCKET_RECVHIWATER;
+	so->so_proto_props.sopp_rxlowat = SOCKET_RECVLOWATER;
+	so->so_proto_props.sopp_maxpsz = INFPSZ;
+	so->so_proto_props.sopp_maxblk = INFPSZ;
+
+	return (so);
+}
+/*
+ * Common socket initialization.
+ *
+ * When pso is non-NULL (passive open) the new socket inherits state,
+ * process group, timeouts, a subset of the socket level options, the
+ * protocol properties and the mode from the listener, and 0 is returned.
+ *
+ * Otherwise (active open) a lower protocol handle is created through the
+ * socket module's create function and then activated.  Returns 0 on
+ * success, the error reported by the protocol (ENOMEM if it reported none)
+ * when no handle could be created, or EPROTONOSUPPORT when the
+ * SO_PROTOTYPE setsockopt fails.
+ */
+int
+socket_init_common(struct sonode *so, struct sonode *pso, int flags, cred_t *cr)
+{
+	int error = 0;
+
+	if (pso != NULL) {
+		/*
+		 * We have a passive open, so inherit basic state from
+		 * the parent (listener).
+		 *
+		 * No need to grab the new sonode's lock, since there is no
+		 * one that can have a reference to it.
+		 */
+		mutex_enter(&pso->so_lock);
+
+		so->so_state |= SS_ISCONNECTED | (pso->so_state & SS_ASYNC);
+		so->so_pgrp = pso->so_pgrp;
+		so->so_rcvtimeo = pso->so_rcvtimeo;
+		so->so_sndtimeo = pso->so_sndtimeo;
+		/*
+		 * Make note of the socket level options. TCP and IP level
+		 * options are already inherited. We could do all this after
+		 * accept is successful but doing it here simplifies code and
+		 * no harm done for error case.
+		 */
+		so->so_options = pso->so_options & (SO_DEBUG|SO_REUSEADDR|
+		    SO_KEEPALIVE| SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
+		    SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
+		so->so_proto_props = pso->so_proto_props;
+		so->so_mode = pso->so_mode;
+
+		mutex_exit(&pso->so_lock);
+
+		if (uioasync.enabled) {
+			sod_sock_init(so, NULL, NULL, NULL, &so->so_lock);
+		}
+		return (0);
+	} else {
+		struct sockparams *sp = so->so_sockparams;
+		sock_upcalls_t *upcalls_to_use;
+
+		/*
+		 * Based on the version number select the right upcalls to
+		 * pass down. Currently we only have one version so choose
+		 * default
+		 */
+		upcalls_to_use = &so_upcalls;
+
+		/* active open, so create a lower handle */
+		so->so_proto_handle =
+		    sp->sp_smod_info->smod_proto_create_func(so->so_family,
+		    so->so_type, so->so_protocol, &so->so_downcalls,
+		    &so->so_mode, &error, flags, cr);
+
+		if (so->so_proto_handle == NULL) {
+			ASSERT(error != 0);
+			/*
+			 * To be safe; if a lower handle cannot be created, and
+			 * the proto does not give a reason why, assume there
+			 * was a lack of memory.
+			 */
+			return ((error == 0) ? ENOMEM : error);
+		}
+		ASSERT(so->so_downcalls != NULL);
+		ASSERT(so->so_downcalls->sd_send != NULL ||
+		    so->so_downcalls->sd_send_uio != NULL);
+		if (so->so_downcalls->sd_recv_uio != NULL) {
+			ASSERT(so->so_downcalls->sd_poll != NULL);
+			so->so_pollev |= SO_POLLEV_ALWAYS;
+		}
+
+		(*so->so_downcalls->sd_activate)(so->so_proto_handle,
+		    (sock_upper_handle_t)so, upcalls_to_use, 0, cr);
+
+		/* Wildcard */
+
+		/*
+		 * FIXME No need for this, the protocol can deal with it in
+		 * sd_create(). Should update ICMP.
+		 */
+		if (so->so_protocol != so->so_sockparams->sp_protocol) {
+			int protocol = so->so_protocol;
+			int error;	/* NOTE(review): shadows the outer error */
+			/*
+			 * Issue SO_PROTOTYPE setsockopt.
+			 */
+			error = socket_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
+			    &protocol, (t_uscalar_t)sizeof (protocol), cr);
+			if (error) {
+				(void) (*so->so_downcalls->sd_close)
+				    (so->so_proto_handle, 0, cr);
+
+				mutex_enter(&so->so_lock);
+				so_rcv_flush(so);
+				mutex_exit(&so->so_lock);
+				/*
+				 * Setsockopt often fails with ENOPROTOOPT but
+				 * socket() should fail with
+				 * EPROTONOSUPPORT/EPROTOTYPE.
+				 */
+				return (EPROTONOSUPPORT);
+			}
+		}
+		return (0);
+	}
+}
+
+/*
+ * int socket_ioctl_common(struct sonode *so, int cmd, intptr_t arg, int mode,
+ *         struct cred *cr, int32_t *rvalp)
+ *
+ * Handle ioctls that manipulate basic socket state; non-blocking,
+ * async, etc.
+ *
+ * Returns:
+ *   < 0  - ioctl was not handle
+ *   >= 0 - ioctl was handled, if > 0, then it is an errno
+ *
+ * Notes:
+ *   Assumes the standard receive buffer is used to obtain info for
+ *   NREAD.
+ */
+/* ARGSUSED */
+int
+socket_ioctl_common(struct sonode *so, int cmd, intptr_t arg, int mode,
+    struct cred *cr, int32_t *rvalp)
+{
+	switch (cmd) {
+	case FIONBIO: {		/* set/clear SS_NDELAY from the copied-in flag */
+		int32_t value;
+
+		if (so_copyin((void *)arg, &value, sizeof (int32_t),
+		    (mode & (int)FKIOCTL)))
+			return (EFAULT);
+
+		mutex_enter(&so->so_lock);
+		if (value) {
+			so->so_state |= SS_NDELAY;
+		} else {
+			so->so_state &= ~SS_NDELAY;
+		}
+		mutex_exit(&so->so_lock);
+		return (0);
+	}
+	case FIOASYNC: {	/* set/clear SS_ASYNC from the copied-in flag */
+		int32_t value;
+
+		if (so_copyin((void *)arg, &value, sizeof (int32_t),
+		    (mode & (int)FKIOCTL)))
+			return (EFAULT);
+
+		mutex_enter(&so->so_lock);
+
+		if (value) {
+			/* Turn on SIGIO */
+			so->so_state |= SS_ASYNC;
+		} else {
+			/* Turn off SIGIO */
+			so->so_state &= ~SS_ASYNC;
+		}
+		mutex_exit(&so->so_lock);
+
+		return (0);
+	}
+
+	case SIOCSPGRP:
+	case FIOSETOWN: {	/* pass a changed value to socket_chgpgrp() */
+		int error;
+		pid_t pid;
+
+		if (so_copyin((void *)arg, &pid, sizeof (pid_t),
+		    (mode & (int)FKIOCTL)))
+			return (EFAULT);
+
+		mutex_enter(&so->so_lock);
+		error = (pid != so->so_pgrp) ? socket_chgpgrp(so, pid) : 0;
+		mutex_exit(&so->so_lock);
+		return (error);
+	}
+	case SIOCGPGRP:
+	case FIOGETOWN:		/* copy out so_pgrp; so_lock is not taken */
+		if (so_copyout(&so->so_pgrp, (void *)arg,
+		    sizeof (pid_t), (mode & (int)FKIOCTL)))
+			return (EFAULT);
+
+		return (0);
+	case SIOCATMARK: {
+		int retval;
+
+		/*
+		 * Only protocols that support urgent data can handle ATMARK.
+		 */
+		if ((so->so_mode & SM_EXDATA) == 0)
+			return (EINVAL);
+
+		/*
+		 * If the protocol is maintaining its own buffer, then the
+		 * request must be passed down.
+		 */
+		if (so->so_downcalls->sd_recv_uio != NULL)
+			return (-1);
+
+		retval = (so->so_state & SS_RCVATMARK) != 0;
+
+		if (so_copyout(&retval, (void *)arg, sizeof (int),
+		    (mode & (int)FKIOCTL))) {
+			return (EFAULT);
+		}
+		return (0);
+	}
+
+	case FIONREAD: {
+		int retval;
+
+		/*
+		 * If the protocol is maintaining its own buffer, then the
+		 * request must be passed down.
+		 */
+		if (so->so_downcalls->sd_recv_uio != NULL)
+			return (-1);
+
+		retval = MIN(so->so_rcv_queued, INT_MAX);
+
+		if (so_copyout(&retval, (void *)arg,
+		    sizeof (retval), (mode & (int)FKIOCTL))) {
+			return (EFAULT);
+		}
+		return (0);
+	}
+
+	case _I_GETPEERCRED: {	/* rejected unless FKIOCTL is set in mode */
+		int error = 0;
+
+		if ((mode & FKIOCTL) == 0)
+			return (EINVAL);
+
+		mutex_enter(&so->so_lock);
+		if ((so->so_mode & SM_CONNREQUIRED) == 0) {
+			error = ENOTSUP;
+		} else if ((so->so_state & SS_ISCONNECTED) == 0) {
+			error = ENOTCONN;
+		} else if (so->so_peercred != NULL) {
+			/* arg is dereferenced directly, without so_copyout */
+			k_peercred_t *kp = (k_peercred_t *)arg;
+			kp->pc_cr = so->so_peercred;
+			kp->pc_cpid = so->so_cpid;
+			crhold(so->so_peercred);
+		} else {
+			error = EINVAL;
+		}
+		mutex_exit(&so->so_lock);
+		return (error);
+	}
+	default:		/* not handled here */
+		return (-1);
+	}
+}
+
+/*
+ * Process STREAMS related ioctls. If a I_PUSH/POP operation is specified
+ * then the socket will fall back to TPI.
+ *
+ * Returns:
+ *   < 0  - ioctl was not handle
+ *   >= 0 - ioctl was handled, if > 0, then it is an errno
+ */
+int
+socket_strioc_common(struct sonode *so, int cmd, intptr_t arg, int mode,
+    struct cred *cr, int32_t *rvalp)
+{
+	switch (cmd) {
+	case _I_INSERT:
+	case _I_REMOVE:
+	case I_FIND:
+	case I_LIST:
+		return (EOPNOTSUPP);
+
+	case I_PUSH:
+	case I_POP: {
+		int retval;
+
+		if ((retval = so_tpi_fallback(so, cr)) == 0) {
+			/* Reissue the ioctl */
+			ASSERT(so->so_rcv_q_head == NULL);
+			return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
+		}
+		return (retval);
+	}
+	case I_LOOK:		/* always answers with the name "sockmod" */
+		if (so_copyout("sockmod", (void *)arg, strlen("sockmod") + 1,
+		    (mode & (int)FKIOCTL))) {
+			return (EFAULT);
+		}
+		return (0);
+	default:
+		return (-1);
+	}
+}
+/*
+ * Handle the SOL_SOCKET getsockopt requests that can be answered from the
+ * sonode itself.
+ *
+ * Returns 0 with optval and *optlenp filled in when the option was
+ * answered here, EINVAL when the supplied buffer is too small, and -1 when
+ * the request is not handled here.  -1 covers other levels, unknown
+ * options, and the options for which only the buffer length is checked.
+ */
+int
+socket_getopt_common(struct sonode *so, int level, int option_name,
+    void *optval, socklen_t *optlenp)
+{
+	if (level != SOL_SOCKET)
+		return (-1);
+
+	switch (option_name) {
+	case SO_ERROR:
+	case SO_DOMAIN:
+	case SO_TYPE:
+	case SO_ACCEPTCONN: {
+		int32_t value;
+		socklen_t optlen = *optlenp;
+
+		if (optlen < (t_uscalar_t)sizeof (int32_t)) {
+			return (EINVAL);
+		}
+
+		switch (option_name) {
+		case SO_ERROR:
+			mutex_enter(&so->so_lock);
+			value = sogeterr(so, B_TRUE);
+			mutex_exit(&so->so_lock);
+			break;
+		case SO_DOMAIN:
+			value = so->so_family;
+			break;
+		case SO_TYPE:
+			value = so->so_type;
+			break;
+		case SO_ACCEPTCONN:
+			if (so->so_state & SS_ACCEPTCONN)
+				value = SO_ACCEPTCONN;
+			else
+				value = 0;
+			break;
+		}
+
+		bcopy(&value, optval, sizeof (value));
+		*optlenp = sizeof (value);
+
+		return (0);
+	}
+	case SO_SNDTIMEO:
+	case SO_RCVTIMEO: {	/* timeout converted to a struct timeval */
+		clock_t value;
+		socklen_t optlen = *optlenp;
+
+		if (optlen < (t_uscalar_t)sizeof (struct timeval)) {
+			return (EINVAL);
+		}
+		if (option_name == SO_RCVTIMEO)
+			value = drv_hztousec(so->so_rcvtimeo);
+		else
+			value = drv_hztousec(so->so_sndtimeo);
+		((struct timeval *)(optval))->tv_sec = value / (1000 * 1000);
+		((struct timeval *)(optval))->tv_usec = value % (1000 * 1000);
+		*optlenp = sizeof (struct timeval);
+		return (0);
+	}
+	case SO_DEBUG:
+	case SO_REUSEADDR:
+	case SO_KEEPALIVE:
+	case SO_DONTROUTE:
+	case SO_BROADCAST:
+	case SO_USELOOPBACK:
+	case SO_OOBINLINE:
+	case SO_SNDBUF:
+	case SO_RCVBUF:
+#ifdef notyet
+	case SO_SNDLOWAT:
+	case SO_RCVLOWAT:
+#endif /* notyet */
+	case SO_DGRAM_ERRIND: {	/* length check only, value not filled in */
+		socklen_t optlen = *optlenp;
+
+		if (optlen < (t_uscalar_t)sizeof (int32_t))
+			return (EINVAL);
+		break;
+	}
+	case SO_LINGER: {	/* length check only, value not filled in */
+		socklen_t optlen = *optlenp;
+
+		if (optlen < (t_uscalar_t)sizeof (struct linger))
+			return (EINVAL);
+		break;
+	}
+	case SO_SND_BUFINFO: {	/* answered from so_proto_props */
+		socklen_t optlen = *optlenp;
+
+		if (optlen < (t_uscalar_t)sizeof (struct so_snd_bufinfo))
+			return (EINVAL);
+		((struct so_snd_bufinfo *)(optval))->sbi_wroff =
+		    (so->so_proto_props).sopp_wroff;
+		((struct so_snd_bufinfo *)(optval))->sbi_maxblk =
+		    (so->so_proto_props).sopp_maxblk;
+		((struct so_snd_bufinfo *)(optval))->sbi_maxpsz =
+		    (so->so_proto_props).sopp_maxpsz;
+		((struct so_snd_bufinfo *)(optval))->sbi_tail =
+		    (so->so_proto_props).sopp_tail;
+		*optlenp = sizeof (struct so_snd_bufinfo);
+		return (0);
+	}
+	default:
+		break;
+	}
+
+	/*
+	 * Not handled here: either an unknown option, or one for which only
+	 * the buffer length was validated above.
+	 */
+	return (-1);
+}
+/*
+ * Tear down a sonode with sonode_fini() and return it to socket_cache.
+ */
+void
+socket_sonode_destroy(struct sonode *so)
+{
+	sonode_fini(so);
+	kmem_cache_free(socket_cache, so);
+}
+/*
+ * Wait until STZCNOTIFY is set in so_copyflag, then clear it.
+ *
+ * Returns 0 once the flag has been seen, or EINTR when the socket is
+ * closing or cv_wait_sig() returns 0.  The flag is also cleared on the
+ * cv_wait_sig() path, but not when returning early for SS_CLOSING.
+ */
+int
+so_zcopy_wait(struct sonode *so)
+{
+	int error = 0;
+
+	mutex_enter(&so->so_lock);
+	while (!(so->so_copyflag & STZCNOTIFY)) {
+		if (so->so_state & SS_CLOSING) {
+			mutex_exit(&so->so_lock);
+			return (EINTR);
+		}
+		if (cv_wait_sig(&so->so_copy_cv, &so->so_lock) == 0) {
+			error = EINTR;
+			break;
+		}
+	}
+	so->so_copyflag &= ~STZCNOTIFY;
+	mutex_exit(&so->so_lock);
+	return (error);
+}
+/*
+ * Receive timer callback; arg is the sonode.  Clears so_rcv_timer_tid and,
+ * when data is queued, calls so_notify_data() with the queued byte count.
+ *
+ * NOTE(review): so_lock is only released here when nothing is queued, so
+ * so_notify_data() presumably drops the lock itself - confirm.
+ */
+void
+so_timer_callback(void *arg)
+{
+	struct sonode *so = (struct sonode *)arg;
+
+	mutex_enter(&so->so_lock);
+
+	so->so_rcv_timer_tid = 0;
+	if (so->so_rcv_queued > 0) {
+		so_notify_data(so, so->so_rcv_queued);
+	} else {
+		mutex_exit(&so->so_lock);
+	}
+}
+
+#ifdef DEBUG
+/*
+
* Verify that the length stored in so_rcv_queued and the length of data blocks
+ * queued is same.
+ */
+static boolean_t
+so_check_length(sonode_t *so)
+{
+	mblk_t *mp = so->so_rcv_q_head;	/* first the so_rcv_q_head list */
+	int len = 0;
+
+	ASSERT(MUTEX_HELD(&so->so_lock));
+
+	if (mp != NULL) {
+		len = msgdsize(mp);
+		while ((mp = mp->b_next) != NULL)
+			len += msgdsize(mp);
+	}
+	mp = so->so_rcv_head;		/* then the so_rcv_head list */
+	if (mp != NULL) {
+		len += msgdsize(mp);
+		while ((mp = mp->b_next) != NULL)
+			len += msgdsize(mp);
+	}
+	return ((len == so->so_rcv_queued) ? B_TRUE : B_FALSE);
+}
+#endif
+/*
+ * Return the smod_version recorded in the socket module information of sp.
+ */
+int
+so_get_mod_version(struct sockparams *sp)
+{
+	ASSERT(sp != NULL && sp->sp_smod_info != NULL);
+	return (sp->sp_smod_info->smod_version);
+}
+
+/*
+ * so_start_fallback()
+ *
+ * Block new socket operations from coming in, and wait for active operations
+ * to complete. Threads that are sleeping will be woken up so they can get
+ * out of the way.
+ *
+ * The caller must be a reader on so_fallback_rwlock.
+ *
+ * Returns B_FALSE, with the lock still held as reader, if a fallback is
+ * already pending.  Otherwise returns B_TRUE with so_fallback_rwlock held
+ * as writer.
+ */
+static boolean_t
+so_start_fallback(struct sonode *so)
+{
+	ASSERT(RW_READ_HELD(&so->so_fallback_rwlock));
+
+	mutex_enter(&so->so_lock);
+	if (so->so_state & SS_FALLBACK_PENDING) {
+		mutex_exit(&so->so_lock);
+		return (B_FALSE);
+	}
+	so->so_state |= SS_FALLBACK_PENDING;
+	/*
+	 * Poke all threads that might be sleeping. Any operation that comes
+	 * in after the cv_broadcast will observe the fallback pending flag
+	 * which cause the call to return where it would normally sleep.
+	 */
+	cv_broadcast(&so->so_state_cv);	/* threads in connect() */
+	cv_broadcast(&so->so_rcv_cv);	/* threads in recvmsg() */
+	cv_broadcast(&so->so_snd_cv);	/* threads in sendmsg() */
+	mutex_enter(&so->so_acceptq_lock);
+	cv_broadcast(&so->so_acceptq_cv);	/* threads in accept() */
+	mutex_exit(&so->so_acceptq_lock);
+	mutex_exit(&so->so_lock);
+
+	/*
+	 * The main reason for the rw_tryupgrade call is to provide
+	 * observability during the fallback process. We want to
+	 * be able to see if there are pending operations.
+	 */
+	if (rw_tryupgrade(&so->so_fallback_rwlock) == 0) {
+		/*
+		 * It is safe to drop and reacquire the fallback lock, because
+		 * we are guaranteed that another fallback cannot take place.
+		 */
+		rw_exit(&so->so_fallback_rwlock);
+		DTRACE_PROBE1(pending__ops__wait, (struct sonode *), so);
+		rw_enter(&so->so_fallback_rwlock, RW_WRITER);
+		DTRACE_PROBE1(pending__ops__complete, (struct sonode *), so);
+	}
+
+	return (B_TRUE);
+}
+
+/*
+ * so_end_fallback()
+ *
+ * Allow socket operations back in: clear SS_FALLBACK_PENDING and downgrade
+ * so_fallback_rwlock from writer to reader.
+ *
+ * The caller must be a writer on so_fallback_rwlock.
+ */
+static void
+so_end_fallback(struct sonode *so)
+{
+	ASSERT(RW_ISWRITER(&so->so_fallback_rwlock));
+
+	mutex_enter(&so->so_lock);
+	so->so_state &= ~SS_FALLBACK_PENDING;
+	mutex_exit(&so->so_lock);
+
+	rw_downgrade(&so->so_fallback_rwlock);
+}
+
+/*
+ * so_quiesced_cb()
+ *
+ * Callback passed to the protocol during fallback. It is called once
+ * the endpoint is quiescent.
+ *
+ * No requests from the user, no notifications from the protocol, so it
+ * is safe to synchronize the state. Data can also be moved without
+ * risk for reordering.
+ *
+ * NOTE: urgent data is dropped on the floor.
+ *
+ * We do not need to hold so_lock, since there can be only one thread
+ * operating on the sonode.
+ */
+static void
+so_quiesced_cb(sock_upper_handle_t sock_handle, queue_t *q,
+    struct T_capability_ack *tcap, struct sockaddr *laddr, socklen_t laddrlen,
+    struct sockaddr *faddr, socklen_t faddrlen, short opts)
+{
+	struct sonode *so = (struct sonode *)sock_handle;
+
+	sotpi_update_state(so, tcap, laddr, laddrlen, faddr, faddrlen, opts);
+
+	mutex_enter(&so->so_lock);
+	SOCKET_TIMER_CANCEL(so);
+	mutex_exit(&so->so_lock);
+	/*
+	 * Move data to the STREAM head.  The so_rcv_head list is first
+	 * appended to the so_rcv_q list; each message is then unlinked,
+	 * subtracted from so_rcv_queued and passed to q with putnext().
+	 */
+	if (so->so_rcv_head != NULL) {
+		if (so->so_rcv_q_last_head == NULL)
+			so->so_rcv_q_head = so->so_rcv_head;
+		else
+			so->so_rcv_q_last_head->b_next = so->so_rcv_head;
+		so->so_rcv_q_last_head = so->so_rcv_last_head;
+	}
+
+	while (so->so_rcv_q_head != NULL) {
+		mblk_t *mp = so->so_rcv_q_head;
+		size_t mlen = msgdsize(mp);
+
+		so->so_rcv_q_head = mp->b_next;
+		mp->b_next = NULL;
+		mp->b_prev = NULL;
+		so->so_rcv_queued -= mlen;
+		putnext(q, mp);
+	}
+	ASSERT(so->so_rcv_queued == 0);
+	so->so_rcv_head = NULL;
+	so->so_rcv_last_head = NULL;
+	so->so_rcv_q_head = NULL;
+	so->so_rcv_q_last_head = NULL;
+
+#ifdef DEBUG
+	if (so->so_oobmsg != NULL || so->so_oobmark > 0) {
+		cmn_err(CE_NOTE, "losing oob data due to tpi fallback\n");
+	}
+#endif
+	if (so->so_oobmsg != NULL) {
+		freemsg(so->so_oobmsg);
+		so->so_oobmsg = NULL;
+	}
+	so->so_oobmark = 0;
+
+	ASSERT(so->so_rcv_queued == 0);
+}
+
+/*
+ * so_tpi_fallback()
+ *
+ * This is the fallback initiation routine; things start here.
+ *
+ * Basic strategy:
+ *   o Block new socket operations from coming in
+ *   o Allocate/initiate info needed by TPI
+ *   o Quiesce the connection, at which point we sync
+ *     state and move data
+ *   o Change operations (sonodeops) associated with the socket
+ *   o Unblock threads waiting for the fallback to finish
+ *
+ * Returns 0 on success; EINVAL when the sockparams has no device, the
+ * socket module has no fallback function, or the conversion to TPI fails;
+ * EAGAIN when a fallback is already pending; otherwise the error set by
+ * sockparams_hold_ephemeral_bydev().
+ */
+int
+so_tpi_fallback(struct sonode *so, struct cred *cr)
+{
+	int error;
+	queue_t *q;
+	struct sockparams *sp;
+	struct sockparams *newsp;
+	so_proto_fallback_func_t fbfunc;
+	boolean_t direct;
+
+	error = 0;
+	sp = so->so_sockparams;
+	fbfunc = sp->sp_smod_info->smod_proto_fallback_func;
+
+	/*
+	 * Fallback can only happen if there is a device associated
+	 * with the sonode, and the socket module has a fallback function.
+	 */
+	if (!SOCKPARAMS_HAS_DEVICE(sp) || fbfunc == NULL)
+		return (EINVAL);
+
+	/*
+	 * Initiate fallback; upon success we know that no new requests
+	 * will come in from the user.
+	 */
+	if (!so_start_fallback(so))
+		return (EAGAIN);
+
+	newsp = sockparams_hold_ephemeral_bydev(so->so_family, so->so_type,
+	    so->so_protocol, so->so_sockparams->sp_sdev_info.sd_devpath,
+	    KM_SLEEP, &error);
+	if (error != 0)
+		goto out;
+
+	if (so->so_direct != NULL) {
+		sodirect_t *sodp = so->so_direct;
+		mutex_enter(sodp->sod_lockp);
+
+		so->so_direct->sod_state &= ~SOD_ENABLED;
+		so->so_state &= ~SS_SODIRECT;
+		ASSERT(sodp->sod_uioafh == NULL);
+		mutex_exit(sodp->sod_lockp);
+	}
+
+	/*
+	 * Turn sonode into a TPI socket.
+	 * NOTE(review): when this fails the sodirect state cleared just
+	 * above is not restored before returning EINVAL.
+	 */
+	q = sotpi_convert_sonode(so, newsp, &direct, cr);
+	if (q == NULL) {
+		zcmn_err(getzoneid(), CE_WARN,
+		    "Failed to convert socket to TPI. Pid = %d\n",
+		    curproc->p_pid);
+		SOCKPARAMS_DEC_REF(newsp);
+		error = EINVAL;
+		goto out;
+	}
+
+	/*
+	 * Now tell the protocol to start using TPI. so_quiesced_cb will be
+	 * called once it's safe to synchronize state.
+	 */
+	DTRACE_PROBE1(proto__fallback__begin, struct sonode *, so);
+	/* FIXME assumes this cannot fail. TCP can fail to enter squeue */
+	(*fbfunc)(so->so_proto_handle, q, direct, so_quiesced_cb);
+	DTRACE_PROBE1(proto__fallback__end, struct sonode *, so);
+
+	/*
+	 * Free all pending connection indications, i.e., socket_accept() has
+	 * not yet pulled the connection of the queue. The transport sent
+	 * a T_CONN_IND message for each pending connection to the STREAM head.
+	 */
+	so_acceptq_flush(so);
+
+	mutex_enter(&so->so_lock);
+	so->so_state |= SS_FALLBACK_COMP;
+	mutex_exit(&so->so_lock);
+
+	/*
+	 * Swap the sonode ops. Socket operations that come in once this
+	 * is done will proceed without blocking.
+	 */
+	so->so_ops = &sotpi_sonodeops;
+
+	/*
+	 * Wake up any threads stuck in poll. This is needed since the poll
+	 * head changes when the fallback happens (moves from the sonode to
+	 * the STREAMS head).
+	 */
+	pollwakeup(&so->so_poll_list, POLLERR);
+out:
+	so_end_fallback(so);
+
+	return (error);
+}
diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_vnops.c b/usr/src/uts/common/fs/sockfs/sockcommon_vnops.c
new file mode 100644
index 0000000000..ffcecfa7c1
--- /dev/null
+++ b/usr/src/uts/common/fs/sockfs/sockcommon_vnops.c
@@ -0,0 +1,482 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <sys/t_lock.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bitmap.h>
+#include <sys/debug.h>
+#include <sys/errno.h>
+#include <sys/strsubr.h>
+#include <sys/cmn_err.h>
+#include <sys/sysmacros.h>
+#include <sys/filio.h>
+#include <sys/flock.h>
+#include <sys/stat.h>
+#include <sys/share.h>
+
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+
+#include <sys/sockio.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/strsun.h>
+
+#include <fs/sockfs/sockcommon.h>
+#include <fs/sockfs/socktpi.h>
+
+/*
+ * Generic vnode ops
+ */
+static int	socket_vop_open(struct vnode **, int, struct cred *,
+		    caller_context_t *);
+static int	socket_vop_close(struct vnode *, int, int, offset_t,
+		    struct cred *, caller_context_t *);
+static int	socket_vop_read(struct vnode *, struct uio *, int,
+		    struct cred *, caller_context_t *);
+static int	socket_vop_write(struct vnode *, struct uio *, int,
+		    struct cred *, caller_context_t *);
+static int	socket_vop_ioctl(struct vnode *, int, intptr_t, int,
+		    struct cred *, int32_t *, caller_context_t *);
+static int	socket_vop_setfl(struct vnode *, int, int, cred_t *,
+		    caller_context_t *);
+static int	socket_vop_getattr(struct vnode *, struct vattr *, int,
+		    struct cred *, caller_context_t *);
+static int	socket_vop_setattr(struct vnode *, struct vattr *, int,
+		    struct cred *, caller_context_t *);
+static int	socket_vop_access(struct vnode *, int, int, struct cred *,
+		    caller_context_t *);
+static int	socket_vop_fsync(struct vnode *, int, struct cred *,
+		    caller_context_t *);
+static void	socket_vop_inactive(struct vnode *, struct cred *,
+		    caller_context_t *);
+static int	socket_vop_fid(struct vnode *, struct fid *,
+		    caller_context_t *);
+static int	socket_vop_seek(struct vnode *, offset_t, offset_t *,
+		    caller_context_t *);
+static int	socket_vop_poll(struct vnode *, short, int, short *,
+		    struct pollhead **, caller_context_t *);
+
+extern int	socket_close_internal(struct sonode *, int, cred_t *);
+extern void	socket_destroy_internal(struct sonode *, cred_t *);
+/*
+ * Template naming the socket_vop_* handler for each vnode operation;
+ * VOPNAME_DISPOSE is mapped to fs_error.  socket_vnodeops is presumably
+ * built from this template elsewhere - nothing in this part of the file
+ * assigns it.
+ */
+struct vnodeops *socket_vnodeops;
+const fs_operation_def_t socket_vnodeops_template[] = {
+	VOPNAME_OPEN,		{ .vop_open = socket_vop_open },
+	VOPNAME_CLOSE,		{ .vop_close = socket_vop_close },
+	VOPNAME_READ,		{ .vop_read = socket_vop_read },
+	VOPNAME_WRITE,		{ .vop_write = socket_vop_write },
+	VOPNAME_IOCTL,		{ .vop_ioctl = socket_vop_ioctl },
+	VOPNAME_SETFL,		{ .vop_setfl = socket_vop_setfl },
+	VOPNAME_GETATTR,	{ .vop_getattr = socket_vop_getattr },
+	VOPNAME_SETATTR,	{ .vop_setattr = socket_vop_setattr },
+	VOPNAME_ACCESS,		{ .vop_access = socket_vop_access },
+	VOPNAME_FSYNC,		{ .vop_fsync = socket_vop_fsync },
+	VOPNAME_INACTIVE,	{ .vop_inactive = socket_vop_inactive },
+	VOPNAME_FID,		{ .vop_fid = socket_vop_fid },
+	VOPNAME_SEEK,		{ .vop_seek = socket_vop_seek },
+	VOPNAME_POLL,		{ .vop_poll = socket_vop_poll },
+	VOPNAME_DISPOSE,	{ .error = fs_error },
+	NULL,			NULL
+};
+
+
+/*
+ * generic vnode ops
+ */
+/*
+ * Open: increment so_count under so_lock.  Always returns 0.
+ */
+/*ARGSUSED*/
+static int
+socket_vop_open(struct vnode **vpp, int flag, struct cred *cr,
+    caller_context_t *ct)
+{
+	struct vnode *vp = *vpp;
+	struct sonode *so = VTOSO(vp);
+
+	flag &= ~FCREAT;	/* paranoia */
+	mutex_enter(&so->so_lock);
+	so->so_count++;
+	mutex_exit(&so->so_lock);
+
+	ASSERT(so->so_count != 0);	/* wraparound */
+	ASSERT(vp->v_type == VSOCK);
+
+	return (0);
+}
+/*
+ * Close: clean up the calling process's locks and shares on the vnode
+ * (and its STREAMS state when v_stream is set).  When count > 1 nothing
+ * more is done.  Otherwise so_count is decremented and, when it reaches
+ * zero, the result of socket_close_internal() is returned.
+ */
+/*ARGSUSED*/
+static int
+socket_vop_close(struct vnode *vp, int flag, int count, offset_t offset,
+    struct cred *cr, caller_context_t *ct)
+{
+	struct sonode *so;
+	int error = 0;
+
+	so = VTOSO(vp);
+	ASSERT(vp->v_type == VSOCK);
+
+	cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
+	cleanshares(vp, ttoproc(curthread)->p_pid);
+
+	if (vp->v_stream)
+		strclean(vp);
+
+	if (count > 1) {
+		dprint(2, ("socket_vop_close: count %d\n", count));
+		return (0);
+	}
+
+	mutex_enter(&so->so_lock);
+	if (--so->so_count == 0) {
+		/*
+		 * Initiate connection shutdown.
+		 */
+		mutex_exit(&so->so_lock);
+		error = socket_close_internal(so, flag, cr);
+	} else {
+		mutex_exit(&so->so_lock);
+	}
+
+	return (error);
+}
+/*
+ * Read: hand the uio to socket_recvmsg() with a zeroed nmsghdr.
+ */
+/*ARGSUSED2*/
+static int
+socket_vop_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr,
+    caller_context_t *ct)
+{
+	struct sonode *so = VTOSO(vp);
+	struct nmsghdr lmsg;
+
+	ASSERT(vp->v_type == VSOCK);
+	bzero((void *)&lmsg, sizeof (lmsg));
+
+	return (socket_recvmsg(so, &lmsg, uiop, cr));
+}
+/*
+ * Write: hand the uio to socket_sendmsg() with an otherwise zeroed
+ * nmsghdr; MSG_EOR is set for sockets that are not SM_BYTESTREAM.
+ */
+/*ARGSUSED2*/
+static int
+socket_vop_write(struct vnode *vp, struct uio *uiop, int ioflag,
+    struct cred *cr, caller_context_t *ct)
+{
+	struct sonode *so = VTOSO(vp);
+	struct nmsghdr lmsg;
+
+	ASSERT(vp->v_type == VSOCK);
+	bzero((void *)&lmsg, sizeof (lmsg));
+
+	if (!(so->so_mode & SM_BYTESTREAM)) {
+		/*
+		 * If the socket is not byte stream set MSG_EOR
+		 */
+		lmsg.msg_flags = MSG_EOR;
+	}
+
+	return (socket_sendmsg(so, &lmsg, uiop, cr));
+}
+/*
+ * Ioctl: forward the request unchanged to socket_ioctl().
+ */
+/*ARGSUSED4*/
+static int
+socket_vop_ioctl(struct vnode *vp, int cmd, intptr_t arg, int mode,
+    struct cred *cr, int32_t *rvalp, caller_context_t *ct)
+{
+	struct sonode *so = VTOSO(vp);
+
+	ASSERT(vp->v_type == VSOCK);
+
+	return (socket_ioctl(so, cmd, arg, mode, cr, rvalp));
+}
+
+/*
+ * Allow any flags. Record FNDELAY and FNONBLOCK so that they can be inherited
+ * from listener to acceptor.
+ *
+ * Returns 0, or the result of the FIOASYNC ioctl when the FASYNC setting
+ * has to be changed.
+ */
+/* ARGSUSED */
+static int
+socket_vop_setfl(vnode_t *vp, int oflags, int nflags, cred_t *cr,
+    caller_context_t *ct)
+{
+	struct sonode *so = VTOSO(vp);
+	int error = 0;
+
+	ASSERT(vp->v_type == VSOCK);
+
+	mutex_enter(&so->so_lock);
+	if (nflags & FNDELAY)
+		so->so_state |= SS_NDELAY;
+	else
+		so->so_state &= ~SS_NDELAY;
+	if (nflags & FNONBLOCK)
+		so->so_state |= SS_NONBLOCK;
+	else
+		so->so_state &= ~SS_NONBLOCK;
+	mutex_exit(&so->so_lock);
+
+	if (so->so_state & SS_ASYNC)
+		oflags |= FASYNC;
+	/*
+	 * Sets/clears the SS_ASYNC flag based on the presence/absence
+	 * of the FASYNC flag passed to fcntl(F_SETFL).
+	 * This exists solely for BSD fcntl() FASYNC compatibility.
+	 *
+	 * FASYNC was forced on in oflags above when SS_ASYNC is set, and
+	 * that read of so_state happens after so_lock was dropped.
+	 */
+	if ((oflags ^ nflags) & FASYNC && so->so_version != SOV_STREAM) {
+		int async = nflags & FASYNC;
+		int32_t rv;
+
+		/*
+		 * For non-TPI sockets all we have to do is set/remove the
+		 * SS_ASYNC bit, but for TPI it is more involved. For that
+		 * reason we delegate the job to the protocol's ioctl handler.
+		 */
+		error = socket_ioctl(so, FIOASYNC, (intptr_t)&async, FKIOCTL,
+		    cr, &rv);
+	}
+	return (error);
+}
+
+
+/*
+ * Get the made up attributes for the vnode.
+ * 4.3BSD returns the current time for all the timestamps.
+ * 4.4BSD returns 0 for all the timestamps.
+ * Here we use the access and modified times recorded in the sonode.
+ *
+ * Just like in BSD there is no effect on the underlying file system node
+ * bound to an AF_UNIX pathname.
+ *
+ * When sockmod has been popped this will act just like a stream. Since
+ * a socket is always a clone there is no need to inspect the attributes
+ * of the "realvp".
+ *
+ * Always returns 0.
+ *
+ * NOTE(review): the forward declaration of this function is static but the
+ * definition omits the keyword; consider adding it for consistency.
+ */
+/* ARGSUSED */
+int
+socket_vop_getattr(struct vnode *vp, struct vattr *vap, int flags,
+    struct cred *cr, caller_context_t *ct)
+{
+	dev_t		fsid;
+	struct sonode	*so;
+	static int	sonode_shift = 0;	/* computed on first call */
+
+	/*
+	 * Calculate the amount of bitshift to a sonode pointer which will
+	 * still keep it unique.  See below.
+	 */
+	if (sonode_shift == 0)
+		sonode_shift = highbit(sizeof (struct sonode));
+	ASSERT(sonode_shift > 0);
+
+	so = VTOSO(vp);
+	fsid = sockdev;
+
+	if (so->so_version == SOV_STREAM) {
+		/*
+		 * The imaginary "sockmod" has been popped - act
+		 * as a stream
+		 */
+		vap->va_type = VCHR;
+		vap->va_mode = 0;
+	} else {
+		vap->va_type = vp->v_type;
+		vap->va_mode = S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|
+		    S_IROTH|S_IWOTH;
+	}
+	vap->va_uid = vap->va_gid = 0;
+	vap->va_fsid = fsid;
+	/*
+	 * If the va_nodeid is > MAX_USHORT, then i386 stats might fail.
+	 * So we shift down the sonode pointer to try and get the most
+	 * uniqueness into 16-bits.
+	 */
+	vap->va_nodeid = ((ino_t)so >> sonode_shift) & 0xFFFF;
+	vap->va_nlink = 0;
+	vap->va_size = 0;
+
+	/*
+	 * We need to zero out the va_rdev to avoid some fstats getting
+	 * EOVERFLOW.  This also mimics SunOS 4.x and BSD behavior.
+	 */
+	vap->va_rdev = (dev_t)0;
+	vap->va_blksize = MAXBSIZE;
+	vap->va_nblocks = btod(vap->va_size);
+
+	if (!SOCK_IS_NONSTR(so)) {
+		sotpi_info_t *sti = SOTOTPI(so);
+
+		mutex_enter(&so->so_lock);
+		vap->va_atime.tv_sec = sti->sti_atime;
+		vap->va_mtime.tv_sec = sti->sti_mtime;
+		vap->va_ctime.tv_sec = sti->sti_ctime;
+		mutex_exit(&so->so_lock);
+	} else {
+		vap->va_atime.tv_sec = 0;
+		vap->va_mtime.tv_sec = 0;
+		vap->va_ctime.tv_sec = 0;
+	}
+
+	vap->va_atime.tv_nsec = 0;
+	vap->va_mtime.tv_nsec = 0;
+	vap->va_ctime.tv_nsec = 0;
+	vap->va_seq = 0;
+
+	return (0);
+}
+
+/*
+ * Set attributes.
+ * Just like in BSD there is no effect on the underlying file system node
+ * bound to an AF_UNIX pathname.
+ *
+ * When sockmod has been popped this will act just like a stream. Since
+ * a socket is always a clone there is no need to modify the attributes
+ * of the "realvp".
+ *
+ * Only AT_ATIME and AT_MTIME are acted upon; everything else in va_mask is
+ * ignored.  Always returns 0.
+ *
+ * NOTE(review): the forward declaration of this function is static but the
+ * definition omits the keyword; consider adding it for consistency.
+ */
+/* ARGSUSED */
+int
+socket_vop_setattr(struct vnode *vp, struct vattr *vap, int flags,
+    struct cred *cr, caller_context_t *ct)
+{
+	struct sonode *so = VTOSO(vp);
+
+	/*
+	 * If times were changed, and we have a STREAMS socket, then update
+	 * the sonode.
+	 */
+	if (!SOCK_IS_NONSTR(so)) {
+		sotpi_info_t *sti = SOTOTPI(so);
+
+		mutex_enter(&so->so_lock);
+		if (vap->va_mask & AT_ATIME)
+			sti->sti_atime = vap->va_atime.tv_sec;
+		if (vap->va_mask & AT_MTIME) {
+			sti->sti_mtime = vap->va_mtime.tv_sec;
+			sti->sti_ctime = gethrestime_sec();
+		}
+		mutex_exit(&so->so_lock);
+	}
+
+	return (0);
+}
+
+/*
+ * Check if user is allowed to access vp. For non-STREAMS based sockets,
+ * there might not be a device attached to the file system. So for those
+ * types of sockets there are no permissions to check.
+ *
+ * XXX Should there be some other mechanism to check access rights?
+ *
+ * For STREAMS based sockets the check is delegated to VOP_ACCESS() on the
+ * device vnode recorded in the socket's sockparams entry; every other
+ * socket is granted access unconditionally (0 is returned).
+ */
+/*ARGSUSED*/
+int
+socket_vop_access(struct vnode *vp, int mode, int flags, struct cred *cr,
+    caller_context_t *ct)
+{
+	struct sonode *so = VTOSO(vp);
+
+	if (!SOCK_IS_NONSTR(so)) {
+		ASSERT(so->so_sockparams->sp_sdev_info.sd_vnode != NULL);
+		return (VOP_ACCESS(so->so_sockparams->sp_sdev_info.sd_vnode,
+		    mode, flags, cr, NULL));
+	}
+	return (0);
+}
+
+/*
+ * 4.3BSD and 4.4BSD fail a fsync on a socket with EINVAL.
+ * This code does the same to be compatible and also to not give an
+ * application the impression that the data has actually been "synced"
+ * to the other end of the connection.
+ */
+/* ARGSUSED */
+int
+socket_vop_fsync(struct vnode *vp, int syncflag, struct cred *cr,
+    caller_context_t *ct)
+{
+	return (EINVAL);
+}
+
+/*ARGSUSED*/
+static void
+socket_vop_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ct)
+{
+	struct sonode *so = VTOSO(vp);
+
+	ASSERT(vp->v_type == VSOCK);
+
+	mutex_enter(&vp->v_lock);
+	/*
+	 * If no one has reclaimed the vnode, remove from the
+	 * cache now.
+	 */
+	if (vp->v_count < 1)
+		cmn_err(CE_PANIC, "socket_inactive: Bad v_count");
+
+	/*
+	 * Drop the temporary hold by vn_rele now
+	 */
+	if (--vp->v_count != 0) {	/* other holds remain; keep the sonode */
+		mutex_exit(&vp->v_lock);
+		return;
+	}
+	mutex_exit(&vp->v_lock);
+
+
+	ASSERT(!vn_has_cached_data(vp));
+
+	/* socket specific clean-up; reached only when v_count dropped to 0 */
+	socket_destroy_internal(so, cr);
+}
+
+/* ARGSUSED */
+int
+socket_vop_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
+{
+	return (EINVAL);	/* no file identifier is produced for a socket */
+}
+
+/*
+ * Sockets are not seekable.
+ * (and there is a bug to fix STREAMS to make them fail this as well).
+ */
+/*ARGSUSED*/
+int
+socket_vop_seek(struct vnode *vp, offset_t ooff, offset_t *noffp,
+    caller_context_t *ct)
+{
+	return (ESPIPE);	/* unconditional; *noffp is never written */
+}
+
+/*ARGSUSED*/
+static int
+socket_vop_poll(struct vnode *vp, short events, int anyyet, short *reventsp,
+    struct pollhead **phpp, caller_context_t *ct)
+{
+	struct sonode *so = VTOSO(vp);
+
+	ASSERT(vp->v_type == VSOCK);
+
+	/* all poll handling is delegated to socket_poll() on the sonode */
+	return (socket_poll(so, events, anyyet, reventsp, phpp));
+}
diff --git a/usr/src/uts/common/fs/sockfs/socknotify.c b/usr/src/uts/common/fs/sockfs/socknotify.c
new file mode 100644
index 0000000000..788efa9ff5
--- /dev/null
+++ b/usr/src/uts/common/fs/sockfs/socknotify.c
@@ -0,0 +1,379 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/stropts.h>
+#include <sys/socketvar.h>
+#include <sys/ksocket.h>
+#include <io/ksocket/ksocket_impl.h>
+#include <fs/sockfs/sockcommon.h>
+
+/*
+ * There can only be a single thread waiting for data (enforced by
+ * so_lock_read()), whereas for write there might be multiple threads
+ * waiting for transmit buffers. So therefore we use cv_broadcast for
+ * write and cv_signal for read.
+ *
+ * Each macro clears the corresponding so_*_wakeup flag before signalling,
+ * and does nothing when the flag is not set.  Both expand to a bare brace
+ * block rather than do { } while (0), so they must be used as complete
+ * statements and never as the body of an "if" that has an "else".
+ */
+#define	SO_WAKEUP_READER(so) {				\
+	if ((so)->so_rcv_wakeup) {			\
+		(so)->so_rcv_wakeup = B_FALSE;		\
+		cv_signal(&(so)->so_rcv_cv);		\
+	}						\
+}
+
+#define	SO_WAKEUP_WRITER(so) {				\
+	if ((so)->so_snd_wakeup) {			\
+		(so)->so_snd_wakeup = B_FALSE;		\
+		cv_broadcast(&(so)->so_snd_cv);		\
+	}						\
+}
+
+static int i_so_notify_last_rx(struct sonode *, int *, int *);
+static int i_so_notify_last_tx(struct sonode *, int *, int *);
+
+/*
+ * The notification functions must be called with so_lock held,
+ * and they will all *drop* so_lock before returning.
+ */
+
+/*
+ * Wake up anyone waiting for the connection to be established.
+ *
+ * Kernel sockets get the "connected" ksocket callback; all other sockets
+ * get SOCKETSIG_WRITE and a POLLOUT wakeup.
+ */
+void
+so_notify_connected(struct sonode *so)
+{
+	ASSERT(MUTEX_HELD(&so->so_lock));
+
+	if (IS_KERNEL_SOCKET(so)) {
+		KSOCKET_CALLBACK(so, connected, 0);
+		mutex_exit(&so->so_lock);
+	} else {
+		socket_sendsig(so, SOCKETSIG_WRITE);
+		mutex_exit(&so->so_lock);
+		pollwakeup(&so->so_poll_list, POLLOUT);	/* after lock drop */
+	}
+
+	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
+}
+
+/*
+ * The socket is disconnecting, so no more data can be sent. Wake up
+ * anyone that is waiting to send data.
+ *
+ * For non-kernel sockets the signal and poll wakeup are issued only when
+ * i_so_notify_last_tx() reports that the final write notification has not
+ * been sent yet.
+ */
+void
+so_notify_disconnecting(struct sonode *so)
+{
+	int pollev = 0;
+	int sigev = 0;
+
+	ASSERT(MUTEX_HELD(&so->so_lock));
+
+	if (IS_KERNEL_SOCKET(so)) {
+		SO_WAKEUP_WRITER(so);
+		KSOCKET_CALLBACK(so, cantsendmore, 0);
+		mutex_exit(&so->so_lock);
+	} else if (i_so_notify_last_tx(so, &pollev, &sigev)) {
+		socket_sendsig(so, sigev);
+		mutex_exit(&so->so_lock);
+		pollwakeup(&so->so_poll_list, pollev);
+	} else {
+		mutex_exit(&so->so_lock);
+	}
+
+	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
+}
+
+/*
+ * The socket is disconnected, so no more data can be sent or received.
+ * Wake up anyone that is waiting to send or receive data.
+ *
+ * The last-tx and last-rx notifications are made for every socket; the
+ * error argument is passed on only to the kernel socket callback.
+ */
+void
+so_notify_disconnected(struct sonode *so, int error)
+{
+	int pollev = 0;
+	int sigev = 0;
+
+	ASSERT(MUTEX_HELD(&so->so_lock));
+
+	(void) i_so_notify_last_tx(so, &pollev, &sigev);
+	(void) i_so_notify_last_rx(so, &pollev, &sigev);
+
+	if (IS_KERNEL_SOCKET(so)) {
+		KSOCKET_CALLBACK(so, disconnected, error);
+		mutex_exit(&so->so_lock);
+	} else {
+		if (sigev != 0)
+			socket_sendsig(so, sigev);
+		mutex_exit(&so->so_lock);
+		if (pollev != 0)
+			pollwakeup(&so->so_poll_list, pollev);
+	}
+
+	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
+}
+
+/*
+ * The socket is writeable. Wake up anyone waiting to send data.
+ */
+void
+so_notify_writable(struct sonode *so)
+{
+	ASSERT(MUTEX_HELD(&so->so_lock));
+
+	SO_WAKEUP_WRITER(so);
+
+	if (IS_KERNEL_SOCKET(so)) {
+		KSOCKET_CALLBACK(so, cansend, 0);
+		mutex_exit(&so->so_lock);
+	} else {
+		socket_sendsig(so, SOCKETSIG_WRITE);
+		mutex_exit(&so->so_lock);
+		pollwakeup(&so->so_poll_list, POLLOUT);
+	}
+
+	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
+}
+
+/*
+ * Data is available, so wake up anyone waiting for data.
+ */ +void +so_notify_data(struct sonode *so, size_t qlen) +{ + ASSERT(MUTEX_HELD(&so->so_lock)); + + SO_WAKEUP_READER(so); + + if (IS_KERNEL_SOCKET(so)) { + KSOCKET_CALLBACK(so, newdata, qlen); + mutex_exit(&so->so_lock); + } else { + socket_sendsig(so, SOCKETSIG_READ); + if (so->so_pollev & (SO_POLLEV_IN|SO_POLLEV_ALWAYS)) { + so->so_pollev &= ~SO_POLLEV_IN; + mutex_exit(&so->so_lock); + pollwakeup(&so->so_poll_list, POLLIN|POLLRDNORM); + } else { + mutex_exit(&so->so_lock); + } + } + + ASSERT(MUTEX_NOT_HELD(&so->so_lock)); +} + +/* + * Transient error. Wake up anyone waiting to send or receive data. + */ +void +so_notify_error(struct sonode *so) +{ + ASSERT(MUTEX_HELD(&so->so_lock)); + + SO_WAKEUP_WRITER(so); + SO_WAKEUP_READER(so); + + if (IS_KERNEL_SOCKET(so)) { + KSOCKET_CALLBACK(so, error, 0); + mutex_exit(&so->so_lock); + } else { + socket_sendsig(so, SOCKETSIG_WRITE|SOCKETSIG_READ); + so->so_pollev &= ~SO_POLLEV_IN; + mutex_exit(&so->so_lock); + pollwakeup(&so->so_poll_list, POLLOUT|POLLIN|POLLRDNORM); + } + + ASSERT(MUTEX_NOT_HELD(&so->so_lock)); +} + +/* + * Out-of-band data is incoming, notify any interested parties. + */ +void +so_notify_oobsig(struct sonode *so) +{ + socket_sendsig(so, SOCKETSIG_URG); + mutex_exit(&so->so_lock); + pollwakeup(&so->so_poll_list, POLLRDBAND); +} + +/* + * Received out-of-band data. If the OOB data is delivered inline, then + * in addition of regular OOB notification, anyone waiting for normal + * data is also notified. 
+ */ +void +so_notify_oobdata(struct sonode *so, boolean_t oob_inline) +{ + ASSERT(MUTEX_HELD(&so->so_lock)); + SOD_UIOAFINI(so->so_direct); + + if (IS_KERNEL_SOCKET(so)) { + if (oob_inline) + SO_WAKEUP_READER(so); + KSOCKET_CALLBACK(so, oobdata, 0); + mutex_exit(&so->so_lock); + } else { + if (oob_inline) { + socket_sendsig(so, SOCKETSIG_READ); + so->so_pollev &= ~SO_POLLEV_IN; + mutex_exit(&so->so_lock); + pollwakeup(&so->so_poll_list, + POLLRDBAND|POLLIN|POLLRDNORM); + + SO_WAKEUP_READER(so); + } else { + mutex_exit(&so->so_lock); + pollwakeup(&so->so_poll_list, POLLRDBAND); + } + } + + ASSERT(MUTEX_NOT_HELD(&so->so_lock)); +} + +/* + * End-of-file has been reach, so peer will send no new data. Wake up + * anyone that is waiting for data. + */ +void +so_notify_eof(struct sonode *so) +{ + int pollev = 0; + int sigev = 0; + + ASSERT(MUTEX_HELD(&so->so_lock)); + + (void) i_so_notify_last_rx(so, &pollev, &sigev); + + if (IS_KERNEL_SOCKET(so)) { + SO_WAKEUP_READER(so); + KSOCKET_CALLBACK(so, cantrecvmore, 0); + mutex_exit(&so->so_lock); + } else { + if (sigev != 0) + socket_sendsig(so, sigev); + mutex_exit(&so->so_lock); + if (pollev != 0) + pollwakeup(&so->so_poll_list, pollev); + + } + + ASSERT(MUTEX_NOT_HELD(&so->so_lock)); +} + +/* + * Wake up anyone waiting for a new connection. + */ +void +so_notify_newconn(struct sonode *so) +{ + ASSERT(MUTEX_HELD(&so->so_lock)); + + if (IS_KERNEL_SOCKET(so)) { + KSOCKET_CALLBACK(so, newconn, so->so_rcv_queued); + mutex_exit(&so->so_lock); + } else { + socket_sendsig(so, SOCKETSIG_READ); + if (so->so_pollev & (SO_POLLEV_IN|SO_POLLEV_ALWAYS)) { + so->so_pollev &= ~SO_POLLEV_IN; + mutex_exit(&so->so_lock); + pollwakeup(&so->so_poll_list, POLLIN|POLLRDNORM); + } else { + mutex_exit(&so->so_lock); + } + } + + ASSERT(MUTEX_NOT_HELD(&so->so_lock)); +} + +/* + * User initated shutdown/close, wake anyone that is trying to do + * an operation that is no longer possible. 
+ *
+ * At least one of SS_CANTSENDMORE and SS_CANTRCVMORE must already be set
+ * in so_state; the matching last-tx/last-rx notification is made for each.
+ */
+void
+so_notify_shutdown(struct sonode *so)
+{
+	int pollev = 0;
+	int sigev = 0;
+
+	ASSERT(MUTEX_HELD(&so->so_lock));
+	ASSERT(so->so_state & (SS_CANTSENDMORE|SS_CANTRCVMORE));
+
+	if (so->so_state & SS_CANTSENDMORE)
+		(void) i_so_notify_last_tx(so, &pollev, &sigev);
+	if (so->so_state & SS_CANTRCVMORE)
+		(void) i_so_notify_last_rx(so, &pollev, &sigev);
+
+	if (sigev != 0)
+		socket_sendsig(so, sigev);
+	mutex_exit(&so->so_lock);
+	if (pollev != 0)
+		pollwakeup(&so->so_poll_list, pollev);
+
+	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
+}
+
+/*
+ * No more data will be coming in, and this will be the last notification
+ * made.
+ *
+ * If SS_SENTLASTREADSIG is not yet set: cancel the socket timer, wake the
+ * reader, set the flag, clear SO_POLLEV_IN, OR POLLIN|POLLRDNORM into
+ * *pollev and SOCKETSIG_READ into *sigev, and return 1.  Otherwise return 0
+ * and leave *pollev and *sigev untouched.  so_lock is not dropped here.
+ */
+static int
+i_so_notify_last_rx(struct sonode *so, int *pollev, int *sigev)
+{
+	if (!(so->so_state & SS_SENTLASTREADSIG)) {
+		SOCKET_TIMER_CANCEL(so);
+		SO_WAKEUP_READER(so);
+		so->so_state |= SS_SENTLASTREADSIG;
+		so->so_pollev &= ~SO_POLLEV_IN;
+
+		*pollev |= POLLIN|POLLRDNORM;
+		*sigev |= SOCKETSIG_READ;
+
+		return (1);
+	} else {
+		return (0);
+	}
+}
+
+/*
+ * The socket is un-writeable. Make one last notification.
+ *
+ * If SS_SENTLASTWRITESIG is not yet set: wake the writers, set the flag,
+ * OR POLLOUT into *pollev and SOCKETSIG_WRITE into *sigev, and return 1.
+ * Otherwise return 0 and leave *pollev and *sigev untouched.
+ */
+static int
+i_so_notify_last_tx(struct sonode *so, int *pollev, int *sigev)
+{
+	if (!(so->so_state & SS_SENTLASTWRITESIG)) {
+		SO_WAKEUP_WRITER(so);
+		so->so_state |= SS_SENTLASTWRITESIG;
+
+		*pollev |= POLLOUT;
+		*sigev |= SOCKETSIG_WRITE;
+
+		return (1);
+	} else {
+		return (0);
+	}
+}
diff --git a/usr/src/uts/common/fs/sockfs/sockparams.c b/usr/src/uts/common/fs/sockfs/sockparams.c
new file mode 100644
index 0000000000..2e1d11c64e
--- /dev/null
+++ b/usr/src/uts/common/fs/sockfs/sockparams.c
@@ -0,0 +1,723 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <sys/t_lock.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/cmn_err.h>
+#include <sys/list.h>
+
+#include <sys/stropts.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+
+#include <fs/sockfs/sockcommon.h>
+#include <fs/sockfs/socktpi.h>
+
+/*
+ * Socket Parameters
+ *
+ * Socket parameter (struct sockparams) entries represent the socket types
+ * available on the system.
+ *
+ * Flags (sp_flags):
+ *
+ * SOCKPARAMS_EPHEMERAL: A temporary sockparams entry that will be deleted
+ * as soon as its ref count drops to zero. In addition, ephemeral entries will
+ * never be hooked onto the global sockparams list. Ephemeral entries are
+ * created when an application requests to create a socket using an
+ * application supplied device path, or when a socket is falling back to TPI.
+ *
+ * Lock order:
+ *   The lock order is splist_lock -> sp_lock.
+ *   The lock order is sp_ephem_lock -> sp_lock.
+ */
+extern int	kobj_path_exists(char *, int);
+extern void	nl7c_init(void);
+extern int	sockfs_defer_nl7c_init;
+
+static int	sockparams_sdev_init(struct sockparams *, char *, int);
+static void	sockparams_sdev_fini(struct sockparams *);
+
+/*
+ * Global sockparams list (populated via soconfig(1M)).
+ * The list is protected by splist_lock.
+ */
+static list_t sphead;
+static krwlock_t splist_lock;
+
+/*
+ * List of ephemeral sockparams.
+ * The list is protected by sp_ephem_lock.
+ */
+static list_t sp_ephem_list;
+static krwlock_t sp_ephem_lock;
+
+/*
+ * Match criteria used by sockparams_find()
+ */
+typedef enum sp_match_criteria {
+	SP_MATCH_EXACT,		/* family, type & proto must match */
+	SP_MATCH_WILDCARD,	/* family & type must match, proto can be 0 */
+	SP_MATCH_INC_DEV,	/* same as exact, but dev must also match */
+	SP_MATCH_INC_MOD	/* same as exact, but mod must also match */
+} sp_match_criteria_t;
+
+/* Create the global and ephemeral sockparams lists and their rwlocks. */
+void
+sockparams_init(void)
+{
+	list_create(&sphead, sizeof (struct sockparams),
+	    offsetof(struct sockparams, sp_node));
+	list_create(&sp_ephem_list, sizeof (struct sockparams),
+	    offsetof(struct sockparams, sp_node));
+
+	rw_init(&splist_lock, NULL, RW_DEFAULT, NULL);
+	rw_init(&sp_ephem_lock, NULL, RW_DEFAULT, NULL);
+}
+
+/*
+ * sockparams_create(int family, int type, int protocol, char *modname,
+ *     char *devpath, int devpathlen, int flags, int kmflags, int *errorp)
+ *
+ * Create a new sockparams entry.
+ *
+ * Arguments:
+ *   family, type, protocol: specifies the socket type
+ *   modname: Name of the module associated with the socket type. The
+ *            module can be NULL if a device path is given, in which
+ *            case the TPI module is used.
+ *   devpath: Path to the STREAMS device. May be NULL for non-STREAMS
+ *            based transports, or those transports that do not provide
+ *            the capability to fallback to STREAMS.
+ *   devpathlen: Length of the devpath string. The argument can be 0,
+ *            indicating that devpath was allocated statically, and should
+ *            not be freed when the sockparams entry is destroyed.
+ *
+ *   flags  : SOCKPARAMS_EPHEMERAL is the only flag that is allowed.
+ *   kmflags: KM_{NO,}SLEEP
+ *   errorp : Value-return argument, set when an error occurs.
+ *
+ * Returns:
+ *   On success a new sockparams entry is returned, and *errorp is set
+ *   to 0. On failure NULL is returned and *errorp is set to indicate the
+ *   type of error that occurred.
+ *
+ * Notes:
+ *   devpath and modname are freed upon failure (devpath only when
+ *   devpathlen is non-zero).
+ */
+struct sockparams *
+sockparams_create(int family, int type, int protocol, char *modname,
+    char *devpath, int devpathlen, int flags, int kmflags, int *errorp)
+{
+	struct sockparams *sp = NULL;
+	size_t size;
+
+	ASSERT((flags & ~SOCKPARAMS_EPHEMERAL) == 0);
+	if (flags & ~SOCKPARAMS_EPHEMERAL) {	/* still checked on non-DEBUG */
+		*errorp = EINVAL;
+		goto error;
+	}
+
+	/* either a module or device must be given */
+	if (modname == NULL && devpath == NULL) {
+		*errorp = EINVAL;
+		goto error;
+	}
+
+	sp = kmem_zalloc(sizeof (*sp), kmflags);
+	if (sp == NULL) {
+		*errorp = ENOMEM;
+		goto error;
+	}
+	sp->sp_family = family;
+	sp->sp_type = type;
+	sp->sp_protocol = protocol;
+	sp->sp_refcnt = 0;
+	sp->sp_flags = flags;
+
+	if (modname != NULL) {
+		sp->sp_smod_name = modname;	/* entry now owns the buffer */
+	} else {
+		/* no module given: default to a copy of SOTPI_SMOD_NAME */
+		size = strlen(SOTPI_SMOD_NAME) + 1;
+		modname = kmem_zalloc(size, kmflags);
+		if (modname == NULL) {
+			*errorp = ENOMEM;
+			goto error;
+		}
+		sp->sp_smod_name = modname;
+		(void) sprintf(sp->sp_smod_name, "%s", SOTPI_SMOD_NAME);
+	}
+
+	if (devpath != NULL) {
+		/* Set up the device entry. */
+		*errorp = sockparams_sdev_init(sp, devpath, devpathlen);
+		if (*errorp != 0)
+			goto error;
+	}
+
+	mutex_init(&sp->sp_lock, NULL, MUTEX_DEFAULT, NULL);
+	*errorp = 0;
+	return (sp);
+error:
+	/* sp_lock has not been initialized on any path that gets here */
+	ASSERT(*errorp != 0);
+	if (modname != NULL)
+		kmem_free(modname, strlen(modname) + 1);
+	if (devpathlen != 0)
+		kmem_free(devpath, devpathlen);
+	if (sp != NULL)
+		kmem_free(sp, sizeof (*sp));
+	return (NULL);
+}
+
+/*
+ * Initialize the STREAMS device aspect of the sockparams entry.
+ *
+ * Looks up the vnode for devpath with sogetvp() and records the vnode,
+ * the path pointer and its length in sp_sdev_info.  Returns 0 on success
+ * or the error from sogetvp(), in which case sp is left unmodified.
+ */
+static int
+sockparams_sdev_init(struct sockparams *sp, char *devpath, int devpathlen)
+{
+	vnode_t *vp = NULL;
+	int error;
+
+	ASSERT(devpath != NULL);
+
+	if ((error = sogetvp(devpath, &vp, UIO_SYSSPACE)) != 0) {
+		dprint(0, ("sockparams_sdev_init: vp %s failed with %d\n",
+		    devpath, error));
+		return (error);
+	}
+
+	ASSERT(vp != NULL);
+	sp->sp_sdev_info.sd_vnode = vp;
+	sp->sp_sdev_info.sd_devpath = devpath;
+	sp->sp_sdev_info.sd_devpathlen = devpathlen;
+
+	return (0);
+}
+
+/*
+ * sockparams_destroy(struct sockparams *sp)
+ *
+ * Releases all the resources associated with the sockparams entry,
+ * and frees the sockparams entry.
+ *
+ * Arguments:
+ *   sp: the sockparams entry to destroy.
+ *
+ * Returns:
+ *   Nothing.
+ *
+ * Locking:
+ *   The sp_lock of the entry can not be held.
+ *
+ * The entry must be unreferenced (sp_refcnt == 0) and must not be linked
+ * on a list.
+ */
+void
+sockparams_destroy(struct sockparams *sp)
+{
+	ASSERT(sp->sp_refcnt == 0);
+	ASSERT(!list_link_active(&sp->sp_node));
+
+	sockparams_sdev_fini(sp);
+
+	if (sp->sp_smod_info != NULL)
+		SMOD_DEC_REF(sp, sp->sp_smod_info);
+	kmem_free(sp->sp_smod_name, strlen(sp->sp_smod_name) + 1);
+	sp->sp_smod_name = NULL;
+	sp->sp_smod_info = NULL;
+	mutex_destroy(&sp->sp_lock);
+
+	kmem_free(sp, sizeof (*sp));
+}
+
+/*
+ * Clean up the STREAMS device part of the sockparams entry.
+ */
+static void
+sockparams_sdev_fini(struct sockparams *sp)
+{
+	sdev_info_t sd;
+
+	/*
+	 * if the entry does not have a STREAMS device, then there
+	 * is nothing to do.
+	 */
+	if (!SOCKPARAMS_HAS_DEVICE(sp))
+		return;
+
+	sd = sp->sp_sdev_info;
+	if (sd.sd_vnode != NULL)
+		VN_RELE(sd.sd_vnode);
+	if (sd.sd_devpathlen != 0)	/* 0 means the path is not ours to free */
+		kmem_free(sd.sd_devpath, sd.sd_devpathlen);
+
+	sp->sp_sdev_info.sd_vnode = NULL;
+	sp->sp_sdev_info.sd_devpath = NULL;
+}
+
+/*
+ * Look for a matching sockparams entry on the given list.
+ *
+ * The caller must hold the associated list lock.
+ */ +static struct sockparams * +sockparams_find(list_t *list, int family, int type, int protocol, + enum sp_match_criteria crit, const char *name) +{ + struct sockparams *sp; + struct sockparams *wild = NULL; + + for (sp = list_head(list); sp != NULL; sp = list_next(list, sp)) { + if (sp->sp_family == family && + sp->sp_type == type) { + + if (sp->sp_protocol == protocol) { + if (crit == SP_MATCH_EXACT || + crit == SP_MATCH_WILDCARD) + break; + else if (crit == SP_MATCH_INC_DEV && + sp->sp_sdev_info.sd_devpath != NULL && + strcmp(sp->sp_sdev_info.sd_devpath, + name) == 0) + break; + else if (crit == SP_MATCH_INC_MOD && + strcmp(sp->sp_smod_name, name) == 0) + break; + } else if (crit == SP_MATCH_WILDCARD && + sp->sp_protocol == 0) { + /* best match so far */ + wild = sp; + } + } + } + + return ((sp == NULL) ? wild : sp); +} + +/* + * sockparams_hold_ephemeral() + * + * Returns an ephemeral sockparams entry of the requested family, type and + * protocol. The entry is returned held, and the caller is responsible for + * dropping the reference using SOCKPARAMS_DEC_REF() once done. + * + * All ephemeral entries are on list (sp_ephem_list). If there is an + * entry on the list that match the search criteria, then a reference is + * placed on that entry. Otherwise, a new entry is created and inserted + * in the list. The entry is removed from the list when the last reference + * is dropped. + * + * The tpi flag is used to determine whether name refers to a device or + * module name. + */ +static struct sockparams * +sockparams_hold_ephemeral(int family, int type, int protocol, + const char *name, boolean_t tpi, int kmflag, int *errorp) +{ + struct sockparams *sp = NULL; + sp_match_criteria_t crit = (tpi) ? 
SP_MATCH_INC_DEV : SP_MATCH_INC_MOD; + + *errorp = 0; + + /* + * First look for an existing entry + */ + rw_enter(&sp_ephem_lock, RW_READER); + sp = sockparams_find(&sp_ephem_list, family, type, protocol, + crit, name); + if (sp != NULL) { + SOCKPARAMS_INC_REF(sp); + rw_exit(&sp_ephem_lock); + + return (sp); + } else { + struct sockparams *newsp = NULL; + char *namebuf = NULL; + int namelen = 0; + + rw_exit(&sp_ephem_lock); + + namelen = strlen(name) + 1; + namebuf = kmem_alloc(namelen, kmflag); + if (namebuf == NULL) { + *errorp = ENOMEM; + return (NULL); + } + + (void *)strncpy(namebuf, name, namelen); + if (tpi) { + newsp = sockparams_create(family, type, + protocol, NULL, namebuf, namelen, + SOCKPARAMS_EPHEMERAL, kmflag, errorp); + } else { + newsp = sockparams_create(family, type, + protocol, namebuf, NULL, 0, + SOCKPARAMS_EPHEMERAL, kmflag, errorp); + } + + if (newsp == NULL) { + ASSERT(*errorp != 0); + return (NULL); + } + + /* + * Time to load the socket module. + */ + ASSERT(newsp->sp_smod_info == NULL); + newsp->sp_smod_info = + smod_lookup_byname(newsp->sp_smod_name); + if (newsp->sp_smod_info == NULL) { + /* Failed to load */ + sockparams_destroy(newsp); + *errorp = ENXIO; + return (NULL); + } + + /* + * The sockparams entry was created, now try to add it + * to the list. We need to hold the lock as a WRITER. + */ + rw_enter(&sp_ephem_lock, RW_WRITER); + sp = sockparams_find(&sp_ephem_list, family, type, protocol, + crit, name); + if (sp != NULL) { + /* + * Someone has requested a matching entry, so just + * place a hold on it and release the entry we alloc'ed. 
+ */ + SOCKPARAMS_INC_REF(sp); + rw_exit(&sp_ephem_lock); + + sockparams_destroy(newsp); + } else { + SOCKPARAMS_INC_REF(newsp); + list_insert_tail(&sp_ephem_list, newsp); + rw_exit(&sp_ephem_lock); + + sp = newsp; + } + ASSERT(*errorp == 0); + + return (sp); + } +} + +struct sockparams * +sockparams_hold_ephemeral_bydev(int family, int type, int protocol, + const char *dev, int kmflag, int *errorp) +{ + return (sockparams_hold_ephemeral(family, type, protocol, dev, B_TRUE, + kmflag, errorp)); +} + +struct sockparams * +sockparams_hold_ephemeral_bymod(int family, int type, int protocol, + const char *mod, int kmflag, int *errorp) +{ + return (sockparams_hold_ephemeral(family, type, protocol, mod, B_FALSE, + kmflag, errorp)); +} + +/* + * Called when the last socket using the ephemeral entry is dropping + * its' reference. To maintain lock order we must drop the sockparams + * lock before calling this function. As a result, a new reference + * might be placed on the entry, in which case there is nothing to + * do. However, if ref count goes to zero, we delete the entry. + */ +void +sockparams_ephemeral_drop_last_ref(struct sockparams *sp) +{ + ASSERT(sp->sp_flags & SOCKPARAMS_EPHEMERAL); + ASSERT(MUTEX_NOT_HELD(&sp->sp_lock)); + + rw_enter(&sp_ephem_lock, RW_WRITER); + mutex_enter(&sp->sp_lock); + + if (--sp->sp_refcnt == 0) { + list_remove(&sp_ephem_list, sp); + mutex_exit(&sp->sp_lock); + rw_exit(&sp_ephem_lock); + + sockparams_destroy(sp); + } else { + mutex_exit(&sp->sp_lock); + rw_exit(&sp_ephem_lock); + } +} + +/* + * sockparams_add(struct sockparams *sp) + * + * Tries to add the given sockparams entry to the global list. + * + * Arguments: + * sp: the sockparms entry to add + * + * Returns: + * On success 0, but if an entry already exists, then EEXIST + * is returned. + * + * Locking: + * The caller can not be holding splist_lock. 
+ *   The existence check and the insertion are both done while holding
+ *   splist_lock as writer.
+ */
+static int
+sockparams_add(struct sockparams *sp)
+{
+	ASSERT(!(sp->sp_flags & SOCKPARAMS_EPHEMERAL));
+
+	rw_enter(&splist_lock, RW_WRITER);
+	if (sockparams_find(&sphead, sp->sp_family, sp->sp_type,
+	    sp->sp_protocol, SP_MATCH_EXACT, NULL) != 0) {
+		rw_exit(&splist_lock);
+		return (EEXIST);
+	} else {
+		list_insert_tail(&sphead, sp);
+		rw_exit(&splist_lock);
+		return (0);
+	}
+}
+
+/*
+ * sockparams_delete(int family, int type, int protocol)
+ *
+ * Deletes the sockparams entry for a specific family, type and protocol.
+ * The entry is removed from the list and destroyed only if no one is
+ * holding a reference to it; otherwise it is left on the list untouched.
+ *
+ * Arguments:
+ *   family, type, protocol: the socket type that should be removed.
+ *
+ * Returns:
+ *   On success 0. EBUSY if the entry is still referenced, and ENXIO if
+ *   no matching entry exists.
+ *
+ * Locking:
+ *   Caller can not be holding splist_lock or the sp_lock of
+ *   any sockparams entry.
+ */
+static int
+sockparams_delete(int family, int type, int protocol)
+{
+	struct sockparams *sp;
+
+	rw_enter(&splist_lock, RW_WRITER);
+	sp = sockparams_find(&sphead, family, type, protocol, SP_MATCH_EXACT,
+	    NULL);
+
+	if (sp != NULL) {
+		/*
+		 * If no one is holding a reference to the entry, then
+		 * we go ahead and remove it from the list and then
+		 * destroy it.
+		 */
+		mutex_enter(&sp->sp_lock);
+		if (sp->sp_refcnt != 0) {
+			mutex_exit(&sp->sp_lock);
+			rw_exit(&splist_lock);
+			return (EBUSY);
+		}
+		mutex_exit(&sp->sp_lock);
+		/* Delete the sockparams entry. */
+		list_remove(&sphead, sp);	/* splist_lock still held */
+		rw_exit(&splist_lock);
+
+		sockparams_destroy(sp);
+		return (0);
+	} else {
+		rw_exit(&splist_lock);
+		return (ENXIO);
+	}
+}
+
+/*
+ * soconfig(int family, int type, int protocol,
+ *     char *devpath, int devpathlen, char *module)
+ *
+ * Add or delete an entry to the sockparams table.
+ * When devpath and module both are NULL, it will delete an entry.
+ *
+ * Arguments:
+ *   family, type, protocol: the tuple in question
+ *   devpath: STREAMS device path. Can be NULL for module based sockets.
+ *   module : Name of the socket module. Can be NULL for STREAMS
+ *            based sockets.
+ *   devpathlen: length of the devpath string, or 0 if devpath
+ *            was statically allocated.
+ *
+ * Returns:
+ *   0 on success, otherwise the error from sockparams_delete(),
+ *   sockparams_create() or sockparams_add().
+ *
+ * Note:
+ *   This routine assumes that the caller has kmem_alloced
+ *   devpath (if devpathlen > 0) and module for this routine to
+ *   consume.
+ */
+int
+soconfig(int family, int type, int protocol,
+    char *devpath, int devpathlen, char *module)
+{
+	struct sockparams *sp;
+	int error = 0;
+
+	/* Both devpath and module may legitimately be NULL here. */
+	dprint(0, ("soconfig(%d,%d,%d,%s,%d,%s)\n",
+	    family, type, protocol,
+	    devpath == NULL ? "NULL" : devpath, devpathlen,
+	    module == NULL ? "NULL" : module));
+
+	if (sockfs_defer_nl7c_init) {
+		nl7c_init();
+		sockfs_defer_nl7c_init = 0;
+	}
+
+	if (devpath == NULL && module == NULL) {
+		/*
+		 * Delete existing entry,
+		 * both socket module and STREAMS device.
+		 */
+		error = sockparams_delete(family, type, protocol);
+	} else {
+		/*
+		 * Adding an entry
+		 * sockparams_create frees mod name and devpath upon failure.
+		 */
+		sp = sockparams_create(family, type, protocol, module,
+		    devpath, devpathlen, 0, KM_SLEEP, &error);
+
+		if (sp != NULL) {
+			error = sockparams_add(sp);
+			/* the entry owns module/devpath; destroy frees them */
+			if (error != 0)
+				sockparams_destroy(sp);
+		}
+	}
+
+	return (error);
+}
+
+/*
+ * solookup(int family, int type, int protocol, struct sockparams **spp)
+ *
+ * Lookup an entry in the sockparams list based on the triple. The returned
+ * entry either exactly match the given tuple, or it is the 'default' entry
+ * for the given <family, type>. A default entry is one with a protocol
+ * value of zero.
+ *
+ * Arguments:
+ *   family, type, protocol: tuple to search for
+ *   spp: Value-return argument
+ *
+ * Returns:
+ *   If an entry is found, 0 is returned and *spp is set to point to the
+ *   entry. In case an entry is not found, *spp is set to NULL, and an
+ *   error code is returned. The errors are (in decreasing precedence):
+ *	EAFNOSUPPORT - address family not in list
+ *	EPROTONOSUPPORT - address family supported but not protocol.
+ * EPROTOTYPE - address family and protocol supported but not socket type. + * + * TODO: should use ddi_modopen()/ddi_modclose() + */ + +int +solookup(int family, int type, int protocol, struct sockparams **spp) +{ + struct sockparams *sp = NULL; + int error = 0; + + *spp = NULL; + rw_enter(&splist_lock, RW_READER); + + /* + * Search the sockparams list for an appropiate entry. + * Hopefully we find an entry that match the exact family, + * type and protocol specified by the user, in which case + * we return that entry. However, we also keep track of + * the default entry for a specific family and type, the + * entry of which would have a protocol value of 0. + */ + sp = sockparams_find(&sphead, family, type, protocol, SP_MATCH_WILDCARD, + NULL); + + if (sp == NULL) { + int found = 0; + + /* Determine correct error code */ + for (sp = list_head(&sphead); sp != NULL; + sp = list_next(&sphead, sp)) { + if (sp->sp_family == family && found < 1) + found = 1; + if (sp->sp_family == family && + sp->sp_protocol == protocol && found < 2) + found = 2; + } + rw_exit(&splist_lock); + + switch (found) { + case 0: + error = EAFNOSUPPORT; + break; + case 1: + error = EPROTONOSUPPORT; + break; + case 2: + error = EPROTOTYPE; + break; + } + return (error); + } + + /* + * An entry was found. + * + * We put a hold on the entry early on, so if the + * sockmod is not loaded, and we have to exit + * splist_lock to call modload(), we know that the + * sockparams entry wont go away. That way we don't + * have to look up the entry once we come back from + * modload(). + */ + SOCKPARAMS_INC_REF(sp); + rw_exit(&splist_lock); + + if (sp->sp_smod_info == NULL) { + sp->sp_smod_info = smod_lookup_byname(sp->sp_smod_name); + if (sp->sp_smod_info == NULL) { + /* + * We put a hold on the sockparams entry + * earlier, hoping everything would work out. + * That obviously did not happen, so release + * the hold here. 
+ */ + SOCKPARAMS_DEC_REF(sp); + /* + * We should probably mark the sockparams as + * "bad", and redo the lookup skipping the + * "bad" entries. I.e., sp->sp_mod_state |= BAD, + * return (solookup(...)) + */ + return (ENXIO); + } + } + + /* + * Alright, we have a valid sockparams entry. + */ + *spp = sp; + return (0); +} diff --git a/usr/src/uts/common/fs/sockfs/socksctp.c b/usr/src/uts/common/fs/sockfs/socksctp.c deleted file mode 100644 index a5763b0b5f..0000000000 --- a/usr/src/uts/common/fs/sockfs/socksctp.c +++ /dev/null @@ -1,2773 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#include <sys/types.h> -#include <sys/t_lock.h> -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/buf.h> -#include <sys/vfs.h> -#include <sys/vfs_opreg.h> -#include <sys/vnode.h> -#include <sys/debug.h> -#include <sys/errno.h> -#include <sys/stropts.h> -#include <sys/cmn_err.h> -#include <sys/sysmacros.h> - -#include <sys/project.h> -#include <sys/tihdr.h> -#include <sys/strsubr.h> - -#include <sys/socket.h> -#include <sys/socketvar.h> -#include <sys/strsun.h> - -#include <netinet/sctp.h> -#include <inet/sctp_itf.h> -#include "socksctp.h" - -/* - * SCTP sockfs sonode operations, 1-1 socket - */ -static int sosctp_accept(struct sonode *, int, struct sonode **); -static int sosctp_listen(struct sonode *, int); -static int sosctp_connect(struct sonode *, const struct sockaddr *, socklen_t, - int, int); -static int sosctp_sendmsg(struct sonode *, struct nmsghdr *, struct uio *); -static int sosctp_getpeername(struct sonode *); -static int sosctp_getsockname(struct sonode *); -static int sosctp_shutdown(struct sonode *, int); -static int sosctp_getsockopt(struct sonode *, int, int, void *, socklen_t *, - int); -static int sosctp_setsockopt(struct sonode *, int, int, const void *, - socklen_t); - -/* - * SCTP sockfs sonode operations, 1-N socket - */ -static int sosctp_seq_connect(struct sonode *, const struct sockaddr *, - socklen_t, int, int); -static int sosctp_seq_sendmsg(struct sonode *, struct nmsghdr *, struct uio *); - -/* - * Socket upcalls, 1-1 socket connection - */ -static void *sctp_sock_newconn(void *parenthandle, void *connind); -static void sctp_sock_connected(void *handle); -static int sctp_sock_disconnected(void *handle, int error); -static void sctp_sock_disconnecting(void *handle); -static int sctp_sock_recv(void *handle, mblk_t *mp, int flags); -static void sctp_sock_xmitted(void *handle, int txqueued); -static void sctp_sock_properties(void *handle, int wroff, size_t maxblk); - -/* - * Socket association upcalls, 1-N socket 
connection - */ -static void *sctp_assoc_newconn(void *parenthandle, void *connind); -static void sctp_assoc_connected(void *handle); -static int sctp_assoc_disconnected(void *handle, int error); -static void sctp_assoc_disconnecting(void *handle); -static int sctp_assoc_recv(void *handle, mblk_t *mp, int flags); -static void sctp_assoc_xmitted(void *handle, int txqueued); -static void sctp_assoc_properties(void *handle, int wroff, size_t maxblk); - -static kmem_cache_t *sosctp_sockcache; -kmem_cache_t *sosctp_assoccache; - -sonodeops_t sosctp_sonodeops = { - sosctp_accept, /* sop_accept */ - sosctp_bind, /* sop_bind */ - sosctp_listen, /* sop_listen */ - sosctp_connect, /* sop_connect */ - sosctp_recvmsg, /* sop_recvmsg */ - sosctp_sendmsg, /* sop_sendmsg */ - sosctp_getpeername, /* sop_getpeername */ - sosctp_getsockname, /* sop_getsockname */ - sosctp_shutdown, /* sop_shutdown */ - sosctp_getsockopt, /* sop_getsockopt */ - sosctp_setsockopt /* sop_setsockopt */ -}; - -sonodeops_t sosctp_seq_sonodeops = { - sosctp_accept, /* sop_accept */ - sosctp_bind, /* sop_bind */ - sosctp_listen, /* sop_listen */ - sosctp_seq_connect, /* sop_connect */ - sosctp_recvmsg, /* sop_recvmsg */ - sosctp_seq_sendmsg, /* sop_sendmsg */ - sosctp_getpeername, /* sop_getpeername */ - sosctp_getsockname, /* sop_getsockname */ - sosctp_shutdown, /* sop_shutdown */ - sosctp_getsockopt, /* sop_getsockopt */ - sosctp_setsockopt /* sop_setsockopt */ -}; - -sctp_upcalls_t sosctp_sock_upcalls = { - sctp_sock_newconn, - sctp_sock_connected, - sctp_sock_disconnected, - sctp_sock_disconnecting, - sctp_sock_recv, - sctp_sock_xmitted, - sctp_sock_properties -}; - -sctp_upcalls_t sosctp_assoc_upcalls = { - sctp_assoc_newconn, - sctp_assoc_connected, - sctp_assoc_disconnected, - sctp_assoc_disconnecting, - sctp_assoc_recv, - sctp_assoc_xmitted, - sctp_assoc_properties -}; - -/*ARGSUSED*/ -static int -sosctp_sock_constructor(void *buf, void *cdrarg, int kmflags) -{ - struct sctp_sonode *ss = buf; - 
struct sonode *so = &ss->ss_so; - struct vnode *vp; - - ss->ss_type = SOSCTP_SOCKET; - so->so_oobmsg = NULL; - so->so_ack_mp = NULL; - so->so_conn_ind_head = NULL; - so->so_conn_ind_tail = NULL; - so->so_discon_ind_mp = NULL; - so->so_ux_bound_vp = NULL; - so->so_unbind_mp = NULL; - so->so_ops = NULL; - so->so_accessvp = NULL; - so->so_priv = NULL; - - so->so_nl7c_flags = 0; - so->so_nl7c_uri = NULL; - so->so_nl7c_rcv_mp = NULL; - - so->so_direct = NULL; - - vp = vn_alloc(kmflags); - if (vp == NULL) { - return (-1); - } - so->so_vnode = vp; - - vn_setops(vp, socksctp_vnodeops); - vp->v_data = (caddr_t)so; - - ss->ss_rxdata = NULL; - ss->ss_rxtail = &ss->ss_rxdata; - - mutex_init(&so->so_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&so->so_plumb_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&so->so_state_cv, NULL, CV_DEFAULT, NULL); - cv_init(&so->so_ack_cv, NULL, CV_DEFAULT, NULL); - cv_init(&so->so_connind_cv, NULL, CV_DEFAULT, NULL); - cv_init(&so->so_want_cv, NULL, CV_DEFAULT, NULL); - - cv_init(&ss->ss_txdata_cv, NULL, CV_DEFAULT, NULL); - cv_init(&ss->ss_rxdata_cv, NULL, CV_DEFAULT, NULL); - - return (0); -} - -/*ARGSUSED*/ -static void -sosctp_sock_destructor(void *buf, void *cdrarg) -{ - struct sctp_sonode *ss = buf; - struct sonode *so = &ss->ss_so; - struct vnode *vp = SOTOV(so); - - ASSERT(so->so_direct == NULL); - - ASSERT(so->so_nl7c_flags == 0); - ASSERT(so->so_nl7c_uri == NULL); - ASSERT(so->so_nl7c_rcv_mp == NULL); - - ASSERT(so->so_oobmsg == NULL); - ASSERT(so->so_ack_mp == NULL); - ASSERT(so->so_conn_ind_head == NULL); - ASSERT(so->so_conn_ind_tail == NULL); - ASSERT(so->so_discon_ind_mp == NULL); - ASSERT(so->so_ux_bound_vp == NULL); - ASSERT(so->so_unbind_mp == NULL); - ASSERT(so->so_ops == NULL || - so->so_ops == &sosctp_sonodeops || - so->so_ops == &sosctp_seq_sonodeops); - - ASSERT(ss->ss_rxdata == NULL); - - ASSERT(vn_matchops(vp, socksctp_vnodeops)); - ASSERT(vp->v_data == (caddr_t)so); - - vn_free(vp); - - mutex_destroy(&so->so_lock); - 
mutex_destroy(&so->so_plumb_lock); - cv_destroy(&so->so_state_cv); - cv_destroy(&so->so_ack_cv); - cv_destroy(&so->so_connind_cv); - cv_destroy(&so->so_want_cv); - cv_destroy(&ss->ss_txdata_cv); - cv_destroy(&ss->ss_rxdata_cv); -} - -int -sosctp_init(void) -{ - int error; - - error = vn_make_ops("socksctp", socksctp_vnodeops_template, - &socksctp_vnodeops); - if (error != 0) { - zcmn_err(GLOBAL_ZONEID, CE_WARN, - "sosctp_init: bad vnode ops template"); - return (error); - } - - sosctp_sockcache = kmem_cache_create("sctpsock", - sizeof (struct sctp_sonode), 0, sosctp_sock_constructor, - sosctp_sock_destructor, NULL, NULL, NULL, 0); - sosctp_assoccache = kmem_cache_create("sctp_assoc", - sizeof (struct sctp_soassoc), 0, NULL, NULL, NULL, NULL, NULL, 0); - return (0); -} - -static struct vnode * -sosctp_makevp(struct vnode *accessvp, int domain, int type, int protocol, - int kmflags) -{ - struct sctp_sonode *ss; - struct sonode *so; - struct vnode *vp; - time_t now; - - ss = kmem_cache_alloc(sosctp_sockcache, kmflags); - if (ss == NULL) { - return (NULL); - } - so = &ss->ss_so; - so->so_cache = sosctp_sockcache; - so->so_obj = ss; - vp = SOTOV(so); - now = gethrestime_sec(); - - so->so_flag = 0; - so->so_accessvp = accessvp; - so->so_dev = accessvp->v_rdev; - - so->so_state = 0; - so->so_mode = 0; - - so->so_fsid = sockdev; - so->so_atime = now; - so->so_mtime = now; - so->so_ctime = now; - so->so_count = 0; - - so->so_family = domain; - so->so_type = type; - so->so_protocol = protocol; - so->so_pushcnt = 0; - - so->so_options = 0; - so->so_linger.l_onoff = 0; - so->so_linger.l_linger = 0; - so->so_sndbuf = 0; - so->so_rcvbuf = 0; - so->so_error = 0; - so->so_delayed_error = 0; - - ASSERT(so->so_oobmsg == NULL); - so->so_oobcnt = 0; - so->so_oobsigcnt = 0; - so->so_pgrp = 0; - so->so_provinfo = NULL; - - so->so_laddr_sa = (struct sockaddr *)&ss->ss_laddr; - so->so_faddr_sa = (struct sockaddr *)&ss->ss_faddr; - so->so_laddr_maxlen = so->so_faddr_maxlen = sizeof 
(ss->ss_laddr); - so->so_laddr_len = so->so_faddr_len = 0; - so->so_eaddr_mp = NULL; - so->so_delayed_error = 0; - - so->so_peercred = NULL; - - ASSERT(so->so_ack_mp == NULL); - ASSERT(so->so_conn_ind_head == NULL); - ASSERT(so->so_conn_ind_tail == NULL); - ASSERT(so->so_ux_bound_vp == NULL); - ASSERT(so->so_unbind_mp == NULL); - - vn_reinit(vp); - vp->v_vfsp = rootvfs; - vp->v_type = VSOCK; - vp->v_rdev = so->so_dev; - - ss->ss_maxassoc = 0; - ss->ss_assoccnt = 0; - ss->ss_assocs = NULL; - - if (type == SOCK_STREAM) { - so->so_ops = &sosctp_sonodeops; - } else { - ASSERT(type == SOCK_SEQPACKET); - so->so_ops = &sosctp_seq_sonodeops; - mutex_enter(&so->so_lock); - (void) sosctp_aid_grow(ss, 1, kmflags); - mutex_exit(&so->so_lock); - } - ss->ss_rxqueued = 0; - ss->ss_txqueued = 0; - ss->ss_wroff = 0; - ss->ss_wrsize = strmsgsz; - bzero(&ss->ss_poll_list, sizeof (ss->ss_poll_list)); - - vn_exists(vp); - return (vp); -} - -/* - * Creates a sctp socket data structure. - * tso is non-NULL if it's passive open. - */ -struct sonode * -sosctp_create(vnode_t *accessvp, int domain, int type, int protocol, - int version, struct sonode *tso, int *errorp) -{ - struct sonode *so; - vnode_t *vp; - int error; - int soflags; - cred_t *cr; - - if (version == SOV_STREAM) { - *errorp = EINVAL; - return (NULL); - } - ASSERT(accessvp != NULL); - - /* - * We only support two types of SCTP socket. Let sotpi_create() - * handle all other cases, such as raw socket. - */ - if (!(domain == AF_INET || domain == AF_INET6) || - !(type == SOCK_STREAM || type == SOCK_SEQPACKET)) { - return (sotpi_create(accessvp, domain, type, protocol, version, - NULL, errorp)); - } - - if (tso == NULL) { - vp = sosctp_makevp(accessvp, domain, type, protocol, KM_SLEEP); - ASSERT(vp != NULL); - - soflags = FREAD | FWRITE; - } else { - vp = sosctp_makevp(accessvp, domain, type, protocol, - KM_NOSLEEP); - if (vp == NULL) { - /* - * sosctp_makevp() only fails when there is no memory. 
- */ - *errorp = ENOMEM; - return (NULL); - } - soflags = FREAD | FWRITE | SO_ACCEPTOR; - } - /* - * This function may be called in interrupt context, and CRED() - * will be NULL. In this case, pass in kcred to VOP_OPEN(). - */ - if ((cr = CRED()) == NULL) - cr = kcred; - if ((error = VOP_OPEN(&vp, soflags, cr, NULL)) != 0) { - VN_RELE(vp); - *errorp = error; - return (NULL); - } - so = VTOSO(vp); - - dprint(2, ("sosctp_create: %p domain %d type %d\n", - (void *)so, domain, type)); - - if (version == SOV_DEFAULT) { - version = so_default_version; - } - so->so_version = (short)version; - - return (so); -} - -/* - * Free SCTP socket data structure. - * Closes incoming connections which were never accepted, frees - * resources. - */ -void -sosctp_free(struct sonode *so) -{ - struct sctp_sonode *ss = SOTOSSO(so); - struct sonode *nso; - mblk_t *mp; - - mutex_enter(&so->so_lock); - - /* - * Need to clear these out so that sockfree() doesn't think that - * there's memory in need of free'ing. - */ - so->so_laddr_sa = so->so_faddr_sa = NULL; - so->so_laddr_len = so->so_laddr_maxlen = 0; - so->so_faddr_len = so->so_faddr_maxlen = 0; - - while ((mp = ss->ss_rxdata) != NULL) { - ss->ss_rxdata = mp->b_next; - mp->b_next = NULL; - freemsg(mp); - mp = ss->ss_rxdata; - } - ss->ss_rxtail = &ss->ss_rxdata; - - - while ((mp = so->so_conn_ind_head) != NULL) { - so->so_conn_ind_head = mp->b_next; - mutex_exit(&so->so_lock); - mp->b_next = NULL; - nso = *(struct sonode **)mp->b_rptr; - - (void) VOP_CLOSE(SOTOV(nso), 0, 1, 0, CRED(), NULL); - vn_invalid(SOTOV(nso)); - VN_RELE(SOTOV(nso)); - - freeb(mp); - mutex_enter(&so->so_lock); - } - so->so_conn_ind_tail = NULL; - so->so_state &= ~SS_HASCONNIND; - - if (ss->ss_assocs != NULL) { - ASSERT(ss->ss_assoccnt == 0); - kmem_free(ss->ss_assocs, - ss->ss_maxassoc * sizeof (struct sctp_sa_id)); - } - mutex_exit(&so->so_lock); - - sockfree(so); -} - -/* - * Accept incoming connection. 
- */ -static int -sosctp_accept(struct sonode *lso, int fflag, struct sonode **nsop) -{ - int error = 0; - mblk_t *mp; - struct sonode *nso; - - if (!(lso->so_state & SS_ACCEPTCONN)) { - /* - * Not a listen socket. - */ - eprintsoline(lso, EINVAL); - return (EINVAL); - } - if (lso->so_type != SOCK_STREAM) { - /* - * Cannot accept() connections from SOCK_SEQPACKET type - * socket. - */ - eprintsoline(lso, EOPNOTSUPP); - return (EOPNOTSUPP); - } - - /* - * Returns right away if socket is nonblocking. - */ - error = sowaitconnind(lso, fflag, &mp); - if (error != 0) { - eprintsoline(lso, error); - return (error); - } - nso = *(struct sonode **)mp->b_rptr; - freeb(mp); - - mutex_enter(&lso->so_lock); - ASSERT(SOTOSSO(lso)->ss_rxqueued > 0); - --SOTOSSO(lso)->ss_rxqueued; - mutex_exit(&lso->so_lock); - - /* - * accept() needs remote address right away. - * since sosctp_getpeername() is called with - * socket lock released, the connection may - * get aborted before we return from the - * routine. So, we need to to handle aborted - * socket connection here. - */ - error = sosctp_getpeername(nso); - if (error != 0) { - vnode_t *nvp; - nvp = SOTOV(nso); - (void) VOP_CLOSE(nvp, 0, 1, 0, CRED(), NULL); - VN_RELE(nvp); - - /* - * We can't return ENOTCONN to accept. accept - * either returns connected socket in case no error - * has occured or the connection which is getting - * accepted is being aborted. This is the reason we - * return ECONNABORTED in case sosctp_getpeername() - * returns ENOTCONN. - */ - return ((error == ENOTCONN) ? ECONNABORTED : error); - } - - dprint(2, ("sosctp_accept: new %p\n", (void *)nso)); - - *nsop = nso; - return (0); -} - -/* - * Bind local endpoint. 
- */ -int -sosctp_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen, - int flags) -{ - int error = 0; - - if (!(flags & _SOBIND_LOCK_HELD)) { - mutex_enter(&so->so_lock); - so_lock_single(so); /* Set SOLOCKED */ - /* LINTED - statement has no conseq */ - } else { - ASSERT(MUTEX_HELD(&so->so_lock)); - ASSERT(so->so_flag & SOLOCKED); - } - - if ((so->so_state & SS_ISBOUND) || name == NULL || namelen == 0) { - /* - * Multiple binds not allowed for any SCTP socket. - * Also binding with null address is not supported. - */ - error = EINVAL; - eprintsoline(so, error); - goto done; - } - /* - * X/Open requires this check - */ - if (so->so_state & SS_CANTSENDMORE) { - error = EINVAL; - goto done; - } - - /* - * Protocol module does address family checks. - */ - mutex_exit(&so->so_lock); - - error = sctp_bind(so->so_priv, name, namelen); - - mutex_enter(&so->so_lock); - if (error == 0) { - so->so_state |= SS_ISBOUND; - /* LINTED - statement has no conseq */ - } else { - eprintsoline(so, error); - } -done: - if (!(flags & _SOBIND_LOCK_HELD)) { - so_unlock_single(so, SOLOCKED); - mutex_exit(&so->so_lock); - /* LINTED - statement has no conseq */ - } else { - /* If the caller held the lock don't release it here */ - ASSERT(MUTEX_HELD(&so->so_lock)); - ASSERT(so->so_flag & SOLOCKED); - } - return (error); -} - -/* - * Turn socket into a listen socket. - */ -static int -sosctp_listen(struct sonode *so, int backlog) -{ - int error = 0; - - mutex_enter(&so->so_lock); - so_lock_single(so); - - /* - * If this socket is trying to do connect, or if it has - * been connected, disallow. - */ - if (so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED | - SS_ISDISCONNECTING | SS_CANTRCVMORE | SS_CANTSENDMORE)) { - error = EINVAL; - eprintsoline(so, error); - goto done; - } - - if (backlog < 0) { - backlog = 0; - } - - /* - * If listen() is only called to change backlog, we don't - * need to notify protocol module. 
- */ - if (so->so_state & SS_ACCEPTCONN) { - so->so_backlog = backlog; - goto done; - } - - mutex_exit(&so->so_lock); - - error = sctp_listen(so->so_priv); - - mutex_enter(&so->so_lock); - if (error == 0) { - so->so_state |= (SS_ACCEPTCONN|SS_ISBOUND); - so->so_backlog = backlog; - /* LINTED - statement has no conseq */ - } else { - eprintsoline(so, error); - } -done: - so_unlock_single(so, SOLOCKED); - mutex_exit(&so->so_lock); - - return (error); -} - -/* - * Active open. - */ -static int -sosctp_connect(struct sonode *so, const struct sockaddr *name, - socklen_t namelen, int fflag, int flags) -{ - int error; - - ASSERT(so->so_type == SOCK_STREAM); - - mutex_enter(&so->so_lock); - so_lock_single(so); - - /* - * Can't connect() after listen(), or if the socket is already - * connected. - */ - if (so->so_state & (SS_ACCEPTCONN|SS_ISCONNECTED|SS_ISCONNECTING)) { - if (so->so_state & SS_ISCONNECTED) { - error = EISCONN; - } else if (so->so_state & SS_ISCONNECTING) { - error = EALREADY; - } else { - error = EOPNOTSUPP; - } - eprintsoline(so, error); - goto done; - } - - /* - * Check for failure of an earlier call - */ - if (so->so_error != 0) { - error = sogeterr(so); - eprintsoline(so, error); - goto done; - } - - /* - * Connection is closing, or closed, don't allow reconnect. - * TCP allows this to proceed, but the socket remains unwriteable. - * BSD returns EINVAL. 
- */ - if (so->so_state & (SS_ISDISCONNECTING|SS_CANTRCVMORE| - SS_CANTSENDMORE)) { - error = EINVAL; - eprintsoline(so, error); - goto done; - } - if (name == NULL || namelen == 0) { - error = EINVAL; - eprintsoline(so, error); - goto done; - } - soisconnecting(so); - - mutex_exit(&so->so_lock); - - error = sctp_connect(so->so_priv, name, namelen); - - mutex_enter(&so->so_lock); - if (error == 0) { - /* - * Allow other threads to access the socket - */ - error = sosctp_waitconnected(so, fflag); - } - switch (error) { - case 0: - case EINPROGRESS: - case EALREADY: - case EINTR: - /* Non-fatal errors */ - so->so_state |= SS_ISBOUND; - break; - case EHOSTUNREACH: - if (flags & _SOCONNECT_XPG4_2) { - /* - * X/Open specification contains a requirement that - * ENETUNREACH be returned but does not require - * EHOSTUNREACH. In order to keep the test suite - * happy we mess with the errno here. - */ - error = ENETUNREACH; - } - /* FALLTHRU */ - - default: - /* clear SS_ISCONNECTING in case it was set */ - so->so_state &= ~SS_ISCONNECTING; - break; - } -done: - so_unlock_single(so, SOLOCKED); - mutex_exit(&so->so_lock); - return (error); -} - -/* - * Active open for 1-N sockets, create a new association and - * call connect on that. - * If there parent hasn't been bound yet (this is the first association), - * make it so. 
- */ -static int -sosctp_seq_connect(struct sonode *so, const struct sockaddr *name, - socklen_t namelen, int fflag, int flags) -{ - struct sctp_soassoc *ssa; - struct sctp_sonode *ss; - int error; - - ASSERT(so->so_type == SOCK_SEQPACKET); - - mutex_enter(&so->so_lock); - so_lock_single(so); - - if (name == NULL || namelen == 0) { - error = EINVAL; - eprintsoline(so, error); - goto done; - } - - ss = SOTOSSO(so); - - error = sosctp_assoc_createconn(ss, name, namelen, NULL, 0, fflag, - &ssa); - if (error != 0) { - if ((error == EHOSTUNREACH) && (flags & _SOCONNECT_XPG4_2)) { - error = ENETUNREACH; - } - } - if (ssa != NULL) { - SSA_REFRELE(ss, ssa); - } - -done: - so_unlock_single(so, SOLOCKED); - mutex_exit(&so->so_lock); - return (error); -} - -/* - * Receive data. - */ -int -sosctp_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) -{ - struct sctp_sonode *ss = SOTOSSO(so); - struct sctp_soassoc *ssa = NULL; - int flags, error = 0; - struct T_unitdata_ind *tind; - int len, count, readcnt = 0, rxqueued; - boolean_t consumed = B_FALSE; - void *opt; - mblk_t *mp, *mdata; - - flags = msg->msg_flags; - msg->msg_flags = 0; - - if (so->so_type == SOCK_STREAM) { - if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING| - SS_CANTRCVMORE))) { - return (ENOTCONN); - } - } else { - /* For 1-N socket, recv() cannot be used. */ - if (msg->msg_namelen == 0) - return (EOPNOTSUPP); - /* - * If there are no associations, and no new connections are - * coming in, there's not going to be new messages coming - * in either. - */ - if (ss->ss_rxdata == NULL && ss->ss_assoccnt == 0 && - !(so->so_state & SS_ACCEPTCONN)) { - return (ENOTCONN); - } - } - - /* - * out-of-band data not supported. 
- */ - if (flags & MSG_OOB) { - return (EOPNOTSUPP); - } - - /* - * flag possibilities: - * - * MSG_PEEK Don't consume data - * MSG_WAITALL Wait for full quantity of data (ignored if MSG_PEEK) - * MSG_DONTWAIT Non-blocking (same as FNDELAY | FNONBLOCK) - * - * MSG_WAITALL can return less than the full buffer if either - * - * 1. we would block and we are non-blocking - * 2. a full message cannot be delivered - * - * Given that we always get a full message from proto below, - * MSG_WAITALL is not meaningful. - */ - - mutex_enter(&so->so_lock); - - /* - * Allow just one reader at a time. - */ - error = so_lock_read_intr(so, - uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0)); - if (error) { - mutex_exit(&so->so_lock); - return (error); - } -again: - mp = ss->ss_rxdata; - if (mp != NULL) { - if (so->so_type == SOCK_SEQPACKET) { - ssa = *(struct sctp_soassoc **)DB_BASE(mp); - } - mutex_exit(&so->so_lock); - - tind = (struct T_unitdata_ind *)mp->b_rptr; - - len = tind->SRC_length; - - if (msg->msg_namelen > 0 && len > 0) { - - opt = sogetoff(mp, tind->SRC_offset, len, 1); - - ASSERT(opt != NULL); - - msg->msg_name = kmem_alloc(len, KM_SLEEP); - msg->msg_namelen = len; - - bcopy(opt, msg->msg_name, len); - } else { - msg->msg_namelen = 0; - } - - len = tind->OPT_length; - if (msg->msg_controllen == 0) { - if (len > 0) { - msg->msg_flags |= MSG_CTRUNC; - } - } else if (len > 0) { - opt = sogetoff(mp, tind->OPT_offset, len, - __TPI_ALIGN_SIZE); - - ASSERT(opt != NULL); - sosctp_pack_cmsg(opt, msg, len); - } else { - msg->msg_controllen = 0; - } - - if (mp->b_flag & SCTP_NOTIFICATION) { - msg->msg_flags |= MSG_NOTIFICATION; - } - - mdata = mp->b_cont; - while (mdata != NULL) { - len = MBLKL(mdata); - count = MIN(uiop->uio_resid, len); - - error = uiomove(mdata->b_rptr, count, UIO_READ, uiop); - /* - * We will re-read this message the next time. 
- */ - if (error != 0) { - if (msg->msg_namelen > 0) { - kmem_free(msg->msg_name, - msg->msg_namelen); - } - if (msg->msg_controllen > 0) { - kmem_free(msg->msg_control, - msg->msg_controllen); - } - mutex_enter(&so->so_lock); - so_unlock_read(so); - mutex_exit(&so->so_lock); - return (error); - } - if (!(flags & MSG_PEEK)) - readcnt += count; - if (uiop->uio_resid == 0) { - mblk_t *mp1 = ss->ss_rxdata; - mblk_t *mp2 = mp1->b_cont; -#ifdef DEBUG - int rcnt = readcnt; -#endif - - /* Finished with this message? */ - if (count == len && mdata->b_cont == NULL) - break; - /* - * Remove the bits that have been read, the - * next read will start from where we left - * off. - */ - while (mp1->b_cont != mdata) { -#ifdef DEBUG - ASSERT(rcnt > MBLKL(mp1->b_cont)); - rcnt -= MBLKL(mp1->b_cont); -#endif - mp1 = mp1->b_cont; - } -#ifdef DEBUG - ASSERT(rcnt == count); -#endif - if (len > count) - mp1->b_cont->b_rptr += count; - else - mp1 = mp1->b_cont; - mutex_enter(&so->so_lock); - if (mp2 != mp1->b_cont) { - ss->ss_rxdata->b_cont = mp1->b_cont; - mp1->b_cont = NULL; - freemsg(mp2); - } - goto done; - } - mdata = mdata->b_cont; - } - if (!(mp->b_flag & SCTP_PARTIAL_DATA)) - msg->msg_flags |= MSG_EOR; - /* - * Consume this message - */ -consume: - mutex_enter(&so->so_lock); - if (!(flags & MSG_PEEK)) { - ss->ss_rxdata = mp->b_next; - if (ss->ss_rxtail == &mp->b_next) { - ss->ss_rxtail = &ss->ss_rxdata; - } - mp->b_next = NULL; - freemsg(mp); - consumed = B_TRUE; - } - } else { - /* - * No pending data. Return right away for nonblocking - * socket, otherwise sleep waiting for data. 
- */ - if (!(so->so_state & SS_CANTRCVMORE)) { - if ((uiop->uio_fmode & (FNDELAY|FNONBLOCK)) || - (flags & MSG_DONTWAIT)) { - error = EWOULDBLOCK; - } else { - if (!cv_wait_sig(&ss->ss_rxdata_cv, - &so->so_lock)) { - error = EINTR; - } else { - goto again; - } - } - } else { - msg->msg_controllen = 0; - msg->msg_namelen = 0; - } - } -done: - /* - * Determine if we need to update SCTP about the buffer - * space. For performance reason, we cannot update SCTP - * every time a message is read. The socket buffer low - * watermark is used as the threshold. - */ - if (ssa == NULL) { - rxqueued = ss->ss_rxqueued; - - ss->ss_rxqueued = rxqueued - readcnt; - count = so->so_rcvbuf - ss->ss_rxqueued; - - ASSERT(ss->ss_rxdata != NULL || ss->ss_rxqueued == 0); - - so_unlock_read(so); - mutex_exit(&so->so_lock); - - if (readcnt > 0 && (((count > 0) && - (rxqueued >= so->so_rcvlowat)) || - (ss->ss_rxqueued == 0))) { - /* - * If amount of queued data is higher than watermark, - * updata SCTP's idea of available buffer space. - */ - sctp_recvd(so->so_priv, count); - } - } else { - rxqueued = ssa->ssa_rxqueued; - - ssa->ssa_rxqueued = rxqueued - readcnt; - count = so->so_rcvbuf - ssa->ssa_rxqueued; - - so_unlock_read(so); - - if (readcnt > 0 && - (((count > 0) && (rxqueued >= so->so_rcvlowat)) || - (ssa->ssa_rxqueued == 0))) { - /* - * If amount of queued data is higher than watermark, - * updata SCTP's idea of available buffer space. - */ - mutex_exit(&so->so_lock); - - sctp_recvd(ssa->ssa_conn, count); - - mutex_enter(&so->so_lock); - } - if (consumed) { - SSA_REFRELE(ss, ssa); - } - mutex_exit(&so->so_lock); - } - - return (error); -} - -int -sosctp_uiomove(mblk_t *hdr_mp, ssize_t count, ssize_t blk_size, int wroff, - struct uio *uiop, int flags, cred_t *cr) -{ - ssize_t size; - int error; - mblk_t *mp; - dblk_t *dp; - - /* - * Loop until we have all data copied into mblk's. 
- */ - while (count > 0) { - size = MIN(count, blk_size); - - /* - * As a message can be splitted up and sent in different - * packets, each mblk will have the extra space before - * data to accommodate what SCTP wants to put in there. - */ - while ((mp = allocb_cred(size + wroff, cr)) == NULL) { - if ((uiop->uio_fmode & (FNDELAY|FNONBLOCK)) || - (flags & MSG_DONTWAIT)) { - return (EAGAIN); - } - if ((error = strwaitbuf(size + wroff, BPRI_MED))) { - return (error); - } - } - - dp = mp->b_datap; - dp->db_cpid = curproc->p_pid; - ASSERT(wroff <= dp->db_lim - mp->b_wptr); - mp->b_rptr += wroff; - error = uiomove(mp->b_rptr, size, UIO_WRITE, uiop); - if (error != 0) { - freeb(mp); - return (error); - } - mp->b_wptr = mp->b_rptr + size; - count -= size; - hdr_mp->b_cont = mp; - hdr_mp = mp; - } - return (0); -} - -/* - * Send message. - */ -static int -sosctp_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) -{ - struct sctp_sonode *ss = SOTOSSO(so); - mblk_t *mctl; - struct cmsghdr *cmsg; - struct sctp_sndrcvinfo *sinfo; - int optlen, flags, fflag; - ssize_t count, msglen; - int error; - - ASSERT(so->so_type == SOCK_STREAM); - - flags = msg->msg_flags; - if (flags & MSG_OOB) { - /* - * No out-of-band data support. - */ - return (EOPNOTSUPP); - } - - if (msg->msg_controllen != 0) { - optlen = msg->msg_controllen; - cmsg = sosctp_find_cmsg(msg->msg_control, optlen, SCTP_SNDRCV); - if (cmsg != NULL) { - if (cmsg->cmsg_len < - (sizeof (*sinfo) + sizeof (*cmsg))) { - eprintsoline(so, EINVAL); - return (EINVAL); - } - sinfo = (struct sctp_sndrcvinfo *)(cmsg + 1); - - /* Both flags should not be set together. */ - if ((sinfo->sinfo_flags & MSG_EOF) && - (sinfo->sinfo_flags & MSG_ABORT)) { - eprintsoline(so, EINVAL); - return (EINVAL); - } - - /* Initiate a graceful shutdown. */ - if (sinfo->sinfo_flags & MSG_EOF) { - /* Can't include data in MSG_EOF message. 
*/ - if (uiop->uio_resid != 0) { - eprintsoline(so, EINVAL); - return (EINVAL); - } - - /* - * This is the same sequence as done in - * shutdown(SHUT_WR). - */ - mutex_enter(&so->so_lock); - so_lock_single(so); - socantsendmore(so); - cv_broadcast(&ss->ss_txdata_cv); - so->so_state |= SS_ISDISCONNECTING; - mutex_exit(&so->so_lock); - - pollwakeup(&ss->ss_poll_list, POLLOUT); - sctp_recvd(so->so_priv, so->so_rcvbuf); - error = sctp_disconnect(so->so_priv); - - mutex_enter(&so->so_lock); - so_unlock_single(so, SOLOCKED); - mutex_exit(&so->so_lock); - return (error); - } - } - } else { - optlen = 0; - } - - mutex_enter(&so->so_lock); - for (;;) { - if (so->so_state & SS_CANTSENDMORE) { - mutex_exit(&so->so_lock); - tsignal(curthread, SIGPIPE); - return (EPIPE); - } - - if (so->so_error != 0) { - error = sogeterr(so); - mutex_exit(&so->so_lock); - return (error); - } - - if (ss->ss_txqueued < so->so_sndbuf) - break; - - /* - * Xmit window full in a blocking socket. - */ - if ((uiop->uio_fmode & (FNDELAY|FNONBLOCK)) || - (flags & MSG_DONTWAIT)) { - mutex_exit(&so->so_lock); - return (EAGAIN); - } else { - /* - * Wait for space to become available and try again. - */ - error = cv_wait_sig(&ss->ss_txdata_cv, &so->so_lock); - if (!error) { /* signal */ - mutex_exit(&so->so_lock); - return (EINTR); - } - } - } - msglen = count = uiop->uio_resid; - - /* Don't allow sending a message larger than the send buffer size. */ - if (msglen > so->so_sndbuf) { - mutex_exit(&so->so_lock); - return (EMSGSIZE); - } - - /* - * Update TX buffer usage here so that we can lift the socket lock. - */ - ss->ss_txqueued += msglen; - - /* - * Allow piggybacking data on handshake messages (SS_ISCONNECTING). - */ - if (!(so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED))) { - /* - * We need to check here for listener so that the - * same error will be returned as with a TCP socket. - * In this case, sosctp_connect() returns EOPNOTSUPP - * while a TCP socket returns ENOTCONN instead. 
Catch it - * here to have the same behavior as a TCP socket. - * - * We also need to make sure that the peer address is - * provided before we attempt to do the connect. - */ - if ((so->so_state & SS_ACCEPTCONN) || - msg->msg_name == NULL) { - mutex_exit(&so->so_lock); - error = ENOTCONN; - goto error_nofree; - } - mutex_exit(&so->so_lock); - fflag = uiop->uio_fmode; - if (flags & MSG_DONTWAIT) { - fflag |= FNDELAY; - } - error = sosctp_connect(so, msg->msg_name, msg->msg_namelen, - fflag, (so->so_version == SOV_XPG4_2) * _SOCONNECT_XPG4_2); - if (error) { - /* - * Check for non-fatal errors, socket connected - * while the lock had been lifted. - */ - if (error != EISCONN && error != EALREADY) { - goto error_nofree; - } - error = 0; - } - } else { - mutex_exit(&so->so_lock); - } - - mctl = sctp_alloc_hdr(msg->msg_name, msg->msg_namelen, - msg->msg_control, optlen, SCTP_CAN_BLOCK); - if (mctl == NULL) { - error = EINTR; - goto error_nofree; - } - - /* Copy in the message. */ - if ((error = sosctp_uiomove(mctl, count, ss->ss_wrsize, ss->ss_wroff, - uiop, flags, CRED())) != 0) { - goto error_ret; - } - error = sctp_sendmsg(so->so_priv, mctl, 0); - if (error == 0) - return (0); - -error_ret: - freemsg(mctl); -error_nofree: - mutex_enter(&so->so_lock); - ss->ss_txqueued -= msglen; - cv_broadcast(&ss->ss_txdata_cv); - if ((error == EPIPE) && (so->so_state & SS_CANTSENDMORE)) { - /* - * We received shutdown between the time lock was - * lifted and call to sctp_sendmsg(). - */ - mutex_exit(&so->so_lock); - tsignal(curthread, SIGPIPE); - return (EPIPE); - } - mutex_exit(&so->so_lock); - return (error); -} - -/* - * Send message on 1-N socket. Connects automatically if there is - * no association. 
- */ -static int -sosctp_seq_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) -{ - struct sctp_sonode *ss; - struct sctp_soassoc *ssa; - struct cmsghdr *cmsg; - struct sctp_sndrcvinfo *sinfo; - int aid = 0; - mblk_t *mctl; - int namelen, optlen, flags; - ssize_t count, msglen; - int error; - uint16_t s_flags = 0; - - ASSERT(so->so_type == SOCK_SEQPACKET); - - /* - * There shouldn't be problems with alignment, as the memory for - * msg_control was alloced with kmem_alloc. - */ - cmsg = sosctp_find_cmsg(msg->msg_control, msg->msg_controllen, - SCTP_SNDRCV); - if (cmsg != NULL) { - if (cmsg->cmsg_len < (sizeof (*sinfo) + sizeof (*cmsg))) { - eprintsoline(so, EINVAL); - return (EINVAL); - } - sinfo = (struct sctp_sndrcvinfo *)(cmsg + 1); - s_flags = sinfo->sinfo_flags; - aid = sinfo->sinfo_assoc_id; - } - - ss = SOTOSSO(so); - namelen = msg->msg_namelen; - - if (msg->msg_controllen > 0) { - optlen = msg->msg_controllen; - } else { - optlen = 0; - } - - mutex_enter(&so->so_lock); - - /* - * If there is no association id, connect to address specified - * in msg_name. Otherwise look up the association using the id. - */ - if (aid == 0) { - /* - * Connect and shutdown cannot be done together, so check for - * MSG_EOF. - */ - if (msg->msg_name == NULL || namelen == 0 || - (s_flags & MSG_EOF)) { - error = EINVAL; - eprintsoline(so, error); - goto done; - } - flags = uiop->uio_fmode; - if (msg->msg_flags & MSG_DONTWAIT) { - flags |= FNDELAY; - } - so_lock_single(so); - error = sosctp_assoc_createconn(ss, msg->msg_name, namelen, - msg->msg_control, optlen, flags, &ssa); - if (error) { - if ((so->so_version == SOV_XPG4_2) && - (error == EHOSTUNREACH)) { - error = ENETUNREACH; - } - if (ssa == NULL) { - /* - * Fatal error during connect(). Bail out. - * If ssa exists, it means that the handshake - * is in progress. - */ - eprintsoline(so, error); - so_unlock_single(so, SOLOCKED); - goto done; - } - /* - * All the errors are non-fatal ones, don't return - * e.g. 
EINPROGRESS from sendmsg(). - */ - error = 0; - } - so_unlock_single(so, SOLOCKED); - } else { - if ((error = sosctp_assoc(ss, aid, &ssa)) != 0) { - eprintsoline(so, error); - goto done; - } - } - - /* - * Now we have an association. - */ - flags = msg->msg_flags; - - /* - * MSG_EOF initiates graceful shutdown. - */ - if (s_flags & MSG_EOF) { - if (uiop->uio_resid) { - /* - * Can't include data in MSG_EOF message. - */ - error = EINVAL; - } else { - mutex_exit(&so->so_lock); - ssa->ssa_state |= SS_ISDISCONNECTING; - sctp_recvd(ssa->ssa_conn, so->so_rcvbuf); - error = sctp_disconnect(ssa->ssa_conn); - mutex_enter(&so->so_lock); - } - goto refrele; - } - - for (;;) { - if (ssa->ssa_state & SS_CANTSENDMORE) { - SSA_REFRELE(ss, ssa); - mutex_exit(&so->so_lock); - tsignal(curthread, SIGPIPE); - return (EPIPE); - } - - if (ssa->ssa_error != 0) { - error = ssa->ssa_error; - ssa->ssa_error = 0; - goto refrele; - } - - if (ssa->ssa_txqueued < so->so_sndbuf) - break; - - if ((uiop->uio_fmode & (FNDELAY|FNONBLOCK)) || - (flags & MSG_DONTWAIT)) { - error = EAGAIN; - goto refrele; - } else { - /* - * Wait for space to become available and try again. - */ - error = cv_wait_sig(&ss->ss_txdata_cv, &so->so_lock); - if (!error) { /* signal */ - error = EINTR; - goto refrele; - } - } - } - - msglen = count = uiop->uio_resid; - - /* Don't allow sending a message larger than the send buffer size. */ - if (msglen > so->so_sndbuf) { - error = EMSGSIZE; - goto refrele; - } - - /* - * Update TX buffer usage here so that we can lift the socket lock. - */ - ssa->ssa_txqueued += msglen; - - mutex_exit(&so->so_lock); - - mctl = sctp_alloc_hdr(msg->msg_name, namelen, msg->msg_control, - optlen, SCTP_CAN_BLOCK); - if (mctl == NULL) { - error = EINTR; - goto lock_rele; - } - - /* Copy in the message. 
*/ - if ((error = sosctp_uiomove(mctl, count, ssa->ssa_wrsize, - ssa->ssa_wroff, uiop, flags, CRED())) != 0) { - goto lock_rele; - } - error = sctp_sendmsg(ssa->ssa_conn, mctl, 0); -lock_rele: - mutex_enter(&so->so_lock); - if (error != 0) { - freemsg(mctl); - ssa->ssa_txqueued -= msglen; - cv_broadcast(&ss->ss_txdata_cv); - if ((error == EPIPE) && (ssa->ssa_state & SS_CANTSENDMORE)) { - /* - * We received shutdown between the time lock was - * lifted and call to sctp_sendmsg(). - */ - SSA_REFRELE(ss, ssa); - mutex_exit(&so->so_lock); - tsignal(curthread, SIGPIPE); - return (EPIPE); - } - } - -refrele: - SSA_REFRELE(ss, ssa); -done: - mutex_exit(&so->so_lock); - return (error); -} - -/* - * Get address of remote node. - */ -static int -sosctp_getpeername(struct sonode *so) -{ - int error; - - if (so->so_type != SOCK_STREAM) { - /* - * SEQPACKET can have multiple end-points. - */ - return (EOPNOTSUPP); - } - - if (!(so->so_state & SS_ISCONNECTED)) { - error = ENOTCONN; - } else { - error = sctp_getpeername(so->so_priv, so->so_faddr_sa, - &so->so_faddr_len); - } - return (error); -} - -/* - * Get local address. - */ -static int -sosctp_getsockname(struct sonode *so) -{ - int error; - - mutex_enter(&so->so_lock); - - if (!(so->so_state & SS_ISBOUND)) { - /* - * Zero address, except for address family - */ - bzero(so->so_laddr_sa, so->so_laddr_maxlen); - - so->so_laddr_len = (so->so_family == AF_INET6) ? - sizeof (struct sockaddr_in6) : sizeof (struct sockaddr_in); - so->so_laddr_sa->sa_family = so->so_family; - error = 0; - mutex_exit(&so->so_lock); - } else { - mutex_exit(&so->so_lock); - - error = sctp_getsockname(so->so_priv, so->so_laddr_sa, - &so->so_laddr_len); - } - - return (error); -} - -/* - * Called from shutdown(). 
- */ -static int -sosctp_shutdown(struct sonode *so, int how) -{ - struct sctp_sonode *ss = SOTOSSO(so); - uint_t state_change; - int error = 0; - short wakesig = 0; - - if (so->so_type == SOCK_SEQPACKET) { - return (EOPNOTSUPP); - } - mutex_enter(&so->so_lock); - so_lock_single(so); - - /* - * SunOS 4.X has no check for datagram sockets. - * 5.X checks that it is connected (ENOTCONN) - * X/Open requires that we check the connected state. - */ - if (!(so->so_state & SS_ISCONNECTED)) { - error = ENOTCONN; - goto done; - } - - /* - * Record the current state and then perform any state changes. - * Then use the difference between the old and new states to - * determine which needs to be done. - */ - state_change = so->so_state; - - switch (how) { - case SHUT_RD: - socantrcvmore(so); - break; - case SHUT_WR: - socantsendmore(so); - break; - case SHUT_RDWR: - socantsendmore(so); - socantrcvmore(so); - break; - default: - error = EINVAL; - goto done; - } - - state_change = so->so_state & ~state_change; - - if (state_change & SS_CANTRCVMORE) { - if (ss->ss_rxdata == NULL) { - cv_signal(&ss->ss_rxdata_cv); - } - wakesig = POLLIN|POLLRDNORM; - - sosctp_sendsig(ss, SCTPSIG_READ); - } - if (state_change & SS_CANTSENDMORE) { - cv_broadcast(&ss->ss_txdata_cv); - wakesig |= POLLOUT; - - so->so_state |= SS_ISDISCONNECTING; - } - mutex_exit(&so->so_lock); - - pollwakeup(&ss->ss_poll_list, wakesig); - - if (state_change & SS_CANTSENDMORE) { - sctp_recvd(so->so_priv, so->so_rcvbuf); - error = sctp_disconnect(so->so_priv); - } - mutex_enter(&so->so_lock); -done: - so_unlock_single(so, SOLOCKED); - mutex_exit(&so->so_lock); - - /* - * HACK: sctp_disconnect() may return EWOULDBLOCK. But this error is - * not documented in standard socket API. Catch it here. - */ - if (error == EWOULDBLOCK) - error = 0; - return (error); -} - -/* - * Get socket options. 
- */ -/*ARGSUSED5*/ -static int -sosctp_getsockopt(struct sonode *so, int level, int option_name, - void *optval, socklen_t *optlenp, int flags) -{ - int error = 0; - void *option = NULL; - socklen_t maxlen = *optlenp; - socklen_t len; - socklen_t optlen; - uint32_t value; - uint8_t buffer[4]; - void *optbuf = &buffer; - - mutex_enter(&so->so_lock); - - if (level == SOL_SOCKET) { - switch (option_name) { - /* Not supported options */ - case SO_SNDTIMEO: - case SO_RCVTIMEO: - case SO_EXCLBIND: - error = ENOPROTOOPT; - eprintsoline(so, error); - goto done; - - case SO_TYPE: - case SO_ERROR: - case SO_DEBUG: - case SO_ACCEPTCONN: - case SO_REUSEADDR: - case SO_KEEPALIVE: - case SO_DONTROUTE: - case SO_BROADCAST: - case SO_USELOOPBACK: - case SO_OOBINLINE: - case SO_SNDBUF: - case SO_RCVBUF: - case SO_SNDLOWAT: - case SO_RCVLOWAT: - case SO_DGRAM_ERRIND: - case SO_PROTOTYPE: - case SO_DOMAIN: - if (maxlen < (t_uscalar_t)sizeof (int32_t)) { - error = EINVAL; - eprintsoline(so, error); - goto done; - } - break; - case SO_LINGER: - if (maxlen < (t_uscalar_t)sizeof (struct linger)) { - error = EINVAL; - eprintsoline(so, error); - goto done; - } - break; - } - len = (t_uscalar_t)sizeof (uint32_t); /* Default */ - option = &value; - - /* - * Most of the SOL_SOCKET level option values are also - * recorded in sockfs. So we can return the recorded value - * here without calling into SCTP. - */ - switch (option_name) { - case SO_TYPE: - value = so->so_type; - goto copyout; - - case SO_ERROR: - value = sogeterr(so); - goto copyout; - - case SO_ACCEPTCONN: - value = (so->so_state & SS_ACCEPTCONN) ? 
- SO_ACCEPTCONN : 0; - goto copyout; - - case SO_DEBUG: - case SO_REUSEADDR: - case SO_KEEPALIVE: - case SO_DONTROUTE: - case SO_BROADCAST: - case SO_USELOOPBACK: - case SO_OOBINLINE: - case SO_DGRAM_ERRIND: - value = (so->so_options & option_name); - goto copyout; - - case SO_SNDBUF: - value = so->so_sndbuf; - goto copyout; - - case SO_RCVBUF: - value = so->so_rcvbuf; - goto copyout; - - case SO_SNDLOWAT: - value = so->so_sndlowat; - goto copyout; - - case SO_RCVLOWAT: - value = so->so_rcvlowat; - goto copyout; - - case SO_PROTOTYPE: - value = IPPROTO_SCTP; - goto copyout; - - case SO_DOMAIN: - value = so->so_family; - goto copyout; - - case SO_LINGER: - option = &so->so_linger; - len = (t_uscalar_t)sizeof (struct linger); - break; - - default: - option = NULL; - break; - } - } - if (level == IPPROTO_SCTP) { - /* - * Should go through ioctl(). - */ - error = EINVAL; - goto done; - } - if (maxlen > sizeof (buffer)) { - optbuf = kmem_alloc(maxlen, KM_SLEEP); - } - optlen = maxlen; - mutex_exit(&so->so_lock); - /* - * If the resulting optlen is greater than the provided maxlen, then - * we sliently trucate. 
- */ - error = sctp_get_opt(so->so_priv, level, option_name, optbuf, &optlen); - mutex_enter(&so->so_lock); - if (error != 0) { - if (option == NULL) { - /* We have no fallback value */ - eprintsoline(so, error); - goto free; - } - error = 0; - goto copyout; - } - - option = optbuf; - len = optlen; - -copyout: - len = MIN(len, maxlen); - bcopy(option, optval, len); - *optlenp = len; - -free: - if (optbuf != &buffer) { - kmem_free(optbuf, maxlen); - } -done: - mutex_exit(&so->so_lock); - return (error); -} - -/* - * Set socket options - */ -static int -sosctp_setsockopt(struct sonode *so, int level, int option_name, - const void *optval, t_uscalar_t optlen) -{ - struct sctp_sonode *ss = SOTOSSO(so); - struct sctp_soassoc *ssa = NULL; - sctp_assoc_t id; - int error, rc; - void *conn = NULL; - - /* X/Open requires this check */ - if (so->so_state & SS_CANTSENDMORE) { - return (EINVAL); - } - if ((option_name == SCTP_UC_SWAP) && (level == IPPROTO_SCTP)) { - error = EOPNOTSUPP; - eprintsoline(so, error); - return (error); - } - - /* Caller allocates aligned optval, or passes null */ - ASSERT(((uintptr_t)optval & (sizeof (t_scalar_t) - 1)) == 0); - - /* No SCTP options should be zero-length */ - if (optlen == 0) { - error = EINVAL; - eprintsoline(so, error); - return (error); - } - - mutex_enter(&so->so_lock); - so_lock_single(so); - - /* - * For some SCTP level options, one can select the association this - * applies to. 
- */ - if (so->so_type == SOCK_STREAM) { - conn = so->so_priv; - } else { - /* - * SOCK_SEQPACKET only - */ - id = 0; - if (level == IPPROTO_SCTP) { - switch (option_name) { - case SCTP_RTOINFO: - case SCTP_ASSOCINFO: - case SCTP_SET_PEER_PRIMARY_ADDR: - case SCTP_PRIMARY_ADDR: - case SCTP_PEER_ADDR_PARAMS: - /* - * Association ID is the first element - * params struct - */ - if (optlen < sizeof (sctp_assoc_t)) { - error = EINVAL; - eprintsoline(so, error); - goto done; - } - id = *(sctp_assoc_t *)optval; - break; - case SCTP_DEFAULT_SEND_PARAM: - if (optlen != sizeof (struct sctp_sndrcvinfo)) { - error = EINVAL; - eprintsoline(so, error); - goto done; - } - id = ((struct sctp_sndrcvinfo *) - optval)->sinfo_assoc_id; - break; - case SCTP_INITMSG: - /* - * Only applies to future associations - */ - conn = so->so_priv; - break; - default: - break; - } - } else if (level == SOL_SOCKET) { - if (option_name == SO_LINGER) { - error = EOPNOTSUPP; - eprintsoline(so, error); - goto done; - } - /* - * These 2 options are applied to all associations. - * The other socket level options are only applied - * to the socket (not associations). - */ - if ((option_name != SO_RCVBUF) && - (option_name != SO_SNDBUF)) { - conn = so->so_priv; - } - } else { - conn = NULL; - } - - /* - * If association ID was specified, do op on that assoc. - * Otherwise set the default setting of a socket. 
- */ - if (id != 0) { - if ((error = sosctp_assoc(ss, id, &ssa)) != 0) { - eprintsoline(so, error); - goto done; - } - conn = ssa->ssa_conn; - } - } - dprint(2, ("sosctp_setsockopt %p (%d) - conn %p %d %d id:%d\n", - (void *)ss, so->so_type, conn, level, option_name, id)); - - ASSERT(ssa == NULL || (ssa != NULL && conn != NULL)); - if (conn != NULL) { - mutex_exit(&so->so_lock); - error = sctp_set_opt(conn, level, option_name, optval, optlen); - mutex_enter(&so->so_lock); - if (ssa != NULL) - SSA_REFRELE(ss, ssa); - } else { - /* - * 1-N socket, and we have to apply the operation to ALL - * associations. Like with anything of this sort, the - * problem is what to do if the operation fails. - * Just try to apply the setting to everyone, but store - * error number if someone returns such. And since we are - * looping through all possible aids, some of them can be - * invalid. We just ignore this kind (sosctp_assoc()) of - * errors. - */ - sctp_assoc_t aid; - - mutex_exit(&so->so_lock); - error = sctp_set_opt(so->so_priv, level, option_name, optval, - optlen); - mutex_enter(&so->so_lock); - for (aid = 1; aid < ss->ss_maxassoc; aid++) { - if (sosctp_assoc(ss, aid, &ssa) != 0) - continue; - mutex_exit(&so->so_lock); - rc = sctp_set_opt(ssa->ssa_conn, level, option_name, - optval, optlen); - mutex_enter(&so->so_lock); - SSA_REFRELE(ss, ssa); - if (error == 0) { - error = rc; - } - } - } - /* - * Check for SOL_SOCKET options and record their values. - * If we know about a SOL_SOCKET parameter and the transport - * failed it with TBADOPT or TOUTSTATE (i.e. ENOPROTOOPT or - * EPROTO) we let the setsockopt succeed. 
- */ - if (level == SOL_SOCKET) { - boolean_t handled = B_FALSE; - - /* Check parameters */ - switch (option_name) { - case SO_DEBUG: - case SO_REUSEADDR: - case SO_KEEPALIVE: - case SO_DONTROUTE: - case SO_BROADCAST: - case SO_USELOOPBACK: - case SO_OOBINLINE: - case SO_SNDBUF: - case SO_RCVBUF: - case SO_SNDLOWAT: - case SO_RCVLOWAT: - case SO_DGRAM_ERRIND: - if (optlen != (t_uscalar_t)sizeof (int32_t)) { - error = EINVAL; - eprintsoline(so, error); - goto done; - } - ASSERT(optval); - handled = B_TRUE; - break; - case SO_LINGER: - if (optlen != (t_uscalar_t)sizeof (struct linger)) { - error = EINVAL; - eprintsoline(so, error); - goto done; - } - ASSERT(optval); - handled = B_TRUE; - break; - } - -#define intvalue (*(int32_t *)optval) - - switch (option_name) { - case SO_SNDTIMEO: - case SO_RCVTIMEO: - case SO_EXCLBIND: - case SO_TYPE: - case SO_ERROR: - case SO_ACCEPTCONN: - case SO_PROTOTYPE: - case SO_DOMAIN: - /* Can't be set */ - error = ENOPROTOOPT; - goto done; - case SO_LINGER: { - struct linger *l = (struct linger *)optval; - - so->so_linger.l_linger = l->l_linger; - if (l->l_onoff) { - so->so_linger.l_onoff = SO_LINGER; - so->so_options |= SO_LINGER; - } else { - so->so_linger.l_onoff = 0; - so->so_options &= ~SO_LINGER; - } - break; - } - - case SO_DEBUG: - case SO_REUSEADDR: - case SO_KEEPALIVE: - case SO_DONTROUTE: - case SO_BROADCAST: - case SO_USELOOPBACK: - case SO_OOBINLINE: - case SO_DGRAM_ERRIND: - if (intvalue != 0) { - dprintso(so, 1, - ("sosctp_setsockopt: setting 0x%x\n", - option_name)); - so->so_options |= option_name; - } else { - dprintso(so, 1, - ("sosctp_setsockopt: clearing 0x%x\n", - option_name)); - so->so_options &= ~option_name; - } - break; - /* - * The following options are only returned by us when - * the sctp_set_opt fails. - * XXX XPG 4.2 applications retrieve SO_RCVBUF from - * sockfs since the transport might adjust the value - * and not return exactly what was set by the - * application. 
- */ - case SO_SNDBUF: - so->so_sndbuf = intvalue; - if (so->so_sndlowat > so->so_sndbuf) { - so->so_sndlowat = so->so_sndbuf; - } - break; - case SO_RCVBUF: - so->so_rcvbuf = intvalue; - if (so->so_rcvlowat > so->so_rcvbuf) { - so->so_rcvlowat = so->so_rcvbuf; - } - break; - case SO_SNDLOWAT: - so->so_sndlowat = intvalue; - if (so->so_sndlowat > so->so_sndbuf) { - so->so_sndlowat = so->so_sndbuf; - } - break; - case SO_RCVLOWAT: - so->so_rcvlowat = intvalue; - if (so->so_rcvlowat > so->so_rcvbuf) { - so->so_rcvlowat = so->so_rcvbuf; - } - break; - } -#undef intvalue - - if (error != 0) { - if ((error == ENOPROTOOPT || error == EPROTO || - error == EINVAL) && handled) { - dprintso(so, 1, - ("sosctp_setsockopt: ignoring error %d " - "for 0x%x\n", error, option_name)); - error = 0; - } - } - } - -done: - so_unlock_single(so, SOLOCKED); - mutex_exit(&so->so_lock); - - return (error); -} - -/* - * Upcalls from SCTP - */ - -/* - * Incoming connection on listen socket. - */ -static void * -sctp_sock_newconn(void *parenthandle, void *connind) -{ - struct sctp_sonode *lss = parenthandle; - struct sonode *lso = &lss->ss_so; - struct sonode *nso; - struct sctp_sonode *nss; - mblk_t *mp; - int error; - - ASSERT(lso->so_state & SS_ACCEPTCONN); - ASSERT(lso->so_priv != NULL); /* closed conn */ - ASSERT(lso->so_type == SOCK_STREAM); - - /* - * Check current # of queued conns against backlog - */ - if (lss->ss_rxqueued >= lso->so_backlog) { - return (NULL); - } - - /* - * Need to create a new socket. 
- */ - mp = allocb(sizeof (nso), BPRI_MED); - if (mp == NULL) { - eprintsoline(lso, ENOMEM); - return (NULL); - } - DB_TYPE(mp) = M_PROTO; - - VN_HOLD(lso->so_accessvp); - nso = sosctp_create(lso->so_accessvp, lso->so_family, lso->so_type, - lso->so_protocol, lso->so_version, lso, &error); - if (nso == NULL) { - VN_RELE(lso->so_accessvp); - freeb(mp); - eprintsoline(lso, error); - return (NULL); - } - - dprint(2, ("sctp_stream_newconn: new %p\n", (void *)nso)); - - nss = SOTOSSO(nso); - - /* - * Inherit socket properties - */ - mutex_enter(&lso->so_lock); - mutex_enter(&nso->so_lock); - - nso->so_state |= (SS_ISBOUND | SS_ISCONNECTED | - (lso->so_state & SS_ASYNC)); - sosctp_so_inherit(lss, nss); - nso->so_priv = connind; - - mutex_exit(&nso->so_lock); - - ++lss->ss_rxqueued; - mutex_exit(&lso->so_lock); - - /* - * Copy pointer to new socket to connind queue message - */ - *(struct sonode **)mp->b_wptr = nso; - mp->b_wptr += sizeof (nso); - - /* - * Wake people who're waiting incoming conns. Note that - * soqueueconnind gets so_lock. - */ - soqueueconnind(lso, mp); - pollwakeup(&lss->ss_poll_list, POLLIN|POLLRDNORM); - - mutex_enter(&lso->so_lock); - sosctp_sendsig(lss, SCTPSIG_READ); - mutex_exit(&lso->so_lock); - - return (nss); -} - -/* - * This is the upcall function for 1-N (SOCK_SEQPACKET) socket when a new - * association is created. Note that the first argument (handle) is of type - * sctp_sonode *, which is the one changed to a listener for new - * associations. All the other upcalls for 1-N socket take sctp_soassoc * - * as handle. The only exception is the su_properties upcall, which - * can take both types as handle. 
- */ -static void * -sctp_assoc_newconn(void *parenthandle, void *connind) -{ - struct sctp_sonode *lss = (struct sctp_sonode *)parenthandle; - struct sonode *lso = &lss->ss_so; - struct sctp_soassoc *ssa; - sctp_assoc_t id; - - ASSERT(lss->ss_type == SOSCTP_SOCKET); - ASSERT(lso->so_state & SS_ACCEPTCONN); - ASSERT(lso->so_priv != NULL); /* closed conn */ - ASSERT(lso->so_type == SOCK_SEQPACKET); - - mutex_enter(&lso->so_lock); - - if ((id = sosctp_aid_get(lss)) == -1) { - /* - * Array not large enough; increase size. - */ - if (sosctp_aid_grow(lss, lss->ss_maxassoc, KM_NOSLEEP) < 0) { - mutex_exit(&lso->so_lock); - return (NULL); - } - id = sosctp_aid_get(lss); - ASSERT(id != -1); - } - - /* - * Create soassoc for this connection - */ - ssa = sosctp_assoc_create(lss, KM_NOSLEEP); - if (ssa == NULL) { - mutex_exit(&lso->so_lock); - return (NULL); - } - sosctp_aid_reserve(lss, id, 1); - lss->ss_assocs[id].ssi_assoc = ssa; - ++lss->ss_assoccnt; - ssa->ssa_id = id; - ssa->ssa_conn = connind; - ssa->ssa_state = (SS_ISBOUND | SS_ISCONNECTED); - ssa->ssa_wroff = lss->ss_wroff; - ssa->ssa_wrsize = lss->ss_wrsize; - - mutex_exit(&lso->so_lock); - - return (ssa); -} - -/* - * For outgoing connections, the connection has been established. - */ -static void -sctp_sock_connected(void *handle) -{ - struct sctp_sonode *ss = handle; - struct sonode *so = &ss->ss_so; - - ASSERT(so->so_type == SOCK_STREAM); - - mutex_enter(&so->so_lock); - ASSERT(so->so_priv); /* closed conn */ - - ASSERT(!(so->so_state & SS_ACCEPTCONN)); - soisconnected(so); - - sosctp_sendsig(ss, SCTPSIG_WRITE); - - mutex_exit(&so->so_lock); - - /* - * Wake ones who're waiting for conn to become established. 
- */ - pollwakeup(&ss->ss_poll_list, POLLOUT); -} - -static void -sctp_assoc_connected(void *handle) -{ - struct sctp_soassoc *ssa = handle; - struct sonode *so = &ssa->ssa_sonode->ss_so; - - ASSERT(so->so_type == SOCK_SEQPACKET); - ASSERT(ssa->ssa_conn); - - mutex_enter(&so->so_lock); - sosctp_assoc_isconnected(ssa); - mutex_exit(&so->so_lock); -} - -/* - * Connection got disconnected. Either with an error, or through - * normal handshake. - * Note that there is no half-closed conn, like TCP. - */ -static int -sctp_sock_disconnected(void *handle, int error) -{ - struct sctp_sonode *ss = handle; - struct sonode *so = &ss->ss_so; - int event = 0; - - ASSERT(so->so_type == SOCK_STREAM); - - mutex_enter(&so->so_lock); - ASSERT(so->so_priv != NULL); /* closed conn */ - - /* - * Connection is gone, wake everybody. - */ - if (ss->ss_rxdata == NULL) { - cv_signal(&ss->ss_rxdata_cv); - } - cv_broadcast(&ss->ss_txdata_cv); - - /* - * If socket is already disconnected/disconnecting, - * don't (re)send signal. - */ - if (!(so->so_state & SS_CANTRCVMORE)) - event |= SCTPSIG_READ; - if (!(so->so_state & SS_CANTSENDMORE)) - event |= SCTPSIG_WRITE; - if (event != 0) - sosctp_sendsig(ss, event); - - soisdisconnected(so, error); - mutex_exit(&so->so_lock); - - pollwakeup(&ss->ss_poll_list, POLLIN|POLLRDNORM|POLLOUT); - - return (0); -} - -static int -sctp_assoc_disconnected(void *handle, int error) -{ - struct sctp_soassoc *ssa = handle; - struct sctp_sonode *ss = ssa->ssa_sonode; - struct sonode *so = &ssa->ssa_sonode->ss_so; - int ret; - - ASSERT(so->so_type == SOCK_SEQPACKET); - ASSERT(ssa->ssa_conn != NULL); - - mutex_enter(&so->so_lock); - sosctp_assoc_isdisconnected(ssa, error); - if (ssa->ssa_refcnt == 1) { - ret = 1; - ssa->ssa_conn = NULL; - } else { - ret = 0; - } - SSA_REFRELE(SOTOSSO(so), ssa); - - cv_broadcast(&ss->ss_txdata_cv); - - mutex_exit(&so->so_lock); - - return (ret); -} - -/* - * Peer sent a shutdown. 
After this point writes are not allowed - * to this socket, but one might still receive notifications - * (e.g. for data which never got sent). - */ -static void -sctp_sock_disconnecting(void *handle) -{ - struct sctp_sonode *ss = handle; - struct sonode *so = &ss->ss_so; - - ASSERT(so->so_type == SOCK_STREAM); - - mutex_enter(&so->so_lock); - ASSERT(so->so_priv != NULL); /* closed conn */ - - /* - * Socket not writeable anymore. Wake writers, and ones - * who're waiting on socket state change - */ - cv_broadcast(&ss->ss_txdata_cv); - - if (!(so->so_state & SS_CANTSENDMORE)) { - /* - * If socket already un-writeable, don't (re)send signal. - */ - sosctp_sendsig(ss, SCTPSIG_WRITE); - } - so->so_state &= ~(SS_ISCONNECTING); - so->so_state |= SS_CANTSENDMORE; - cv_broadcast(&so->so_state_cv); - mutex_exit(&so->so_lock); - - pollwakeup(&ss->ss_poll_list, POLLOUT); -} - -static void -sctp_assoc_disconnecting(void *handle) -{ - struct sctp_soassoc *ssa = handle; - struct sonode *so = &ssa->ssa_sonode->ss_so; - - ASSERT(so->so_type == SOCK_SEQPACKET); - ASSERT(ssa->ssa_conn != NULL); - - mutex_enter(&so->so_lock); - sosctp_assoc_isdisconnecting(ssa); - mutex_exit(&so->so_lock); -} - -/* - * Incoming data. - */ -static int -sctp_sock_recv(void *handle, mblk_t *mp, int flags) -{ - struct sctp_sonode *ss = handle; - struct sonode *so = &ss->ss_so; - int msglen; -#if defined(DEBUG) && !defined(lint) - union T_primitives *tpr; -#endif - - ASSERT(so->so_type == SOCK_STREAM); - ASSERT(mp != NULL); - ASSERT(!(so->so_state & SS_ACCEPTCONN)); - - /* - * Should be getting T_unitdata_req's only. - * Must have address as part of packet. 
- */ -#if defined(DEBUG) && !defined(lint) - tpr = (union T_primitives *)mp->b_rptr; - ASSERT((DB_TYPE(mp) == M_PROTO) && - (tpr->type == T_UNITDATA_IND)); - ASSERT((tpr->unitdata_ind.SRC_length)); -#endif - - /* - * First mblk has only unitdata_req - */ - msglen = msgsize(mp->b_cont); - - mutex_enter(&so->so_lock); - ASSERT(so->so_priv); /* closed conn */ - - if (so->so_state & SS_CANTRCVMORE) { - mutex_exit(&so->so_lock); - freemsg(mp); - return (so->so_rcvbuf); - } - if (ss->ss_rxdata == NULL) { - cv_signal(&ss->ss_rxdata_cv); - } - *ss->ss_rxtail = mp; - ss->ss_rxtail = &mp->b_next; - ss->ss_rxqueued += msglen; - - sosctp_sendsig(ss, SCTPSIG_READ); - - /* - * Override b_flag for SCTP sockfs internal use - */ - mp->b_flag = (short)flags; - - mutex_exit(&so->so_lock); - - pollwakeup(&ss->ss_poll_list, POLLIN|POLLRDNORM); - - return (so->so_rcvbuf - ss->ss_rxqueued); -} - -static int -sctp_assoc_recv(void *handle, mblk_t *mp, int flags) -{ - struct sctp_soassoc *ssa = handle; - struct sctp_sonode *ss = ssa->ssa_sonode; - struct sonode *so = &ss->ss_so; - struct T_unitdata_ind *tind; - int msglen; - mblk_t *mp2; - union sctp_notification *sn; - struct sctp_sndrcvinfo *sinfo; - - ASSERT(ssa->ssa_type == SOSCTP_ASSOC); - ASSERT(so->so_type == SOCK_SEQPACKET); - ASSERT(ssa->ssa_conn != NULL); /* closed conn */ - ASSERT(mp != NULL); - - /* - * Should be getting T_unitdata_req's only. - * Must have address as part of packet. - */ - tind = (struct T_unitdata_ind *)mp->b_rptr; - ASSERT((DB_TYPE(mp) == M_PROTO) && - (tind->PRIM_type == T_UNITDATA_IND)); - ASSERT(tind->SRC_length); - - /* - * First mblk has only unitdata_req - */ - msglen = msgsize(mp->b_cont); - - mutex_enter(&so->so_lock); - - /* - * Override b_flag for SCTP sockfs internal use - */ - mp->b_flag = (short)flags; - - /* - * For notify messages, need to fill in association id. - * For data messages, sndrcvinfo could be in ancillary data. 
- */ - if (flags & SCTP_NOTIFICATION) { - mp2 = mp->b_cont; - sn = (union sctp_notification *)mp2->b_rptr; - switch (sn->sn_header.sn_type) { - case SCTP_ASSOC_CHANGE: - sn->sn_assoc_change.sac_assoc_id = ssa->ssa_id; - break; - case SCTP_PEER_ADDR_CHANGE: - sn->sn_paddr_change.spc_assoc_id = ssa->ssa_id; - break; - case SCTP_REMOTE_ERROR: - sn->sn_remote_error.sre_assoc_id = ssa->ssa_id; - break; - case SCTP_SEND_FAILED: - sn->sn_send_failed.ssf_assoc_id = ssa->ssa_id; - break; - case SCTP_SHUTDOWN_EVENT: - sn->sn_shutdown_event.sse_assoc_id = ssa->ssa_id; - break; - case SCTP_ADAPTATION_INDICATION: - sn->sn_adaptation_event.sai_assoc_id = ssa->ssa_id; - break; - case SCTP_PARTIAL_DELIVERY_EVENT: - sn->sn_pdapi_event.pdapi_assoc_id = ssa->ssa_id; - break; - default: - ASSERT(0); - break; - } - } else { - if (tind->OPT_length > 0) { - struct cmsghdr *cmsg; - char *cend; - - cmsg = (struct cmsghdr *) - ((uchar_t *)mp->b_rptr + tind->OPT_offset); - cend = (char *)cmsg + tind->OPT_length; - for (;;) { - if ((char *)(cmsg + 1) > cend || - ((char *)cmsg + cmsg->cmsg_len) > cend) { - break; - } - if ((cmsg->cmsg_level == IPPROTO_SCTP) && - (cmsg->cmsg_type == SCTP_SNDRCV)) { - sinfo = (struct sctp_sndrcvinfo *) - (cmsg + 1); - sinfo->sinfo_assoc_id = ssa->ssa_id; - break; - } - if (cmsg->cmsg_len > 0) { - cmsg = (struct cmsghdr *) - ((uchar_t *)cmsg + cmsg->cmsg_len); - } else { - break; - } - } - } - } - - /* - * SCTP has reserved space in the header for storing a pointer. - * Put the pointer to assocation there, and queue the data. 
- */ - SSA_REFHOLD(ssa); - ASSERT((mp->b_rptr - DB_BASE(mp)) >= sizeof (ssa)); - *(struct sctp_soassoc **)DB_BASE(mp) = ssa; - - if (ss->ss_rxdata == NULL) { - cv_signal(&ss->ss_rxdata_cv); - } - *ss->ss_rxtail = mp; - ss->ss_rxtail = &mp->b_next; - ssa->ssa_rxqueued += msglen; - - sosctp_sendsig(ss, SCTPSIG_READ); - - mutex_exit(&so->so_lock); - - pollwakeup(&ss->ss_poll_list, POLLIN|POLLRDNORM); - - return (so->so_rcvbuf - ssa->ssa_rxqueued); -} - -/* - * TX queued data got acknowledged. Frees up space in TX queue. - */ -static void -sctp_sock_xmitted(void *handle, int txqueued) -{ - struct sctp_sonode *ss = handle; - struct sonode *so = &ss->ss_so; - boolean_t writeable; - - mutex_enter(&so->so_lock); - ASSERT(so->so_priv != NULL); /* closed conn */ - - if (ss->ss_txqueued < so->so_sndlowat) { - writeable = B_TRUE; - } else { - writeable = B_FALSE; - } - ss->ss_txqueued = txqueued; - - /* - * Wake blocked writers. - */ - cv_broadcast(&ss->ss_txdata_cv); - - /* - * Only do pollwakeup if the amount of queued data is less than - * watermark, and the socket wasn't writeable before. - */ - if (!writeable && (ss->ss_txqueued < so->so_sndlowat)) { - sosctp_sendsig(ss, SCTPSIG_WRITE); - mutex_exit(&so->so_lock); - pollwakeup(&ss->ss_poll_list, POLLOUT); - } else { - mutex_exit(&so->so_lock); - } -} - -static void -sctp_assoc_xmitted(void *handle, int txqueued) -{ - struct sctp_soassoc *ssa = handle; - struct sctp_sonode *ss = ssa->ssa_sonode; - - ASSERT(ssa->ssa_type == SOSCTP_ASSOC); - ASSERT(ss->ss_so.so_type == SOCK_SEQPACKET); - ASSERT(ssa->ssa_conn != NULL); - - mutex_enter(&ss->ss_so.so_lock); - - ssa->ssa_txqueued = txqueued; - - /* - * Wake blocked writers. - */ - cv_broadcast(&ss->ss_txdata_cv); - - mutex_exit(&ss->ss_so.so_lock); -} - -/* - * SCTP notifies socket about write offset and amount of TX data per mblk. 
- */ -static void -sctp_sock_properties(void *handle, int wroff, size_t maxblk) -{ - struct sctp_sonode *ss = handle; - - ASSERT(ss->ss_so.so_type == SOCK_STREAM); - - mutex_enter(&ss->ss_so.so_lock); - - ASSERT(ss->ss_so.so_priv != NULL); /* closed conn */ - - /* - * Only change them if they're set. - */ - if (wroff != 0) { - ss->ss_wroff = wroff; - } - if (maxblk != 0) { - ss->ss_wrsize = maxblk; - } - mutex_exit(&ss->ss_so.so_lock); -} - -static void -sctp_assoc_properties(void *handle, int wroff, size_t maxblk) -{ - struct sctp_soassoc *ssa = handle; - struct sctp_sonode *ss; - - if (ssa->ssa_type == SOSCTP_ASSOC) { - ss = ssa->ssa_sonode; - mutex_enter(&ss->ss_so.so_lock); - - /* - * Only change them if they're set. - */ - if (wroff != 0) { - ssa->ssa_wroff = wroff; - } - if (maxblk != 0) { - ssa->ssa_wrsize = maxblk; - } - } else { - ss = (struct sctp_sonode *)handle; - mutex_enter(&ss->ss_so.so_lock); - - if (wroff != 0) { - ss->ss_wroff = wroff; - } - if (maxblk != 0) { - ss->ss_wrsize = maxblk; - } - } - - mutex_exit(&ss->ss_so.so_lock); -} diff --git a/usr/src/uts/common/fs/sockfs/socksctpvnops.c b/usr/src/uts/common/fs/sockfs/socksctpvnops.c deleted file mode 100644 index b59bb8d163..0000000000 --- a/usr/src/uts/common/fs/sockfs/socksctpvnops.c +++ /dev/null @@ -1,875 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#include <sys/types.h> -#include <sys/t_lock.h> -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/buf.h> -#include <sys/debug.h> -#include <sys/errno.h> -#include <sys/uio.h> -#include <sys/vfs.h> -#include <sys/vfs_opreg.h> -#include <sys/vnode.h> -#include <sys/stropts.h> -#include <sys/cmn_err.h> -#include <sys/sysmacros.h> -#include <sys/stream.h> -#include <sys/strsun.h> - -#include <sys/socket.h> -#include <sys/socketvar.h> - -#include <sys/project.h> -#include <sys/strsubr.h> - -#include <fs/fs_subr.h> - -#include <sys/esunddi.h> -#include <sys/ddi.h> - -#include <sys/filio.h> -#include <sys/sockio.h> - -#include <netinet/sctp.h> -#include <inet/sctp_itf.h> -#include "socksctp.h" - -/* - * SCTP sockfs vnode operations - */ -static int socksctpv_open(struct vnode **, int, struct cred *, - caller_context_t *); -static int socksctpv_close(struct vnode *, int, int, offset_t, - struct cred *, caller_context_t *); -static int socksctpv_read(struct vnode *, struct uio *, int, struct cred *, - caller_context_t *); -static int socksctpv_write(struct vnode *, struct uio *, int, struct cred *, - caller_context_t *); -static int socksctpv_ioctl(struct vnode *, int, intptr_t, int, - struct cred *, int32_t *, caller_context_t *); -static int socksctp_setfl(vnode_t *, int, int, cred_t *, caller_context_t *); -static void socksctpv_inactive(struct vnode *, struct cred *, - caller_context_t *); -static int socksctpv_poll(struct vnode *, short, int, short *, - struct pollhead **, caller_context_t *); - -const fs_operation_def_t socksctp_vnodeops_template[] = { - VOPNAME_OPEN, { .vop_open = socksctpv_open }, - 
VOPNAME_CLOSE, { .vop_close = socksctpv_close }, - VOPNAME_READ, { .vop_read = socksctpv_read }, - VOPNAME_WRITE, { .vop_write = socksctpv_write }, - VOPNAME_IOCTL, { .vop_ioctl = socksctpv_ioctl }, - VOPNAME_SETFL, { .vop_setfl = socksctp_setfl }, - VOPNAME_GETATTR, { .vop_getattr = socktpi_getattr }, - VOPNAME_SETATTR, { .vop_setattr = socktpi_setattr }, - VOPNAME_ACCESS, { .vop_access = socktpi_access }, - VOPNAME_FSYNC, { .vop_fsync = socktpi_fsync }, - VOPNAME_INACTIVE, { .vop_inactive = socksctpv_inactive }, - VOPNAME_FID, { .vop_fid = socktpi_fid }, - VOPNAME_SEEK, { .vop_seek = socktpi_seek }, - VOPNAME_POLL, { .vop_poll = socksctpv_poll }, - VOPNAME_DISPOSE, { .error = fs_error }, - NULL, NULL -}; -struct vnodeops *socksctp_vnodeops; - -/*ARGSUSED3*/ -static int -socksctpv_open(struct vnode **vpp, int flag, struct cred *cr, - caller_context_t *ct) -{ - struct sonode *so; - struct sctp_sonode *ss; - struct vnode *vp = *vpp; - int error = 0; - sctp_sockbuf_limits_t sbl; - sctp_upcalls_t *upcalls; - - flag &= ~FCREAT; /* paranoia */ - - so = VTOSO(vp); - ss = SOTOSSO(so); - - mutex_enter(&so->so_lock); - so->so_count++; /* one more open reference */ - ASSERT(so->so_count != 0); /* wraparound */ - mutex_exit(&so->so_lock); - - ASSERT(vp->v_type == VSOCK); - - if (flag & SO_ACCEPTOR) { - ASSERT(so->so_type == SOCK_STREAM); - /* - * Protocol control block already created - */ - return (0); - } - - /* - * Active open. 
- */ - if (so->so_type == SOCK_STREAM) { - upcalls = &sosctp_sock_upcalls; - } else { - ASSERT(so->so_type == SOCK_SEQPACKET); - upcalls = &sosctp_assoc_upcalls; - } - so->so_priv = sctp_create(ss, NULL, so->so_family, SCTP_CAN_BLOCK, - upcalls, &sbl, cr); - if (so->so_priv == NULL) { - error = ENOMEM; - mutex_enter(&so->so_lock); - ASSERT(so->so_count > 0); - so->so_count--; /* one less open reference */ - mutex_exit(&so->so_lock); - } - so->so_rcvbuf = sbl.sbl_rxbuf; - so->so_rcvlowat = sbl.sbl_rxlowat; - so->so_sndbuf = sbl.sbl_txbuf; - so->so_sndlowat = sbl.sbl_txlowat; - - return (error); -} - -/*ARGSUSED*/ -static int -socksctpv_close(struct vnode *vp, int flag, int count, offset_t offset, - struct cred *cr, caller_context_t *ct) -{ - struct sonode *so; - struct sctp_sonode *ss; - struct sctp_sa_id *ssi; - struct sctp_soassoc *ssa; - int sendsig = 0; - int32_t i; - - so = VTOSO(vp); - ss = SOTOSSO(so); - - cleanlocks(vp, ttoproc(curthread)->p_pid, 0); - cleanshares(vp, ttoproc(curthread)->p_pid); - - ASSERT(vp->v_stream == NULL); - if (count > 1) { - dprint(2, ("socksctpv_close: count %d\n", count)); - return (0); - } - - mutex_enter(&so->so_lock); - so_lock_single(so); /* Set SOLOCKED */ - ASSERT(so->so_count > 0); - so->so_count--; /* one fewer open reference */ - - dprint(2, ("socksctpv_close: %p so_count %d\n", (void *)so, - so->so_count)); - - if (so->so_count == 0) { - /* - * Need to set flags as there might be ops in progress on - * this socket. - * - * If socket already disconnected/disconnecting, - * don't send signal (again). - */ - if (!(so->so_state & SS_CANTRCVMORE)) - sendsig |= SCTPSIG_READ; - if (!(so->so_state & SS_CANTSENDMORE)) - sendsig |= SCTPSIG_WRITE; - soisdisconnected(so, 0); - mutex_exit(&so->so_lock); - - /* - * Initiate connection shutdown. Update SCTP's receive - * window. 
- */ - sctp_recvd(so->so_priv, so->so_rcvbuf - ss->ss_rxqueued); - (void) sctp_disconnect(so->so_priv); - - /* - * New associations can't come in, but old ones might get - * closed in upcall. Protect against that by taking a reference - * on the association. - */ - mutex_enter(&so->so_lock); - ssi = ss->ss_assocs; - for (i = 0; i < ss->ss_maxassoc; i++, ssi++) { - if ((ssa = ssi->ssi_assoc) != NULL) { - SSA_REFHOLD(ssa); - sosctp_assoc_isdisconnected(ssa, 0); - mutex_exit(&so->so_lock); - - sctp_recvd(ssa->ssa_conn, so->so_rcvbuf - - ssa->ssa_rxqueued); - (void) sctp_disconnect(ssa->ssa_conn); - - mutex_enter(&so->so_lock); - SSA_REFRELE(ss, ssa); - } - } - if (sendsig != 0) { - sosctp_sendsig(ss, sendsig); - } - mutex_exit(&so->so_lock); - pollwakeup(&ss->ss_poll_list, POLLIN|POLLRDNORM|POLLOUT); - mutex_enter(&so->so_lock); - } - so_unlock_single(so, SOLOCKED); - mutex_exit(&so->so_lock); - - return (0); -} - -/*ARGSUSED2*/ -static int -socksctpv_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr, - caller_context_t *ct) -{ - struct sonode *so = VTOSO(vp); - struct nmsghdr lmsg; - - if (so->so_type != SOCK_STREAM) { - return (EOPNOTSUPP); - } - - ASSERT(vp->v_type == VSOCK); - so_update_attrs(so, SOACC); - lmsg.msg_namelen = 0; - lmsg.msg_controllen = 0; - lmsg.msg_flags = 0; - return (sosctp_recvmsg(so, &lmsg, uiop)); -} - -/* - * Send data, see sosctp_sendmsg() - */ -/*ARGSUSED2*/ -static int -socksctpv_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr, - caller_context_t *ct) -{ - struct sctp_sonode *ss; - struct sonode *so; - mblk_t *head; - ssize_t count, msglen; - int error; - - so = VTOSO(vp); - ss = SOTOSSO(so); - - if (so->so_type != SOCK_STREAM) { - return (EOPNOTSUPP); - } - - mutex_enter(&so->so_lock); - - for (;;) { - if (so->so_state & SS_CANTSENDMORE) { - mutex_exit(&so->so_lock); - tsignal(curthread, SIGPIPE); - return (EPIPE); - } - - if (so->so_error != 0) { - error = sogeterr(so); - if (error != 0) { - 
mutex_exit(&so->so_lock); - return (error); - } - } - - if (ss->ss_txqueued < so->so_sndbuf) - break; - - if (uiop->uio_fmode & (FNDELAY|FNONBLOCK)) { - mutex_exit(&so->so_lock); - return (EAGAIN); - } else { - /* - * Xmit window full in a blocking socket. - * Wait for space to become available and try again. - */ - error = cv_wait_sig(&ss->ss_txdata_cv, &so->so_lock); - if (error == 0) { /* signal */ - mutex_exit(&so->so_lock); - return (EINTR); - } - } - } - - if (!(so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED))) { - mutex_exit(&so->so_lock); - return (ENOTCONN); - } - - msglen = count = uiop->uio_resid; - /* Don't allow sending a message larger than the send buffer size. */ - if (msglen > so->so_sndbuf) { - mutex_exit(&so->so_lock); - return (EMSGSIZE); - } - ss->ss_txqueued += msglen; - - mutex_exit(&so->so_lock); - - if (count == 0) { - return (0); - } - - head = sctp_alloc_hdr(NULL, 0, NULL, 0, SCTP_CAN_BLOCK); - if (head == NULL) { - error = EINTR; - goto error_ret; - } - - /* Copy in the message. 
*/ - if ((error = sosctp_uiomove(head, count, ss->ss_wrsize, ss->ss_wroff, - uiop, 0, cr)) != 0) { - goto error_ret; - } - so_update_attrs(so, SOMOD); - - error = sctp_sendmsg(so->so_priv, head, 0); - if (error == 0) - return (0); - -error_ret: - mutex_enter(&so->so_lock); - ss->ss_txqueued -= msglen; - cv_broadcast(&ss->ss_txdata_cv); - mutex_exit(&so->so_lock); - freemsg(head); - return (error); -} - -/*ARGSUSED4*/ -static int -socksctpv_ioctl(struct vnode *vp, int cmd, intptr_t arg, int mode, - struct cred *cr, int32_t *rvalp, caller_context_t *ct) -{ - struct sonode *so; - struct sctp_sonode *ss; - int32_t value; - int error; - int intval; - pid_t pid; - struct sctp_soassoc *ssa; - void *conn; - void *buf; - STRUCT_DECL(sctpopt, opt); - uint32_t optlen; - int buflen; - - so = VTOSO(vp); - ss = SOTOSSO(so); - - /* handle socket specific ioctls */ - switch (cmd) { - case FIONBIO: - if (so_copyin((void *)arg, &value, sizeof (int32_t), - (mode & (int)FKIOCTL))) { - return (EFAULT); - } - mutex_enter(&so->so_lock); - if (value) { - so->so_state |= SS_NDELAY; - } else { - so->so_state &= ~SS_NDELAY; - } - mutex_exit(&so->so_lock); - return (0); - - case FIOASYNC: - if (so_copyin((void *)arg, &value, sizeof (int32_t), - (mode & (int)FKIOCTL))) { - return (EFAULT); - } - mutex_enter(&so->so_lock); - - if (value) { - /* Turn on SIGIO */ - so->so_state |= SS_ASYNC; - } else { - /* Turn off SIGIO */ - so->so_state &= ~SS_ASYNC; - } - mutex_exit(&so->so_lock); - return (0); - - case SIOCSPGRP: - case FIOSETOWN: - if (so_copyin((void *)arg, &pid, sizeof (pid_t), - (mode & (int)FKIOCTL))) { - return (EFAULT); - } - mutex_enter(&so->so_lock); - - error = (pid != so->so_pgrp) ? sosctp_chgpgrp(ss, pid) : 0; - mutex_exit(&so->so_lock); - return (error); - - case SIOCGPGRP: - case FIOGETOWN: - if (so_copyout(&so->so_pgrp, (void *)arg, - sizeof (pid_t), (mode & (int)FKIOCTL))) - return (EFAULT); - return (0); - - case SIOCATMARK: - /* - * No support for urgent data. 
- */ - intval = 0; - - if (so_copyout(&intval, (void *)arg, sizeof (int), - (mode & (int)FKIOCTL))) - return (EFAULT); - return (0); - - /* from strioctl */ - case FIONREAD: - /* - * Return number of bytes of data in all data messages - * in queue in "arg". - * For stream socket, amount of available data. - * For sock_dgram, # of available bytes + addresses. - */ - intval = (so->so_state & SS_ACCEPTCONN) ? 0 : - MIN(ss->ss_rxqueued, INT_MAX); - if (so_copyout(&intval, (void *)arg, sizeof (intval), - (mode & (int)FKIOCTL))) - return (EFAULT); - return (0); - - case SIOCSCTPGOPT: - STRUCT_INIT(opt, mode); - - if (so_copyin((void *)arg, STRUCT_BUF(opt), STRUCT_SIZE(opt), - (mode & (int)FKIOCTL))) { - return (EFAULT); - } - if ((optlen = STRUCT_FGET(opt, sopt_len)) > SO_MAXARGSIZE) - return (EINVAL); - - /* - * Find the correct sctp_t based on whether it is 1-N socket - * or not. - */ - intval = STRUCT_FGET(opt, sopt_aid); - mutex_enter(&so->so_lock); - if ((so->so_type == SOCK_SEQPACKET) && intval) { - if ((error = sosctp_assoc(ss, intval, &ssa)) != 0) { - mutex_exit(&so->so_lock); - return (error); - } - conn = ssa->ssa_conn; - ASSERT(conn != NULL); - } else { - conn = so->so_priv; - ssa = NULL; - } - mutex_exit(&so->so_lock); - - /* Copyin the option buffer and then call sctp_get_opt(). 
*/ - buflen = optlen; - /* Let's allocate a buffer enough to hold an int */ - if (buflen < sizeof (uint32_t)) - buflen = sizeof (uint32_t); - buf = kmem_alloc(buflen, KM_SLEEP); - if (so_copyin(STRUCT_FGETP(opt, sopt_val), buf, optlen, - (mode & (int)FKIOCTL))) { - if (ssa != NULL) { - mutex_enter(&so->so_lock); - SSA_REFRELE(ss, ssa); - mutex_exit(&so->so_lock); - } - kmem_free(buf, buflen); - return (EFAULT); - } - /* The option level has to be IPPROTO_SCTP */ - error = sctp_get_opt(conn, IPPROTO_SCTP, - STRUCT_FGET(opt, sopt_name), buf, &optlen); - if (ssa != NULL) { - mutex_enter(&so->so_lock); - SSA_REFRELE(ss, ssa); - mutex_exit(&so->so_lock); - } - optlen = MIN(buflen, optlen); - /* No error, copyout the result with the correct buf len. */ - if (error == 0) { - STRUCT_FSET(opt, sopt_len, optlen); - if (so_copyout(STRUCT_BUF(opt), (void *)arg, - STRUCT_SIZE(opt), (mode & (int)FKIOCTL))) { - error = EFAULT; - } else if (so_copyout(buf, STRUCT_FGETP(opt, sopt_val), - optlen, (mode & (int)FKIOCTL))) { - error = EFAULT; - } - } - kmem_free(buf, buflen); - return (error); - - case SIOCSCTPSOPT: - STRUCT_INIT(opt, mode); - - if (so_copyin((void *)arg, STRUCT_BUF(opt), STRUCT_SIZE(opt), - (mode & (int)FKIOCTL))) { - return (EFAULT); - } - if ((optlen = STRUCT_FGET(opt, sopt_len)) > SO_MAXARGSIZE) - return (EINVAL); - - /* - * Find the correct sctp_t based on whether it is 1-N socket - * or not. - */ - intval = STRUCT_FGET(opt, sopt_aid); - mutex_enter(&so->so_lock); - if (intval != 0) { - if ((error = sosctp_assoc(ss, intval, &ssa)) != 0) { - mutex_exit(&so->so_lock); - return (error); - } - conn = ssa->ssa_conn; - ASSERT(conn != NULL); - } else { - conn = so->so_priv; - ssa = NULL; - } - mutex_exit(&so->so_lock); - - /* Copyin the option buffer and then call sctp_set_opt(). 
*/ - buf = kmem_alloc(optlen, KM_SLEEP); - if (so_copyin(STRUCT_FGETP(opt, sopt_val), buf, optlen, - (mode & (int)FKIOCTL))) { - if (ssa != NULL) { - mutex_enter(&so->so_lock); - SSA_REFRELE(ss, ssa); - mutex_exit(&so->so_lock); - } - kmem_free(buf, intval); - return (EFAULT); - } - /* The option level has to be IPPROTO_SCTP */ - error = sctp_set_opt(conn, IPPROTO_SCTP, - STRUCT_FGET(opt, sopt_name), buf, optlen); - if (ssa) { - mutex_enter(&so->so_lock); - SSA_REFRELE(ss, ssa); - mutex_exit(&so->so_lock); - } - kmem_free(buf, optlen); - return (error); - - case SIOCSCTPPEELOFF: { - struct sonode *nso; - struct sctp_uc_swap us; - int nfd; - struct file *nfp; - struct vnode *nvp = NULL, *accessvp; - - dprint(2, ("sctppeeloff %p\n", (void *)ss)); - - if (so->so_type != SOCK_SEQPACKET) { - return (EOPNOTSUPP); - } - if (so_copyin((void *)arg, &intval, sizeof (intval), - (mode & (int)FKIOCTL))) { - return (EFAULT); - } - if (intval == 0) { - return (EINVAL); - } - - /* - * Find accessvp. This is different from parent's vp, - * as the socket type is different. - */ - accessvp = solookup(so->so_family, SOCK_STREAM, - so->so_protocol, NULL, &error); - if (accessvp == NULL) { - return (error); - } - - /* - * Allocate the user fd. - */ - if ((nfd = ufalloc(0)) == -1) { - eprintsoline(so, EMFILE); - return (EMFILE); - } - - /* - * Copy the fd out. - */ - if (so_copyout(&nfd, (void *)arg, sizeof (nfd), - (mode & (int)FKIOCTL))) { - error = EFAULT; - goto err; - } - mutex_enter(&so->so_lock); - - /* - * Don't use sosctp_assoc() in order to peel off disconnected - * associations. - */ - ssa = ((uint32_t)intval >= ss->ss_maxassoc) ? 
NULL : - ss->ss_assocs[intval].ssi_assoc; - if (ssa == NULL) { - mutex_exit(&so->so_lock); - error = EINVAL; - goto err; - } - SSA_REFHOLD(ssa); - - nso = sosctp_create(accessvp, so->so_family, SOCK_STREAM, - so->so_protocol, so->so_version, so, &error); - if (nso == NULL) { - SSA_REFRELE(ss, ssa); - mutex_exit(&so->so_lock); - goto err; - } - nvp = SOTOV(nso); - so_lock_single(so); - mutex_exit(&so->so_lock); - us.sus_handle = SOTOSSO(nso); - us.sus_upcalls = &sosctp_sock_upcalls; - - /* - * Upcalls to new socket are blocked for the duration of - * downcall. - */ - mutex_enter(&nso->so_lock); - - error = sctp_set_opt(ssa->ssa_conn, IPPROTO_SCTP, SCTP_UC_SWAP, - &us, sizeof (us)); - if (error) { - goto peelerr; - } - error = falloc(nvp, FWRITE|FREAD, &nfp, NULL); - if (error) { - goto peelerr; - } - - /* - * fill in the entries that falloc reserved - */ - nfp->f_vnode = nvp; - mutex_exit(&nfp->f_tlock); - setf(nfd, nfp); - - mutex_enter(&so->so_lock); - - sosctp_assoc_move(ss, SOTOSSO(nso), ssa); - - mutex_exit(&nso->so_lock); - - ssa->ssa_conn = NULL; - sosctp_assoc_free(ss, ssa); - - so_unlock_single(so, SOLOCKED); - mutex_exit(&so->so_lock); - - return (0); - -err: - setf(nfd, NULL); - eprintsoline(so, error); - return (error); - -peelerr: - mutex_exit(&nso->so_lock); - mutex_enter(&so->so_lock); - ASSERT(nso->so_count == 1); - nso->so_count = 0; - so_unlock_single(so, SOLOCKED); - SSA_REFRELE(ss, ssa); - mutex_exit(&so->so_lock); - /* held in VOP_OPEN() */ - ddi_rele_driver(getmajor(nso->so_dev)); - setf(nfd, NULL); - ASSERT(nvp->v_count == 1); - VN_RELE(nvp); - eprintsoline(so, error); - return (error); - } - default: - return (EINVAL); - } -} - -/* - * Allow any flags. Record FNDELAY and FNONBLOCK so that they can be inherited - * from listener to acceptor. 
- */ -/* ARGSUSED */ -static int -socksctp_setfl(vnode_t *vp, int oflags, int nflags, cred_t *cr, - caller_context_t *ct) -{ - struct sonode *so; - - so = VTOSO(vp); - - mutex_enter(&so->so_lock); - if (nflags & FNDELAY) - so->so_state |= SS_NDELAY; - else - so->so_state &= ~SS_NDELAY; - if (nflags & FNONBLOCK) - so->so_state |= SS_NONBLOCK; - else - so->so_state &= ~SS_NONBLOCK; - mutex_exit(&so->so_lock); - return (0); -} - -/*ARGSUSED*/ -static void -socksctpv_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ct) -{ - struct sonode *so; - struct sctp_sonode *ss; - struct sctp_sa_id *ssi; - struct sctp_soassoc *ssa; - int32_t i; - - so = VTOSO(vp); - ss = SOTOSSO(so); - - mutex_enter(&vp->v_lock); - /* - * If no one has reclaimed the vnode, remove from the - * cache now. - */ - if (vp->v_count < 1) - cmn_err(CE_PANIC, "socksctpv_inactive: Bad v_count"); - - /* - * Drop the temporary hold by vn_rele now - */ - if (--vp->v_count != 0) { - mutex_exit(&vp->v_lock); - return; - } - mutex_exit(&vp->v_lock); - - /* We are the sole owner of so now */ - - /* - * New associations can't come in, but old ones might get - * closed in upcall. Protect against that by taking a reference - * on the association. - */ - mutex_enter(&so->so_lock); - - ssi = ss->ss_assocs; - for (i = 0; i < ss->ss_maxassoc; i++, ssi++) { - if ((ssa = ssi->ssi_assoc) != NULL) { - SSA_REFHOLD(ssa); - mutex_exit(&so->so_lock); - - sctp_close(ssa->ssa_conn); - - mutex_enter(&so->so_lock); - ssa->ssa_conn = NULL; - sosctp_assoc_free(ss, ssa); - } - } - mutex_exit(&so->so_lock); - - ASSERT(!vn_has_cached_data(vp)); - if (so->so_priv) { - sctp_close(so->so_priv); - } - so->so_priv = NULL; - sosctp_free(so); -} - -/* - * Check socktpi_poll() on why so_lock is not held in this function. 
- */ -/*ARGSUSED5*/ -static int -socksctpv_poll(struct vnode *vp, short events, int anyyet, short *reventsp, - struct pollhead **phpp, caller_context_t *ct) -{ - struct sonode *so; - struct sctp_sonode *ss; - short origevents = events; - int so_state; - - so = VTOSO(vp); - ss = SOTOSSO(so); - so_state = so->so_state; - - ASSERT(vp->v_type == VSOCK); - ASSERT(vp->v_stream == NULL); - ASSERT(so->so_version != SOV_STREAM); - - if (!(so_state & SS_ISCONNECTED) && (so->so_type == SOCK_STREAM)) { - /* - * Not connected yet - turn off write side events - */ - events &= ~(POLLOUT|POLLWRBAND); - } - - /* - * Check for errors - */ - if (so->so_error != 0 && - ((POLLIN|POLLRDNORM|POLLOUT) & origevents) != 0) { - *reventsp = (POLLIN|POLLRDNORM|POLLOUT) & origevents; - return (0); - } - - *reventsp = 0; - - /* - * Don't mark socket as writable until TX queued data is - * below watermark. - */ - if (so->so_type == SOCK_STREAM) { - if (ss->ss_txqueued < so->so_sndlowat) { - *reventsp |= POLLOUT & events; - } - } else { - *reventsp |= POLLOUT & events; - } - if (ss->ss_rxdata) { - *reventsp |= (POLLIN|POLLRDNORM) & events; - } - if ((so_state & (SS_HASCONNIND|SS_CANTRCVMORE)) != 0) { - *reventsp |= (POLLIN|POLLRDNORM) & events; - } - - if (!*reventsp && !anyyet) { - *phpp = &ss->ss_poll_list; - } - - return (0); -} diff --git a/usr/src/uts/common/fs/sockfs/socksdp.h b/usr/src/uts/common/fs/sockfs/socksdp.h deleted file mode 100755 index 68231bb0e5..0000000000 --- a/usr/src/uts/common/fs/sockfs/socksdp.h +++ /dev/null @@ -1,85 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SOCKSDP_H_ -#define _SOCKSDP_H_ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * SDP socket structure. - * - * The opaque pointer passed in upcalls is a pointer to sdp_sonode. - */ -struct sdp_sonode { - int ss_type; /* sonode or soassoc */ - struct sonode ss_so; - struct sockaddr_in6 ss_laddr; /* can fit both v4 & v6 */ - struct sockaddr_in6 ss_faddr; - int ss_rxqueued; /* queued # of conn */ - struct pollhead ss_poll_list; -}; - -extern sdp_upcalls_t sosdp_sock_upcalls; -extern struct vnodeops *socksdp_vnodeops; -extern const fs_operation_def_t socksdp_vnodeops_template[]; - -extern void sosdp_free(struct sonode *so); -extern int sosdp_chgpgrp(struct sdp_sonode *ss, pid_t pid); -extern void sosdp_sendsig(struct sdp_sonode *ss, int event); - -extern int sosdp_bind(struct sonode *so, struct sockaddr *name, - socklen_t namelen, int flags); -extern int sosdp_recvmsg(struct sonode *, struct nmsghdr *, struct uio *); - -extern int sosdp_waitconnected(struct sonode *so, int fmode); - -extern void sosdp_so_inherit(struct sdp_sonode *lss, struct sdp_sonode *nss); - -/* - * Data structure types. - */ -#define SOSDP_SOCKET 0x1 - -#define SOTOSDO(so) ((struct sdp_sonode *)(((char *)so) - \ - offsetof(struct sdp_sonode, ss_so))) - -/* - * Event flags to sosdp_sendsig(). 
- */ -#define SDPSIG_WRITE 0x1 -#define SDPSIG_READ 0x2 -#define SDPSIG_URG 0x4 - -#ifdef __cplusplus -} -#endif - -#endif /* _SOCKSDP_H_ */ diff --git a/usr/src/uts/common/fs/sockfs/socksdpsubr.c b/usr/src/uts/common/fs/sockfs/socksdpsubr.c deleted file mode 100755 index 357c61db3d..0000000000 --- a/usr/src/uts/common/fs/sockfs/socksdpsubr.c +++ /dev/null @@ -1,214 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/types.h> -#include <sys/t_lock.h> -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/debug.h> -#include <sys/errno.h> -#include <sys/strsubr.h> -#include <sys/cmn_err.h> -#include <sys/sysmacros.h> - -#include <sys/vfs.h> -#include <sys/vfs_opreg.h> - -#include <sys/socket.h> -#include <sys/socketvar.h> -#include <sys/strsun.h> -#include <sys/signal.h> - -#include <inet/sdp_itf.h> -#include "socksdp.h" - - -/* - * Wait until the socket is connected or there is an error. - * fmode should contain any nonblocking flags. 
- */ -int -sosdp_waitconnected(struct sonode *so, int fmode) -{ - int error; - - ASSERT(MUTEX_HELD(&so->so_lock)); - ASSERT((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) || - so->so_error != 0); - - while ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == - SS_ISCONNECTING && so->so_error == 0) { - - dprint(3, ("waiting for SS_ISCONNECTED on %p\n", (void *)so)); - if (fmode & (FNDELAY|FNONBLOCK)) - return (EINPROGRESS); - - if (!cv_wait_sig_swap(&so->so_state_cv, &so->so_lock)) { - /* - * Return EINTR and let the application use - * nonblocking techniques for detecting when - * the connection has been established. - */ - error = EINTR; - break; - } - dprint(3, ("awoken on %p\n", (void *)so)); - } - - if (so->so_error != 0) { - error = sogeterr(so); - ASSERT(error != 0); - dprint(3, ("sosdp_waitconnected: error %d\n", error)); - } else if (so->so_state & SS_ISCONNECTED) { - error = 0; - } - return (error); -} - - -/* - * Change the process/process group to which SIGIO is sent. - */ -int -sosdp_chgpgrp(struct sdp_sonode *ss, pid_t pid) -{ - int error; - - ASSERT(MUTEX_HELD(&ss->ss_so.so_lock)); - if (pid != 0) { - /* - * Permissions check by sending signal 0. - * Note that when kill fails it does a - * set_errno causing the system call to fail. - */ - error = kill(pid, 0); - if (error != 0) { - return (error); - } - } - ss->ss_so.so_pgrp = pid; - return (0); -} - - -/* - * Generate a SIGIO, for 'writable' events include siginfo structure, - * for read events just send the signal. 
- */ -/*ARGSUSED*/ -static void -sosdp_sigproc(proc_t *proc, int event) -{ - k_siginfo_t info; - - if (event & SDPSIG_WRITE) { - info.si_signo = SIGPOLL; - info.si_code = POLL_OUT; - info.si_errno = 0; - info.si_fd = 0; - info.si_band = 0; - sigaddq(proc, NULL, &info, KM_NOSLEEP); - } - if (event & SDPSIG_READ) { - sigtoproc(proc, NULL, SIGPOLL); - } - if (event & SDPSIG_URG) { - sigtoproc(proc, NULL, SIGURG); - } -} - -void -sosdp_sendsig(struct sdp_sonode *ss, int event) -{ - proc_t *proc; - struct sonode *so = &ss->ss_so; - - ASSERT(MUTEX_HELD(&ss->ss_so.so_lock)); - - if (so->so_pgrp == 0 || (!(so->so_state & SS_ASYNC) && - event != SDPSIG_URG)) { - return; - } - - dprint(3, ("sending sig %d to %d\n", event, so->so_pgrp)); - - if (so->so_pgrp > 0) { - /* - * XXX This unfortunately still generates - * a signal when a fd is closed but - * the proc is active. - */ - mutex_enter(&pidlock); - proc = prfind(so->so_pgrp); - if (proc == NULL) { - mutex_exit(&pidlock); - return; - } - mutex_enter(&proc->p_lock); - mutex_exit(&pidlock); - sosdp_sigproc(proc, event); - mutex_exit(&proc->p_lock); - } else { - /* - * Send to process group. Hold pidlock across - * calls to sosdp_sigproc(). 
- */ - pid_t pgrp = -so->so_pgrp; - - mutex_enter(&pidlock); - proc = pgfind(pgrp); - while (proc != NULL) { - mutex_enter(&proc->p_lock); - sosdp_sigproc(proc, event); - mutex_exit(&proc->p_lock); - proc = proc->p_pglink; - } - mutex_exit(&pidlock); - } -} - - -/* - * Inherit socket properties - */ -void -sosdp_so_inherit(struct sdp_sonode *lss, struct sdp_sonode *nss) -{ - struct sonode *nso = &nss->ss_so; - struct sonode *lso = &lss->ss_so; - - nso->so_options = lso->so_options & (SO_DEBUG|SO_REUSEADDR| - SO_KEEPALIVE|SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK| - SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER); - nso->so_sndbuf = lso->so_sndbuf; - nso->so_rcvbuf = lso->so_rcvbuf; - nso->so_pgrp = lso->so_pgrp; - - nso->so_rcvlowat = lso->so_rcvlowat; - nso->so_sndlowat = lso->so_sndlowat; -} diff --git a/usr/src/uts/common/fs/sockfs/socksdpvnops.c b/usr/src/uts/common/fs/sockfs/socksdpvnops.c deleted file mode 100644 index 395599daab..0000000000 --- a/usr/src/uts/common/fs/sockfs/socksdpvnops.c +++ /dev/null @@ -1,535 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#include <sys/types.h> -#include <sys/t_lock.h> -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/buf.h> -#include <sys/debug.h> -#include <sys/errno.h> -#include <sys/uio.h> -#include <sys/vfs.h> -#include <sys/vfs_opreg.h> -#include <sys/vnode.h> -#include <sys/stropts.h> -#include <sys/cmn_err.h> -#include <sys/sysmacros.h> -#include <sys/stream.h> -#include <sys/strsun.h> - -#include <sys/socket.h> -#include <sys/socketvar.h> - -#include <sys/project.h> -#include <sys/strsubr.h> - -#include <fs/fs_subr.h> - -#include <sys/esunddi.h> -#include <sys/ddi.h> - -#include <sys/filio.h> -#include <sys/sockio.h> - -#include <inet/sdp_itf.h> -#include "socksdp.h" - -/* - * SDP sockfs vnode operations - */ -static int socksdpv_open(struct vnode **, int, struct cred *, - caller_context_t *); -static int socksdpv_close(struct vnode *, int, int, offset_t, - struct cred *, caller_context_t *); -static int socksdpv_read(struct vnode *, struct uio *, int, struct cred *, - caller_context_t *); -static int socksdpv_write(struct vnode *, struct uio *, int, struct cred *, - caller_context_t *); -static int socksdpv_ioctl(struct vnode *, int, intptr_t, int, - struct cred *, int32_t *, caller_context_t *); -static int socksdp_setfl(vnode_t *, int, int, cred_t *, caller_context_t *); -static void socksdpv_inactive(struct vnode *, struct cred *, - caller_context_t *); -static int socksdpv_poll(struct vnode *, short, int, short *, - struct pollhead **, caller_context_t *); - -const fs_operation_def_t socksdp_vnodeops_template[] = { - VOPNAME_OPEN, { .vop_open = socksdpv_open }, - VOPNAME_CLOSE, { .vop_close = socksdpv_close }, - VOPNAME_READ, { .vop_read = socksdpv_read }, - VOPNAME_WRITE, { .vop_write = socksdpv_write }, - VOPNAME_IOCTL, { .vop_ioctl = socksdpv_ioctl }, - VOPNAME_SETFL, { .vop_setfl = socksdp_setfl }, - VOPNAME_GETATTR, { .vop_getattr = socktpi_getattr }, - VOPNAME_SETATTR, { .vop_setattr = socktpi_setattr }, - VOPNAME_ACCESS, { .vop_access = 
socktpi_access }, - VOPNAME_FSYNC, { .vop_fsync = socktpi_fsync }, - VOPNAME_INACTIVE, { .vop_inactive = socksdpv_inactive }, - VOPNAME_FID, { .vop_fid = socktpi_fid }, - VOPNAME_SEEK, { .vop_seek = socktpi_seek }, - VOPNAME_POLL, { .vop_poll = socksdpv_poll }, - VOPNAME_DISPOSE, { .error = fs_error }, - NULL, NULL -}; -struct vnodeops *socksdp_vnodeops; - -/*ARGSUSED3*/ -static int -socksdpv_open(struct vnode **vpp, int flag, struct cred *cr, - caller_context_t *ct) -{ - struct sonode *so; - struct sdp_sonode *ss; - struct vnode *vp = *vpp; - int error = EPROTONOSUPPORT; /* in case sdpib fails to load */ - sdp_sockbuf_limits_t sbl; - sdp_upcalls_t *upcalls; - - flag &= ~FCREAT; /* paranoia */ - - so = VTOSO(vp); - ss = SOTOSDO(so); - - mutex_enter(&so->so_lock); - so->so_count++; /* one more open reference */ - ASSERT(so->so_count != 0); /* wraparound */ - mutex_exit(&so->so_lock); - - ASSERT(vp->v_type == VSOCK); - - if (flag & SO_ACCEPTOR) { - ASSERT(so->so_type == SOCK_STREAM); - return (0); - } - - /* - * Active open. - */ - upcalls = &sosdp_sock_upcalls; - - /* - * When the necessary hardware is not available, the sdp_create stub - * will evaluate to nomod_zero, which leaves 'error' untouched. Hence - * the EPROTONOSUPPORT above. A successful call to sdp_create clears - * the error. 
- */ - so->so_priv = sdp_create(ss, NULL, so->so_family, SDP_CAN_BLOCK, - upcalls, &sbl, cr, &error); - if (so->so_priv == NULL) { - ASSERT(error != 0); - mutex_enter(&so->so_lock); - ASSERT(so->so_count > 0); - so->so_count--; /* one less open reference */ - mutex_exit(&so->so_lock); - return (error); - } - so->so_rcvbuf = sbl.sbl_rxbuf; - so->so_rcvlowat = sbl.sbl_rxlowat; - so->so_sndbuf = sbl.sbl_txbuf; - so->so_sndlowat = sbl.sbl_txlowat; - - return (error); -} - -/*ARGSUSED*/ -static int -socksdpv_close(struct vnode *vp, int flag, int count, offset_t offset, - struct cred *cr, caller_context_t *ct) -{ - int sendsig = 0; - int error = 0; - struct sonode *so; - struct sdp_sonode *ss; - - so = VTOSO(vp); - ss = SOTOSDO(so); - - cleanlocks(vp, ttoproc(curthread)->p_pid, 0); - cleanshares(vp, ttoproc(curthread)->p_pid); - - ASSERT(vp->v_stream == NULL); - if (count > 1) { - dprint(2, ("socksdpv_close: count %d\n", count)); - return (0); - } - - mutex_enter(&so->so_lock); - so_lock_single(so); /* Set SOLOCKED */ - ASSERT(so->so_count > 0); - so->so_count--; /* one fewer open reference */ - - dprint(2, ("socksdpv_close: %p so_count %d\n", (void *)so, - so->so_count)); - - if (so->so_count == 0) { - /* - * Need to set flags as there might be ops in progress on - * this socket. - * - * If socket already disconnected/disconnecting, - * don't send signal (again). - */ - if (!(so->so_state & SS_CANTRCVMORE)) - sendsig |= SDPSIG_READ; - if (!(so->so_state & SS_CANTSENDMORE)) - sendsig |= SDPSIG_WRITE; - soisdisconnected(so, 0); - mutex_exit(&so->so_lock); - - /* - * Initiate connection shutdown. 
- */ - error = sdp_disconnect(so->so_priv, flag); - - mutex_enter(&so->so_lock); - if (sendsig != 0) - sosdp_sendsig(ss, sendsig); - mutex_exit(&so->so_lock); - - pollwakeup(&ss->ss_poll_list, POLLIN|POLLRDNORM|POLLOUT); - } - mutex_enter(&so->so_lock); - so_unlock_single(so, SOLOCKED); - mutex_exit(&so->so_lock); - - return (error); -} - -/*ARGSUSED2*/ -static int -socksdpv_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr, - caller_context_t *ct) -{ - struct sonode *so = VTOSO(vp); - struct nmsghdr lmsg; - - if (so->so_type != SOCK_STREAM) { - return (EOPNOTSUPP); - } - - ASSERT(vp->v_type == VSOCK); - so_update_attrs(so, SOACC); - lmsg.msg_namelen = 0; - lmsg.msg_controllen = 0; - lmsg.msg_flags = 0; - return (sosdp_recvmsg(so, &lmsg, uiop)); -} - -/* - * Send data, see sosdp_sendmsg() - */ -/*ARGSUSED2*/ -static int -socksdpv_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr, - caller_context_t *ct) -{ - struct sonode *so; - ssize_t count; - int error; - int flags = 0; - - so = VTOSO(vp); - - mutex_enter(&so->so_lock); - if (so->so_state & SS_CANTSENDMORE) { - mutex_exit(&so->so_lock); - tsignal(curthread, SIGPIPE); - return (EPIPE); - } - - if (so->so_error != 0) { - error = sogeterr(so); - if (error != 0) { - mutex_exit(&so->so_lock); - return (error); - } - } - - if (uiop->uio_fmode & (FNDELAY|FNONBLOCK)) - flags |= MSG_DONTWAIT; - - if (!(so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED))) { - mutex_exit(&so->so_lock); - return (ENOTCONN); - } - count = uiop->uio_resid; - mutex_exit(&so->so_lock); - - if (count == 0) { - return (0); - } - so_update_attrs(so, SOMOD); - - error = sdp_send(so->so_priv, NULL, count, flags, uiop); - return (error); -} - -/*ARGSUSED4*/ -static int -socksdpv_ioctl(struct vnode *vp, int cmd, intptr_t arg, int mode, - struct cred *cr, int32_t *rvalp, caller_context_t *ct) -{ - struct sonode *so; - struct sdp_sonode *ss; - int32_t value; - int error, intval; - pid_t pid; - - so = VTOSO(vp); - ss 
= SOTOSDO(so); - - /* handle socket specific ioctls */ - switch (cmd) { - case FIONBIO: - if (so_copyin((void *)arg, &value, sizeof (int32_t), - (mode & (int)FKIOCTL))) { - return (EFAULT); - } - mutex_enter(&so->so_lock); - if (value != 0) { - so->so_state |= SS_NDELAY; - } else { - so->so_state &= ~SS_NDELAY; - } - mutex_exit(&so->so_lock); - return (0); - - case FIOASYNC: - if (so_copyin((void *)arg, &value, sizeof (int32_t), - (mode & (int)FKIOCTL))) { - return (EFAULT); - } - mutex_enter(&so->so_lock); - - if (value) { - /* Turn on SIGIO */ - so->so_state |= SS_ASYNC; - } else { - /* Turn off SIGIO */ - so->so_state &= ~SS_ASYNC; - } - mutex_exit(&so->so_lock); - return (0); - - case SIOCSPGRP: - case FIOSETOWN: - if (so_copyin((void *)arg, &pid, sizeof (pid_t), - (mode & (int)FKIOCTL))) { - return (EFAULT); - } - mutex_enter(&so->so_lock); - - error = (pid != so->so_pgrp) ? sosdp_chgpgrp(ss, pid) : 0; - mutex_exit(&so->so_lock); - return (error); - - case SIOCGPGRP: - case FIOGETOWN: - if (so_copyout(&so->so_pgrp, (void *)arg, - sizeof (pid_t), (mode & (int)FKIOCTL))) - return (EFAULT); - return (0); - - case SIOCATMARK: - intval = 0; - error = sdp_ioctl(so->so_priv, cmd, &intval, cr); - if (so_copyout(&intval, (void *)arg, sizeof (int), - (mode & (int)FKIOCTL))) - return (EFAULT); - return (0); - - - case SIOCSENABLESDP: { - int32_t enable; - - /* - * System wide enable SDP - */ - - if (so_copyin((void *)arg, &enable, sizeof (int32_t), - mode & (int)FKIOCTL)) - return (EFAULT); - - error = sdp_ioctl(so->so_priv, cmd, &enable, cr); - if (so_copyout(&enable, (void *)arg, - sizeof (int32_t), (mode & (int)FKIOCTL))) - return (EFAULT); - return (0); - } - /* from strioctl */ - case FIONREAD: - /* - * Return number of bytes of data in all data messages - * in queue in "arg". - * For stream socket, amount of available data. 
- */ - if (so->so_state & SS_ACCEPTCONN) { - intval = 0; - } else { - mutex_enter(&so->so_lock); - intval = sdp_polldata(so->so_priv, SDP_READ); - mutex_exit(&so->so_lock); - } - if (so_copyout(&intval, (void *)arg, sizeof (intval), - (mode & (int)FKIOCTL))) - return (EFAULT); - return (0); - default: - return (EINVAL); - } - -} - -/* - * Allow any flags. Record FNDELAY and FNONBLOCK so that they can be inherited - * from listener to acceptor. - */ -/* ARGSUSED */ -static int -socksdp_setfl(vnode_t *vp, int oflags, int nflags, cred_t *cr, - caller_context_t *ct) -{ - struct sonode *so; - - so = VTOSO(vp); - - mutex_enter(&so->so_lock); - if (nflags & FNDELAY) - so->so_state |= SS_NDELAY; - else - so->so_state &= ~SS_NDELAY; - if (nflags & FNONBLOCK) - so->so_state |= SS_NONBLOCK; - else - so->so_state &= ~SS_NONBLOCK; - mutex_exit(&so->so_lock); - return (0); -} - -/*ARGSUSED*/ -static void -socksdpv_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ct) -{ - struct sonode *so; - - so = VTOSO(vp); - - mutex_enter(&vp->v_lock); - /* - * If no one has reclaimed the vnode, remove from the - * cache now. - */ - if (vp->v_count < 1) - cmn_err(CE_PANIC, "socksdpv_inactive: Bad v_count"); - - /* - * Drop the temporary hold by vn_rele now - */ - if (--vp->v_count != 0) { - mutex_exit(&vp->v_lock); - return; - } - mutex_exit(&vp->v_lock); - - /* We are the sole owner of so now */ - - ASSERT(!vn_has_cached_data(vp)); - if (so->so_priv) { - sdp_close(so->so_priv); - } - so->so_priv = NULL; - sosdp_free(so); -} - -/* - * Check socktpi_poll() on why so_lock is not held in this function. 
- */ -/*ARGSUSED5*/ -static int -socksdpv_poll(struct vnode *vp, short events, int anyyet, short *reventsp, - struct pollhead **phpp, caller_context_t *ct) -{ - struct sonode *so; - struct sdp_sonode *ss; - short origevents = events; - int so_state; - - so = VTOSO(vp); - ss = SOTOSDO(so); - so_state = so->so_state; - - - ASSERT(vp->v_type == VSOCK); - ASSERT(vp->v_stream == NULL); - ASSERT(so->so_version != SOV_STREAM); - - if (!(so_state & SS_ISCONNECTED) && (so->so_type == SOCK_STREAM)) { - /* - * Not connected yet - turn off write side events - */ - events &= ~(POLLOUT|POLLWRBAND); - } - - /* - * Check for errors - */ - if (so->so_error != 0 && - ((POLLIN|POLLRDNORM|POLLOUT) & origevents) != 0) { - *reventsp = (POLLIN|POLLRDNORM|POLLOUT) & origevents; - return (0); - } - - *reventsp = 0; - - /* - * Don't mark socket as writable until TX queued data is - * below watermark. - */ - if (so->so_type == SOCK_STREAM) { - if (sdp_polldata(so->so_priv, SDP_XMIT)) { - *reventsp |= POLLOUT & events; - } - } else { - *reventsp = 0; - goto done; - } - - if (sdp_polldata(so->so_priv, SDP_READ)) { - *reventsp |= (POLLIN|POLLRDNORM) & events; - } - - if ((so_state & (SS_HASCONNIND|SS_CANTRCVMORE)) != 0) { - *reventsp |= (POLLIN|POLLRDNORM) & events; - } - -done: - if (!*reventsp && !anyyet) { - *phpp = &ss->ss_poll_list; - } - - return (0); -} diff --git a/usr/src/uts/common/fs/sockfs/sockssl.c b/usr/src/uts/common/fs/sockfs/sockssl.c index 037805e6da..8df1d3fe58 100644 --- a/usr/src/uts/common/fs/sockfs/sockssl.c +++ b/usr/src/uts/common/fs/sockfs/sockssl.c @@ -23,8 +23,6 @@ * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/param.h> #include <sys/systm.h> @@ -43,8 +41,9 @@ #include <sys/sockio.h> #include <sys/socketvar.h> -#include <inet/kssl/ksslapi.h> +#include <fs/sockfs/socktpi.h> +#include <inet/kssl/ksslapi.h> /* * This routine is registered with the stream head to be called by kstrgetmsg() @@ -61,7 +60,7 @@ strsock_kssl_input(vnode_t *vp, mblk_t *mp, strsigset_t *allmsgsigs, strpollset_t *pollwakeups) { struct sonode *so = VTOSO(vp); - kssl_ctx_t kssl_ctx = so->so_kssl_ctx; + kssl_ctx_t kssl_ctx = SOTOTPI(so)->sti_kssl_ctx; kssl_cmd_t kssl_cmd; mblk_t *out; @@ -101,7 +100,7 @@ strsock_kssl_output(vnode_t *vp, mblk_t *mp, strsigset_t *allmsgsigs, strpollset_t *pollwakeups) { struct sonode *so = VTOSO(vp); - kssl_ctx_t kssl_ctx = so->so_kssl_ctx; + kssl_ctx_t kssl_ctx = SOTOTPI(so)->sti_kssl_ctx; mblk_t *recmp; dprintso(so, 1, ("strsock_kssl_output(%p, %p)\n", diff --git a/usr/src/uts/common/fs/sockfs/sockstr.c b/usr/src/uts/common/fs/sockfs/sockstr.c index b783a27251..71c8d4c49c 100644 --- a/usr/src/uts/common/fs/sockfs/sockstr.c +++ b/usr/src/uts/common/fs/sockfs/sockstr.c @@ -51,13 +51,15 @@ #include <sys/cmn_err.h> #include <sys/proc.h> #include <sys/ddi.h> -#include <sys/kmem_impl.h> #include <sys/suntpi.h> #include <sys/socket.h> #include <sys/sockio.h> #include <sys/socketvar.h> +#include <sys/sodirect.h> #include <netinet/in.h> +#include <inet/common.h> +#include <inet/proto_set.h> #include <sys/tiuser.h> #define _SUN_TPI_VERSION 2 @@ -67,6 +69,8 @@ #include <c2/audit.h> +#include <fs/sockfs/socktpi.h> +#include <fs/sockfs/socktpi_impl.h> #include <sys/dcopy.h> int so_default_version = SOV_SOCKSTREAM; @@ -115,13 +119,9 @@ static mblk_t *strsock_proto(vnode_t *vp, mblk_t *mp, static mblk_t *strsock_misc(vnode_t *vp, mblk_t *mp, strwakeup_t *wakeups, strsigset_t *firstmsgsigs, strsigset_t *allmsgsigs, strpollset_t *pollwakeups); - -static int tlitosyserr(int terr); - /* - * Sodirect kmem_cache 
and put/wakeup functions. + * STREAMS based sodirect put/wakeup functions. */ -struct kmem_cache *socktpi_sod_cache; static int sodput(sodirect_t *, mblk_t *); static void sodwakeup(sodirect_t *); @@ -131,10 +131,7 @@ static void sodwakeup(sodirect_t *); int sostr_init() { - /* Allocate sodirect_t kmem_cache */ - socktpi_sod_cache = kmem_cache_create("socktpi_sod_cache", - sizeof (sodirect_t), 0, NULL, NULL, NULL, NULL, NULL, 0); - + sod_init(); return (0); } @@ -151,15 +148,16 @@ so_sock2stream(struct sonode *so) queue_t *rq; mblk_t *mp; int error = 0; + sotpi_info_t *sti = SOTOTPI(so); - ASSERT(MUTEX_HELD(&so->so_plumb_lock)); + ASSERT(MUTEX_HELD(&sti->sti_plumb_lock)); mutex_enter(&so->so_lock); so_lock_single(so); ASSERT(so->so_version != SOV_STREAM); - if (so->so_state & SS_DIRECT) { + if (sti->sti_direct) { mblk_t **mpp; int rval; @@ -175,9 +173,9 @@ so_sock2stream(struct sonode *so) "_SIOCSOCKFALLBACK failed\n", (void *)so)); goto exit; } - so->so_state &= ~SS_DIRECT; + sti->sti_direct = 0; - for (mpp = &so->so_conn_ind_head; (mp = *mpp) != NULL; + for (mpp = &sti->sti_conn_ind_head; (mp = *mpp) != NULL; mpp = &mp->b_next) { struct T_conn_ind *conn_ind; @@ -236,7 +234,7 @@ so_sock2stream(struct sonode *so) } so->so_version = SOV_STREAM; - so->so_priv = NULL; + so->so_proto_handle = NULL; /* * Remove the hooks in the stream head to avoid queuing more @@ -251,20 +249,20 @@ so_sock2stream(struct sonode *so) * on the queue - the behavior of urgent data after a switch is * left undefined. 
*/ - so->so_error = so->so_delayed_error = 0; + so->so_error = sti->sti_delayed_error = 0; freemsg(so->so_oobmsg); so->so_oobmsg = NULL; - so->so_oobsigcnt = so->so_oobcnt = 0; + sti->sti_oobsigcnt = sti->sti_oobcnt = 0; so->so_state &= ~(SS_RCVATMARK|SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA| - SS_HASCONNIND|SS_SAVEDEOR); + SS_SAVEDEOR); ASSERT(so_verify_oobstate(so)); - freemsg(so->so_ack_mp); - so->so_ack_mp = NULL; + freemsg(sti->sti_ack_mp); + sti->sti_ack_mp = NULL; /* - * Flush the T_DISCON_IND on so_discon_ind_mp. + * Flush the T_DISCON_IND on sti_discon_ind_mp. */ so_flush_discon_ind(so); @@ -272,16 +270,15 @@ so_sock2stream(struct sonode *so) * Move any queued T_CONN_IND messages to stream head queue. */ rq = RD(strvp2wq(vp)); - while ((mp = so->so_conn_ind_head) != NULL) { - so->so_conn_ind_head = mp->b_next; + while ((mp = sti->sti_conn_ind_head) != NULL) { + sti->sti_conn_ind_head = mp->b_next; mp->b_next = NULL; - if (so->so_conn_ind_head == NULL) { - ASSERT(so->so_conn_ind_tail == mp); - so->so_conn_ind_tail = NULL; + if (sti->sti_conn_ind_head == NULL) { + ASSERT(sti->sti_conn_ind_tail == mp); + sti->sti_conn_ind_tail = NULL; } dprintso(so, 0, - ("so_sock2stream(%p): moving T_CONN_IND\n", - (void *)so)); + ("so_sock2stream(%p): moving T_CONN_IND\n", (void *)so)); /* Drop lock across put() */ mutex_exit(&so->so_lock); @@ -311,14 +308,15 @@ void so_stream2sock(struct sonode *so) { struct vnode *vp = SOTOV(so); + sotpi_info_t *sti = SOTOTPI(so); - ASSERT(MUTEX_HELD(&so->so_plumb_lock)); + ASSERT(MUTEX_HELD(&sti->sti_plumb_lock)); mutex_enter(&so->so_lock); so_lock_single(so); ASSERT(so->so_version == SOV_STREAM); so->so_version = SOV_SOCKSTREAM; - so->so_pushcnt = 0; + sti->sti_pushcnt = 0; mutex_exit(&so->so_lock); /* @@ -350,7 +348,7 @@ so_stream2sock(struct sonode *so) mutex_enter(&so->so_lock); /* - * Flush the T_DISCON_IND on so_discon_ind_mp. + * Flush the T_DISCON_IND on sti_discon_ind_mp. 
*/ so_flush_discon_ind(so); so_unlock_read(so); /* Clear SOREADLOCKED */ @@ -388,25 +386,18 @@ so_removehooks(struct sonode *so) */ } -/* - * Initialize the streams side of a socket including - * T_info_req/ack processing. If tso is not NULL its values are used thereby - * avoiding the T_INFO_REQ. - */ -int -so_strinit(struct sonode *so, struct sonode *tso) +void +so_basic_strinit(struct sonode *so) { struct vnode *vp = SOTOV(so); struct stdata *stp; mblk_t *mp; - int error; - - dprintso(so, 1, ("so_strinit(%p)\n", (void *)so)); + sotpi_info_t *sti = SOTOTPI(so); /* Preallocate an unbind_req message */ mp = soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP); mutex_enter(&so->so_lock); - so->so_unbind_mp = mp; + sti->sti_unbind_mp = mp; #ifdef DEBUG so->so_options = so_default_options; #endif /* DEBUG */ @@ -414,6 +405,40 @@ so_strinit(struct sonode *so, struct sonode *tso) so_installhooks(so); + stp = vp->v_stream; + /* + * Have to keep minpsz at zero in order to allow write/send of zero + * bytes. + */ + mutex_enter(&stp->sd_lock); + if (stp->sd_qn_minpsz == 1) + stp->sd_qn_minpsz = 0; + mutex_exit(&stp->sd_lock); + + /* + * If sodirect capable allocate and initialize sodirect_t. + * Note, SS_SODIRECT is set in socktpi_open(). + */ + if ((so->so_state & SS_SODIRECT) && + !(so->so_state & SS_FALLBACK_PENDING)) { + sod_sock_init(so, stp, sodput, sodwakeup, &stp->sd_lock); + } +} + +/* + * Initialize the streams side of a socket including + * T_info_req/ack processing. If tso is not NULL its values are used thereby + * avoiding the T_INFO_REQ. 
+ */ +int +so_strinit(struct sonode *so, struct sonode *tso) +{ + sotpi_info_t *sti = SOTOTPI(so); + sotpi_info_t *tsti; + int error; + + so_basic_strinit(so); + /* * The T_CAPABILITY_REQ should be the first message sent down because * at least TCP has a fast-path for this which avoids timeouts while @@ -424,19 +449,21 @@ so_strinit(struct sonode *so, struct sonode *tso) if (error) return (error); } else { + tsti = SOTOTPI(tso); + mutex_enter(&so->so_lock); - so->so_tsdu_size = tso->so_tsdu_size; - so->so_etsdu_size = tso->so_etsdu_size; - so->so_addr_size = tso->so_addr_size; - so->so_opt_size = tso->so_opt_size; - so->so_tidu_size = tso->so_tidu_size; - so->so_serv_type = tso->so_serv_type; + sti->sti_tsdu_size = tsti->sti_tsdu_size; + sti->sti_etsdu_size = tsti->sti_etsdu_size; + sti->sti_addr_size = tsti->sti_addr_size; + sti->sti_opt_size = tsti->sti_opt_size; + sti->sti_tidu_size = tsti->sti_tidu_size; + sti->sti_serv_type = tsti->sti_serv_type; so->so_mode = tso->so_mode & ~SM_ACCEPTOR_ID; mutex_exit(&so->so_lock); /* the following do_tcapability may update so->so_mode */ - if ((tso->so_serv_type != T_CLTS) && - !(tso->so_state & SS_DIRECT)) { + if ((tsti->sti_serv_type != T_CLTS) && + (sti->sti_direct == 0)) { error = do_tcapability(so, TC1_ACCEPTOR_ID); if (error) return (error); @@ -448,73 +475,19 @@ so_strinit(struct sonode *so, struct sonode *tso) * We set the addr_size to something to allocate a the address * structures. */ - if (so->so_addr_size == 0) { + if (sti->sti_addr_size == 0) { so->so_state |= SS_ISBOUND | SS_ISCONNECTED; /* Address size can vary with address families. */ if (so->so_family == AF_INET6) - so->so_addr_size = + sti->sti_addr_size = (t_scalar_t)sizeof (struct sockaddr_in6); else - so->so_addr_size = + sti->sti_addr_size = (t_scalar_t)sizeof (struct sockaddr_in); - ASSERT(so->so_unbind_mp); + ASSERT(sti->sti_unbind_mp); } - /* - * Allocate the addresses. 
- */ - ASSERT(so->so_laddr_sa == NULL && so->so_faddr_sa == NULL); - ASSERT(so->so_laddr_len == 0 && so->so_faddr_len == 0); - so->so_laddr_maxlen = so->so_faddr_maxlen = - P2ROUNDUP(so->so_addr_size, KMEM_ALIGN); - so->so_laddr_sa = kmem_alloc(so->so_laddr_maxlen * 2, KM_SLEEP); - so->so_faddr_sa = (struct sockaddr *)((caddr_t)so->so_laddr_sa - + so->so_laddr_maxlen); - - if (so->so_family == AF_UNIX) { - /* - * Initialize AF_UNIX related fields. - */ - bzero(&so->so_ux_laddr, sizeof (so->so_ux_laddr)); - bzero(&so->so_ux_faddr, sizeof (so->so_ux_faddr)); - } - - stp = vp->v_stream; - /* - * Have to keep minpsz at zero in order to allow write/send of zero - * bytes. - */ - mutex_enter(&stp->sd_lock); - if (stp->sd_qn_minpsz == 1) - stp->sd_qn_minpsz = 0; - mutex_exit(&stp->sd_lock); - /* - * If sodirect capable allocate and initialize sodirect_t. - * Note, SS_SODIRECT is set in socktpi_open(). - */ - if (so->so_state & SS_SODIRECT) { - sodirect_t *sodp; - - ASSERT(so->so_direct == NULL); - - sodp = kmem_cache_alloc(socktpi_sod_cache, KM_SLEEP); - sodp->sod_state = SOD_ENABLED | SOD_WAKE_NOT; - sodp->sod_want = 0; - sodp->sod_q = RD(stp->sd_wrq); - sodp->sod_enqueue = sodput; - sodp->sod_wakeup = sodwakeup; - sodp->sod_uioafh = NULL; - sodp->sod_uioaft = NULL; - sodp->sod_lockp = &stp->sd_lock; - /* - * Remainder of the sod_uioa members are left uninitialized - * but will be initialized later by uioainit() before uioa - * is enabled. 
- */ - sodp->sod_uioa.uioa_state = UIOA_ALLOC; - so->so_direct = sodp; - stp->sd_sodirect = sodp; - } + so_alloc_addr(so, sti->sti_addr_size); return (0); } @@ -522,25 +495,28 @@ so_strinit(struct sonode *so, struct sonode *tso) static void copy_tinfo(struct sonode *so, struct T_info_ack *tia) { - so->so_tsdu_size = tia->TSDU_size; - so->so_etsdu_size = tia->ETSDU_size; - so->so_addr_size = tia->ADDR_size; - so->so_opt_size = tia->OPT_size; - so->so_tidu_size = tia->TIDU_size; - so->so_serv_type = tia->SERV_type; + sotpi_info_t *sti = SOTOTPI(so); + + sti->sti_tsdu_size = tia->TSDU_size; + sti->sti_etsdu_size = tia->ETSDU_size; + sti->sti_addr_size = tia->ADDR_size; + sti->sti_opt_size = tia->OPT_size; + sti->sti_tidu_size = tia->TIDU_size; + sti->sti_serv_type = tia->SERV_type; switch (tia->CURRENT_state) { case TS_UNBND: break; case TS_IDLE: so->so_state |= SS_ISBOUND; - so->so_laddr_len = 0; - so->so_state &= ~SS_LADDR_VALID; + sti->sti_laddr_len = 0; + sti->sti_laddr_valid = 0; break; case TS_DATA_XFER: so->so_state |= SS_ISBOUND|SS_ISCONNECTED; - so->so_laddr_len = 0; - so->so_faddr_len = 0; - so->so_state &= ~(SS_LADDR_VALID | SS_FADDR_VALID); + sti->sti_laddr_len = 0; + sti->sti_faddr_len = 0; + sti->sti_laddr_valid = 0; + sti->sti_faddr_valid = 0; break; } @@ -550,11 +526,11 @@ copy_tinfo(struct sonode *so, struct T_info_ack *tia) * and SM_EXDATA, SM_OPTDATA, and SM_BYTESTREAM) * from the info ack. 
*/ - if (so->so_serv_type == T_CLTS) { + if (sti->sti_serv_type == T_CLTS) { so->so_mode |= SM_ATOMIC | SM_ADDR; } else { so->so_mode |= SM_CONNREQUIRED; - if (so->so_etsdu_size != 0 && so->so_etsdu_size != -2) + if (sti->sti_etsdu_size != 0 && sti->sti_etsdu_size != -2) so->so_mode |= SM_EXDATA; } if (so->so_type == SOCK_SEQPACKET || so->so_type == SOCK_RAW) { @@ -563,9 +539,9 @@ copy_tinfo(struct sonode *so, struct T_info_ack *tia) } if (so->so_family == AF_UNIX) { so->so_mode |= SM_FDPASSING | SM_OPTDATA; - if (so->so_addr_size == -1) { + if (sti->sti_addr_size == -1) { /* MAXPATHLEN + soun_family + nul termination */ - so->so_addr_size = (t_scalar_t)(MAXPATHLEN + + sti->sti_addr_size = (t_scalar_t)(MAXPATHLEN + sizeof (short) + 1); } if (so->so_type == SOCK_STREAM) { @@ -573,60 +549,62 @@ copy_tinfo(struct sonode *so, struct T_info_ack *tia) * Make it into a byte-stream transport. * SOCK_SEQPACKET sockets are unchanged. */ - so->so_tsdu_size = 0; + sti->sti_tsdu_size = 0; } - } else if (so->so_addr_size == -1) { + } else if (sti->sti_addr_size == -1) { /* * Logic extracted from sockmod - have to pick some max address * length in order to preallocate the addresses. 
*/ - so->so_addr_size = SOA_DEFSIZE; + sti->sti_addr_size = SOA_DEFSIZE; } - if (so->so_tsdu_size == 0) + if (sti->sti_tsdu_size == 0) so->so_mode |= SM_BYTESTREAM; } static int check_tinfo(struct sonode *so) { + sotpi_info_t *sti = SOTOTPI(so); + /* Consistency checks */ - if (so->so_type == SOCK_DGRAM && so->so_serv_type != T_CLTS) { + if (so->so_type == SOCK_DGRAM && sti->sti_serv_type != T_CLTS) { eprintso(so, ("service type and socket type mismatch\n")); eprintsoline(so, EPROTO); return (EPROTO); } - if (so->so_type == SOCK_STREAM && so->so_serv_type == T_CLTS) { + if (so->so_type == SOCK_STREAM && sti->sti_serv_type == T_CLTS) { eprintso(so, ("service type and socket type mismatch\n")); eprintsoline(so, EPROTO); return (EPROTO); } - if (so->so_type == SOCK_SEQPACKET && so->so_serv_type == T_CLTS) { + if (so->so_type == SOCK_SEQPACKET && sti->sti_serv_type == T_CLTS) { eprintso(so, ("service type and socket type mismatch\n")); eprintsoline(so, EPROTO); return (EPROTO); } if (so->so_family == AF_INET && - so->so_addr_size != (t_scalar_t)sizeof (struct sockaddr_in)) { + sti->sti_addr_size != (t_scalar_t)sizeof (struct sockaddr_in)) { eprintso(so, ("AF_INET must have sockaddr_in address length. Got %d\n", - so->so_addr_size)); + sti->sti_addr_size)); eprintsoline(so, EMSGSIZE); return (EMSGSIZE); } if (so->so_family == AF_INET6 && - so->so_addr_size != (t_scalar_t)sizeof (struct sockaddr_in6)) { + sti->sti_addr_size != (t_scalar_t)sizeof (struct sockaddr_in6)) { eprintso(so, ("AF_INET6 must have sockaddr_in6 address length. 
Got %d\n", - so->so_addr_size)); + sti->sti_addr_size)); eprintsoline(so, EMSGSIZE); return (EMSGSIZE); } dprintso(so, 1, ( "tinfo: serv %d tsdu %d, etsdu %d, addr %d, opt %d, tidu %d\n", - so->so_serv_type, so->so_tsdu_size, so->so_etsdu_size, - so->so_addr_size, so->so_opt_size, - so->so_tidu_size)); + sti->sti_serv_type, sti->sti_tsdu_size, sti->sti_etsdu_size, + sti->sti_addr_size, sti->sti_opt_size, + sti->sti_tidu_size)); dprintso(so, 1, ("tinfo: so_state %s\n", pr_state(so->so_state, so->so_mode))); return (0); @@ -646,7 +624,7 @@ do_tinfo(struct sonode *so) ASSERT(MUTEX_NOT_HELD(&so->so_lock)); if (so_no_tinfo) { - so->so_addr_size = 0; + SOTOTPI(so)->sti_addr_size = 0; return (0); } @@ -697,16 +675,17 @@ do_tcapability(struct sonode *so, t_uscalar_t cap_bits1) struct T_capability_ack *tca; mblk_t *mp; int error; + sotpi_info_t *sti = SOTOTPI(so); ASSERT(cap_bits1 != 0); ASSERT((cap_bits1 & ~(TC1_ACCEPTOR_ID | TC1_INFO)) == 0); ASSERT(MUTEX_NOT_HELD(&so->so_lock)); - if (so->so_provinfo->tpi_capability == PI_NO) + if (sti->sti_provinfo->tpi_capability == PI_NO) return (do_tinfo(so)); if (so_no_tinfo) { - so->so_addr_size = 0; + sti->sti_addr_size = 0; if ((cap_bits1 &= ~TC1_INFO) == 0) return (0); } @@ -737,10 +716,10 @@ do_tcapability(struct sonode *so, t_uscalar_t cap_bits1) if ((error = sowaitprim(so, T_CAPABILITY_REQ, T_CAPABILITY_ACK, (t_uscalar_t)sizeof (*tca), &mp, sock_capability_timeout * hz))) { mutex_exit(&so->so_lock); - PI_PROVLOCK(so->so_provinfo); - if (so->so_provinfo->tpi_capability == PI_DONTKNOW) - so->so_provinfo->tpi_capability = PI_NO; - PI_PROVUNLOCK(so->so_provinfo); + PI_PROVLOCK(sti->sti_provinfo); + if (sti->sti_provinfo->tpi_capability == PI_DONTKNOW) + sti->sti_provinfo->tpi_capability = PI_NO; + PI_PROVUNLOCK(sti->sti_provinfo); ASSERT((so->so_mode & SM_ACCEPTOR_ID) == 0); if (cap_bits1 & TC1_INFO) { /* @@ -758,27 +737,14 @@ do_tcapability(struct sonode *so, t_uscalar_t cap_bits1) return (0); } - if 
(so->so_provinfo->tpi_capability == PI_DONTKNOW) { - PI_PROVLOCK(so->so_provinfo); - so->so_provinfo->tpi_capability = PI_YES; - PI_PROVUNLOCK(so->so_provinfo); - } - ASSERT(mp); tca = (struct T_capability_ack *)mp->b_rptr; ASSERT((cap_bits1 & TC1_INFO) == (tca->CAP_bits1 & TC1_INFO)); + so_proc_tcapability_ack(so, tca); cap_bits1 = tca->CAP_bits1; - if (cap_bits1 & TC1_ACCEPTOR_ID) { - so->so_acceptor_id = tca->ACCEPTOR_id; - so->so_mode |= SM_ACCEPTOR_ID; - } - - if (cap_bits1 & TC1_INFO) - copy_tinfo(so, &tca->INFO_ack); - mutex_exit(&so->so_lock); freemsg(mp); @@ -789,17 +755,41 @@ do_tcapability(struct sonode *so, t_uscalar_t cap_bits1) } /* - * Retrieve and clear the socket error. + * Process a T_CAPABILITY_ACK + */ +void +so_proc_tcapability_ack(struct sonode *so, struct T_capability_ack *tca) +{ + sotpi_info_t *sti = SOTOTPI(so); + + if (sti->sti_provinfo->tpi_capability == PI_DONTKNOW) { + PI_PROVLOCK(sti->sti_provinfo); + sti->sti_provinfo->tpi_capability = PI_YES; + PI_PROVUNLOCK(sti->sti_provinfo); + } + + if (tca->CAP_bits1 & TC1_ACCEPTOR_ID) { + sti->sti_acceptor_id = tca->ACCEPTOR_id; + so->so_mode |= SM_ACCEPTOR_ID; + } + + if (tca->CAP_bits1 & TC1_INFO) + copy_tinfo(so, &tca->INFO_ack); +} + +/* + * Retrieve socket error, clear error if not peek. 
*/ int -sogeterr(struct sonode *so) +sogeterr(struct sonode *so, boolean_t clear_err) { int error; ASSERT(MUTEX_HELD(&so->so_lock)); error = so->so_error; - so->so_error = 0; + if (clear_err) + so->so_error = 0; return (error); } @@ -898,8 +888,7 @@ void soisdisconnected(struct sonode *so, int error) { ASSERT(MUTEX_HELD(&so->so_lock)); - so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING| - SS_LADDR_VALID|SS_FADDR_VALID); + so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE); so->so_error = (ushort_t)error; if (so->so_peercred != NULL) { @@ -935,7 +924,7 @@ void socantsendmore(struct sonode *so) { ASSERT(MUTEX_HELD(&so->so_lock)); - so->so_state = so->so_state & ~SS_FADDR_VALID | SS_CANTSENDMORE; + so->so_state |= SS_CANTSENDMORE; cv_broadcast(&so->so_state_cv); } @@ -1013,13 +1002,11 @@ sowaitprim(struct sonode *so, t_scalar_t request_prim, t_scalar_t ack_prim, if (tpr->error_ack.TLI_error == TSYSERR) { error = tpr->error_ack.UNIX_error; } else { - error = tlitosyserr(tpr->error_ack.TLI_error); + error = proto_tlitosyserr(tpr->error_ack.TLI_error); } dprintso(so, 0, ("error_ack for %d: %d/%d ->%d\n", - tpr->error_ack.ERROR_prim, - tpr->error_ack.TLI_error, - tpr->error_ack.UNIX_error, - error)); + tpr->error_ack.ERROR_prim, tpr->error_ack.TLI_error, + tpr->error_ack.UNIX_error, error)); freemsg(mp); return (error); } @@ -1029,13 +1016,11 @@ sowaitprim(struct sonode *so, t_scalar_t request_prim, t_scalar_t ack_prim, #ifdef DEBUG if (tpr->type == T_ERROR_ACK) { dprintso(so, 0, ("error_ack for %d: %d/%d\n", - tpr->error_ack.ERROR_prim, - tpr->error_ack.TLI_error, + tpr->error_ack.ERROR_prim, tpr->error_ack.TLI_error, tpr->error_ack.UNIX_error)); } else if (tpr->type == T_OK_ACK) { dprintso(so, 0, ("ok_ack for %d, expected %d for %d\n", - tpr->ok_ack.CORRECT_prim, - ack_prim, request_prim)); + tpr->ok_ack.CORRECT_prim, ack_prim, request_prim)); } else { dprintso(so, 0, ("unexpected 
primitive %d, expected %d for %d\n", @@ -1066,11 +1051,13 @@ sowaitokack(struct sonode *so, t_scalar_t request_prim) } /* - * Queue a received TPI ack message on so_ack_mp. + * Queue a received TPI ack message on sti_ack_mp. */ void soqueueack(struct sonode *so, mblk_t *mp) { + sotpi_info_t *sti = SOTOTPI(so); + if (DB_TYPE(mp) != M_PCPROTO) { zcmn_err(getzoneid(), CE_WARN, "sockfs: received unexpected M_PROTO TPI ack. Prim %d\n", @@ -1080,13 +1067,13 @@ soqueueack(struct sonode *so, mblk_t *mp) } mutex_enter(&so->so_lock); - if (so->so_ack_mp != NULL) { - dprintso(so, 1, ("so_ack_mp already set\n")); - freemsg(so->so_ack_mp); - so->so_ack_mp = NULL; + if (sti->sti_ack_mp != NULL) { + dprintso(so, 1, ("sti_ack_mp already set\n")); + freemsg(sti->sti_ack_mp); + sti->sti_ack_mp = NULL; } - so->so_ack_mp = mp; - cv_broadcast(&so->so_ack_cv); + sti->sti_ack_mp = mp; + cv_broadcast(&sti->sti_ack_cv); mutex_exit(&so->so_lock); } @@ -1096,9 +1083,11 @@ soqueueack(struct sonode *so, mblk_t *mp) int sowaitack(struct sonode *so, mblk_t **mpp, clock_t wait) { + sotpi_info_t *sti = SOTOTPI(so); + ASSERT(MUTEX_HELD(&so->so_lock)); - while (so->so_ack_mp == NULL) { + while (sti->sti_ack_mp == NULL) { #ifdef SOCK_TEST if (wait == 0 && sock_test_timelimit != 0) wait = sock_test_timelimit; @@ -1110,16 +1099,16 @@ sowaitack(struct sonode *so, mblk_t **mpp, clock_t wait) clock_t now; time_to_wait(&now, wait); - if (cv_timedwait(&so->so_ack_cv, &so->so_lock, + if (cv_timedwait(&sti->sti_ack_cv, &so->so_lock, now) == -1) { eprintsoline(so, ETIME); return (ETIME); } } else - cv_wait(&so->so_ack_cv, &so->so_lock); + cv_wait(&sti->sti_ack_cv, &so->so_lock); } - *mpp = so->so_ack_mp; + *mpp = sti->sti_ack_mp; #ifdef DEBUG { union T_primitives *tpr; @@ -1135,16 +1124,18 @@ sowaitack(struct sonode *so, mblk_t **mpp, clock_t wait) tpr->type == T_OPTMGMT_ACK); } #endif /* DEBUG */ - so->so_ack_mp = NULL; + sti->sti_ack_mp = NULL; return (0); } /* - * Queue a received T_CONN_IND message on 
so_conn_ind_head/tail. + * Queue a received T_CONN_IND message on sti_conn_ind_head/tail. */ void soqueueconnind(struct sonode *so, mblk_t *mp) { + sotpi_info_t *sti = SOTOTPI(so); + if (DB_TYPE(mp) != M_PROTO) { zcmn_err(getzoneid(), CE_WARN, "sockfs: received unexpected M_PCPROTO T_CONN_IND\n"); @@ -1154,17 +1145,15 @@ soqueueconnind(struct sonode *so, mblk_t *mp) mutex_enter(&so->so_lock); ASSERT(mp->b_next == NULL); - if (so->so_conn_ind_head == NULL) { - so->so_conn_ind_head = mp; - so->so_state |= SS_HASCONNIND; + if (sti->sti_conn_ind_head == NULL) { + sti->sti_conn_ind_head = mp; } else { - ASSERT(so->so_state & SS_HASCONNIND); - ASSERT(so->so_conn_ind_tail->b_next == NULL); - so->so_conn_ind_tail->b_next = mp; + ASSERT(sti->sti_conn_ind_tail->b_next == NULL); + sti->sti_conn_ind_tail->b_next = mp; } - so->so_conn_ind_tail = mp; + sti->sti_conn_ind_tail = mp; /* Wakeup a single consumer of the T_CONN_IND */ - cv_signal(&so->so_connind_cv); + cv_signal(&so->so_acceptq_cv); mutex_exit(&so->so_lock); } @@ -1177,37 +1166,43 @@ int sowaitconnind(struct sonode *so, int fmode, mblk_t **mpp) { mblk_t *mp; + sotpi_info_t *sti = SOTOTPI(so); int error = 0; ASSERT(MUTEX_NOT_HELD(&so->so_lock)); mutex_enter(&so->so_lock); check_error: if (so->so_error) { - error = sogeterr(so); + error = sogeterr(so, B_TRUE); if (error) { mutex_exit(&so->so_lock); return (error); } } - if (so->so_conn_ind_head == NULL) { + if (sti->sti_conn_ind_head == NULL) { if (fmode & (FNDELAY|FNONBLOCK)) { error = EWOULDBLOCK; goto done; } - if (!cv_wait_sig_swap(&so->so_connind_cv, &so->so_lock)) { + + if (so->so_state & SS_CLOSING) { + error = EINTR; + goto done; + } + + if (!cv_wait_sig_swap(&so->so_acceptq_cv, &so->so_lock)) { error = EINTR; goto done; } goto check_error; } - mp = so->so_conn_ind_head; - so->so_conn_ind_head = mp->b_next; + mp = sti->sti_conn_ind_head; + sti->sti_conn_ind_head = mp->b_next; mp->b_next = NULL; - if (so->so_conn_ind_head == NULL) { - ASSERT(so->so_conn_ind_tail 
== mp); - so->so_conn_ind_tail = NULL; - so->so_state &= ~SS_HASCONNIND; + if (sti->sti_conn_ind_head == NULL) { + ASSERT(sti->sti_conn_ind_tail == mp); + sti->sti_conn_ind_tail = NULL; } *mpp = mp; done: @@ -1225,31 +1220,32 @@ soflushconnind(struct sonode *so, t_scalar_t seqno) { mblk_t *prevmp, *mp; struct T_conn_ind *tci; + sotpi_info_t *sti = SOTOTPI(so); mutex_enter(&so->so_lock); - for (prevmp = NULL, mp = so->so_conn_ind_head; mp != NULL; + for (prevmp = NULL, mp = sti->sti_conn_ind_head; mp != NULL; prevmp = mp, mp = mp->b_next) { tci = (struct T_conn_ind *)mp->b_rptr; if (tci->SEQ_number == seqno) { dprintso(so, 1, ("t_discon_ind: found T_CONN_IND %d\n", seqno)); /* Deleting last? */ - if (so->so_conn_ind_tail == mp) { - so->so_conn_ind_tail = prevmp; + if (sti->sti_conn_ind_tail == mp) { + sti->sti_conn_ind_tail = prevmp; } if (prevmp == NULL) { /* Deleting first */ - so->so_conn_ind_head = mp->b_next; + sti->sti_conn_ind_head = mp->b_next; } else { prevmp->b_next = mp->b_next; } mp->b_next = NULL; - if (so->so_conn_ind_head == NULL) { - ASSERT(so->so_conn_ind_tail == NULL); - so->so_state &= ~SS_HASCONNIND; - } else { - ASSERT(so->so_conn_ind_tail != NULL); - } + + ASSERT((sti->sti_conn_ind_head == NULL && + sti->sti_conn_ind_tail == NULL) || + (sti->sti_conn_ind_head != NULL && + sti->sti_conn_ind_tail != NULL)); + so->so_error = ECONNABORTED; mutex_exit(&so->so_lock); @@ -1295,6 +1291,9 @@ sowaitconnected(struct sonode *so, int fmode, int nosig) if (fmode & (FNDELAY|FNONBLOCK)) return (EINPROGRESS); + if (so->so_state & SS_CLOSING) + return (EINTR); + if (nosig) cv_wait(&so->so_state_cv, &so->so_lock); else if (!cv_wait_sig_swap(&so->so_state_cv, &so->so_lock)) { @@ -1309,7 +1308,7 @@ sowaitconnected(struct sonode *so, int fmode, int nosig) } if (so->so_error != 0) { - error = sogeterr(so); + error = sogeterr(so, B_TRUE); ASSERT(error != 0); dprintso(so, 1, ("sowaitconnected: error %d\n", error)); return (error); @@ -1335,11 +1334,13 @@ static void 
so_oob_sig(struct sonode *so, int extrasig, strsigset_t *signals, strpollset_t *pollwakeups) { + sotpi_info_t *sti = SOTOTPI(so); + ASSERT(MUTEX_HELD(&so->so_lock)); ASSERT(so_verify_oobstate(so)); - ASSERT(so->so_oobsigcnt >= so->so_oobcnt); - if (so->so_oobsigcnt > so->so_oobcnt) { + ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt); + if (sti->sti_oobsigcnt > sti->sti_oobcnt) { /* * Signal has already been generated once for this * urgent "event". However, since TCP can receive updated @@ -1353,9 +1354,9 @@ so_oob_sig(struct sonode *so, int extrasig, return; } - so->so_oobsigcnt++; - ASSERT(so->so_oobsigcnt > 0); /* Wraparound */ - ASSERT(so->so_oobsigcnt > so->so_oobcnt); + sti->sti_oobsigcnt++; + ASSERT(sti->sti_oobsigcnt > 0); /* Wraparound */ + ASSERT(sti->sti_oobsigcnt > sti->sti_oobcnt); /* * Record (for select/poll) that urgent data is pending. @@ -1385,15 +1386,17 @@ static mblk_t * so_oob_exdata(struct sonode *so, mblk_t *mp, strsigset_t *signals, strpollset_t *pollwakeups) { + sotpi_info_t *sti = SOTOTPI(so); + ASSERT(MUTEX_HELD(&so->so_lock)); ASSERT(so_verify_oobstate(so)); - ASSERT(so->so_oobsigcnt > so->so_oobcnt); + ASSERT(sti->sti_oobsigcnt > sti->sti_oobcnt); - so->so_oobcnt++; - ASSERT(so->so_oobcnt > 0); /* wraparound? */ - ASSERT(so->so_oobsigcnt >= so->so_oobcnt); + sti->sti_oobcnt++; + ASSERT(sti->sti_oobcnt > 0); /* wraparound? */ + ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt); /* * Set MSGMARK for SIOCATMARK. @@ -1412,11 +1415,13 @@ static mblk_t * so_oob_data(struct sonode *so, mblk_t *mp, strsigset_t *signals, strpollset_t *pollwakeups) { + sotpi_info_t *sti = SOTOTPI(so); + ASSERT(MUTEX_HELD(&so->so_lock)); ASSERT(so_verify_oobstate(so)); - ASSERT(so->so_oobsigcnt >= so->so_oobcnt); + ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt); ASSERT(mp != NULL); /* * For OOBINLINE we keep the data in the T_EXDATA_IND. @@ -1439,7 +1444,7 @@ so_oob_data(struct sonode *so, mblk_t *mp, /* * Caller must hold the mutex. 
* For delayed processing, save the T_DISCON_IND received - * from below on so_discon_ind_mp. + * from below on sti_discon_ind_mp. * When the message is processed the framework will call: * (*func)(so, mp); */ @@ -1448,14 +1453,16 @@ so_save_discon_ind(struct sonode *so, mblk_t *mp, void (*func)(struct sonode *so, mblk_t *)) { + sotpi_info_t *sti = SOTOTPI(so); + ASSERT(MUTEX_HELD(&so->so_lock)); /* * Discard new T_DISCON_IND if we have already received another. - * Currently the earlier message can either be on so_discon_ind_mp + * Currently the earlier message can either be on sti_discon_ind_mp * or being processed. */ - if (so->so_discon_ind_mp != NULL || (so->so_flag & SOASYNC_UNBIND)) { + if (sti->sti_discon_ind_mp != NULL || (so->so_flag & SOASYNC_UNBIND)) { zcmn_err(getzoneid(), CE_WARN, "sockfs: received unexpected additional T_DISCON_IND\n"); freemsg(mp); @@ -1463,13 +1470,13 @@ so_save_discon_ind(struct sonode *so, } mp->b_prev = (mblk_t *)func; mp->b_next = NULL; - so->so_discon_ind_mp = mp; + sti->sti_discon_ind_mp = mp; } /* * Caller must hold the mutex and make sure that either SOLOCKED * or SOASYNC_UNBIND is set. Called from so_unlock_single(). - * Perform delayed processing of T_DISCON_IND message on so_discon_ind_mp. + * Perform delayed processing of T_DISCON_IND message on sti_discon_ind_mp. * Need to ensure that strsock_proto() will not end up sleeping for * SOASYNC_UNBIND, while executing this function. 
*/ @@ -1478,13 +1485,14 @@ so_drain_discon_ind(struct sonode *so) { mblk_t *bp; void (*func)(struct sonode *so, mblk_t *); + sotpi_info_t *sti = SOTOTPI(so); ASSERT(MUTEX_HELD(&so->so_lock)); ASSERT(so->so_flag & (SOLOCKED|SOASYNC_UNBIND)); - /* Process T_DISCON_IND on so_discon_ind_mp */ - if ((bp = so->so_discon_ind_mp) != NULL) { - so->so_discon_ind_mp = NULL; + /* Process T_DISCON_IND on sti_discon_ind_mp */ + if ((bp = sti->sti_discon_ind_mp) != NULL) { + sti->sti_discon_ind_mp = NULL; func = (void (*)())bp->b_prev; bp->b_prev = NULL; @@ -1502,20 +1510,21 @@ so_drain_discon_ind(struct sonode *so) /* * Caller must hold the mutex. - * Remove the T_DISCON_IND on so_discon_ind_mp. + * Remove the T_DISCON_IND on sti_discon_ind_mp. */ void so_flush_discon_ind(struct sonode *so) { mblk_t *bp; + sotpi_info_t *sti = SOTOTPI(so); ASSERT(MUTEX_HELD(&so->so_lock)); /* - * Remove T_DISCON_IND mblk at so_discon_ind_mp. + * Remove T_DISCON_IND mblk at sti_discon_ind_mp. */ - if ((bp = so->so_discon_ind_mp) != NULL) { - so->so_discon_ind_mp = NULL; + if ((bp = sti->sti_discon_ind_mp) != NULL) { + sti->sti_discon_ind_mp = NULL; bp->b_prev = NULL; freemsg(bp); } @@ -1526,9 +1535,9 @@ so_flush_discon_ind(struct sonode *so) * * This function is used to process the T_DISCON_IND message. It does * immediate processing when called from strsock_proto and delayed - * processing of discon_ind saved on so_discon_ind_mp when called from + * processing of discon_ind saved on sti_discon_ind_mp when called from * so_drain_discon_ind. When a T_DISCON_IND message is saved in - * so_discon_ind_mp for delayed processing, this function is registered + * sti_discon_ind_mp for delayed processing, this function is registered * as the callback function to process the message. 
* * SOASYNC_UNBIND should be held in this function, during the non-blocking @@ -1549,6 +1558,7 @@ strsock_discon_ind(struct sonode *so, mblk_t *discon_mp) struct T_unbind_req *ubr; mblk_t *mp; int error; + sotpi_info_t *sti = SOTOTPI(so); ASSERT(MUTEX_HELD(&so->so_lock)); ASSERT(discon_mp); @@ -1571,6 +1581,8 @@ strsock_discon_ind(struct sonode *so, mblk_t *discon_mp) * is the errno name space. */ soisdisconnected(so, tpr->discon_ind.DISCON_reason); + sti->sti_laddr_valid = 0; + sti->sti_faddr_valid = 0; /* * Unbind with the transport without blocking. @@ -1581,14 +1593,14 @@ strsock_discon_ind(struct sonode *so, mblk_t *discon_mp) * * If the socket is not bound, no need to unbind. */ - mp = so->so_unbind_mp; + mp = sti->sti_unbind_mp; if (mp == NULL) { ASSERT(!(so->so_state & SS_ISBOUND)); mutex_exit(&so->so_lock); } else if (!(so->so_state & SS_ISBOUND)) { mutex_exit(&so->so_lock); } else { - so->so_unbind_mp = NULL; + sti->sti_unbind_mp = NULL; /* * Is another T_DISCON_IND being processed. 
@@ -1602,7 +1614,8 @@ strsock_discon_ind(struct sonode *so, mblk_t *discon_mp) */ so->so_flag |= SOASYNC_UNBIND; ASSERT(!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING))); - so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN|SS_LADDR_VALID); + so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN); + sti->sti_laddr_valid = 0; mutex_exit(&so->so_lock); /* @@ -1686,8 +1699,10 @@ strsock_proto(vnode_t *vp, mblk_t *mp, { union T_primitives *tpr; struct sonode *so; + sotpi_info_t *sti; so = VTOSO(vp); + sti = SOTOTPI(so); dprintso(so, 1, ("strsock_proto(%p, %p)\n", (void *)vp, (void *)mp)); @@ -1849,11 +1864,11 @@ strsock_proto(vnode_t *vp, mblk_t *mp, */ struct sockaddr_in *faddr, *sin; - /* Prevent so_faddr_sa from changing while accessed */ + /* Prevent sti_faddr_sa from changing while accessed */ mutex_enter(&so->so_lock); - ASSERT(so->so_faddr_len == + ASSERT(sti->sti_faddr_len == (socklen_t)sizeof (struct sockaddr_in)); - faddr = (struct sockaddr_in *)so->so_faddr_sa; + faddr = (struct sockaddr_in *)sti->sti_faddr_sa; sin = (struct sockaddr_in *)addr; if (addrlen != (t_uscalar_t)sizeof (struct sockaddr_in) || @@ -1866,11 +1881,10 @@ strsock_proto(vnode_t *vp, mblk_t *mp, dprintso(so, 0, ("sockfs: T_UNITDATA_IND mismatch: %s", pr_addr(so->so_family, - (struct sockaddr *)addr, - addrlen))); + (struct sockaddr *)addr, addrlen))); dprintso(so, 0, (" - %s\n", - pr_addr(so->so_family, so->so_faddr_sa, - (t_uscalar_t)so->so_faddr_len))); + pr_addr(so->so_family, sti->sti_faddr_sa, + (t_uscalar_t)sti->sti_faddr_len))); #endif /* DEBUG */ mutex_exit(&so->so_lock); freemsg(mp); @@ -1885,11 +1899,11 @@ strsock_proto(vnode_t *vp, mblk_t *mp, struct sockaddr_in6 *faddr6, *sin6; static struct in6_addr zeroes; /* inits to all zeros */ - /* Prevent so_faddr_sa from changing while accessed */ + /* Prevent sti_faddr_sa from changing while accessed */ mutex_enter(&so->so_lock); - ASSERT(so->so_faddr_len == + ASSERT(sti->sti_faddr_len == (socklen_t)sizeof (struct sockaddr_in6)); - faddr6 = (struct 
sockaddr_in6 *)so->so_faddr_sa; + faddr6 = (struct sockaddr_in6 *)sti->sti_faddr_sa; sin6 = (struct sockaddr_in6 *)addr; /* XXX could we get a mapped address ::ffff:0.0.0.0 ? */ if (addrlen != @@ -1904,11 +1918,10 @@ strsock_proto(vnode_t *vp, mblk_t *mp, dprintso(so, 0, ("sockfs: T_UNITDATA_IND mismatch: %s", pr_addr(so->so_family, - (struct sockaddr *)addr, - addrlen))); + (struct sockaddr *)addr, addrlen))); dprintso(so, 0, (" - %s\n", - pr_addr(so->so_family, so->so_faddr_sa, - (t_uscalar_t)so->so_faddr_len))); + pr_addr(so->so_family, sti->sti_faddr_sa, + (t_uscalar_t)sti->sti_faddr_len))); #endif /* DEBUG */ mutex_exit(&so->so_lock); freemsg(mp); @@ -2008,6 +2021,7 @@ strsock_proto(vnode_t *vp, mblk_t *mp, if (so_getopt_unix_close(opt, optlen)) { mutex_enter(&so->so_lock); socantsendmore(so); + sti->sti_faddr_valid = 0; mutex_exit(&so->so_lock); strsetwerror(SOTOV(so), 0, 0, sogetwrerr); freemsg(mp); @@ -2045,7 +2059,7 @@ strsock_proto(vnode_t *vp, mblk_t *mp, */ dprintso(so, 1, ("T_EXDATA_IND(%p): counts %d/%d state %s\n", - (void *)vp, so->so_oobsigcnt, so->so_oobcnt, + (void *)vp, sti->sti_oobsigcnt, sti->sti_oobcnt, pr_state(so->so_state, so->so_mode))); if (msgdsize(mp->b_cont) == 0) { @@ -2113,8 +2127,8 @@ strsock_proto(vnode_t *vp, mblk_t *mp, * adjust the OOB count and OOB signal count * just incremented for the new OOB data. 
*/ - so->so_oobcnt--; - so->so_oobsigcnt--; + sti->sti_oobcnt--; + sti->sti_oobsigcnt--; mutex_exit(QLOCK(qp)); mutex_exit(&so->so_lock); return (NULL); @@ -2141,15 +2155,15 @@ strsock_proto(vnode_t *vp, mblk_t *mp, dprintso(so, 1, ("after outofline T_EXDATA_IND(%p): " "counts %d/%d poll 0x%x sig 0x%x state %s\n", - (void *)vp, so->so_oobsigcnt, - so->so_oobcnt, *pollwakeups, *allmsgsigs, + (void *)vp, sti->sti_oobsigcnt, + sti->sti_oobcnt, *pollwakeups, *allmsgsigs, pr_state(so->so_state, so->so_mode))); } else { dprintso(so, 1, ("after inline T_EXDATA_IND(%p): " "counts %d/%d poll 0x%x sig 0x%x state %s\n", - (void *)vp, so->so_oobsigcnt, - so->so_oobcnt, *pollwakeups, *allmsgsigs, + (void *)vp, sti->sti_oobsigcnt, + sti->sti_oobcnt, *pollwakeups, *allmsgsigs, pr_state(so->so_state, so->so_mode))); } #endif /* DEBUG */ @@ -2194,13 +2208,15 @@ strsock_proto(vnode_t *vp, mblk_t *mp, * For AF_UNIX require the identical length. */ if (so->so_family == AF_UNIX ? - addrlen != (t_uscalar_t)sizeof (so->so_ux_laddr) : - addrlen > (t_uscalar_t)so->so_faddr_maxlen) { + addrlen != (t_uscalar_t)sizeof (sti->sti_ux_laddr) : + addrlen > (t_uscalar_t)sti->sti_faddr_maxlen) { zcmn_err(getzoneid(), CE_WARN, "sockfs: T_conn_con with different " "length %u/%d\n", addrlen, conn_con->RES_length); soisdisconnected(so, EPROTO); + sti->sti_laddr_valid = 0; + sti->sti_faddr_valid = 0; mutex_exit(&so->so_lock); strsetrerror(SOTOV(so), 0, 0, sogetrderr); strsetwerror(SOTOV(so), 0, 0, sogetwrerr); @@ -2240,10 +2256,10 @@ strsock_proto(vnode_t *vp, mblk_t *mp, * Save for getpeername. 
*/ if (so->so_family != AF_UNIX) { - so->so_faddr_len = (socklen_t)addrlen; - ASSERT(so->so_faddr_len <= so->so_faddr_maxlen); - bcopy(addr, so->so_faddr_sa, addrlen); - so->so_state |= SS_FADDR_VALID; + sti->sti_faddr_len = (socklen_t)addrlen; + ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen); + bcopy(addr, sti->sti_faddr_sa, addrlen); + sti->sti_faddr_valid = 1; } if (so->so_peercred != NULL) @@ -2275,7 +2291,7 @@ strsock_proto(vnode_t *vp, mblk_t *mp, case T_CONN_IND: /* * Verify the min size and queue the message on - * the so_conn_ind_head/tail list. + * the sti_conn_ind_head/tail list. */ if (MBLKL(mp) < sizeof (struct T_conn_ind)) { zcmn_err(getzoneid(), CE_WARN, @@ -2301,7 +2317,7 @@ strsock_proto(vnode_t *vp, mblk_t *mp, tpr->type = T_CONN_IND; - fbso = kssl_find_fallback(so->so_kssl_ent); + fbso = kssl_find_fallback(sti->sti_kssl_ent); /* * No fallback: the remote will timeout and @@ -2391,6 +2407,7 @@ strsock_proto(vnode_t *vp, mblk_t *mp, if ((so->so_state & SS_CANTRCVMORE) && (so->so_family == AF_UNIX)) { socantsendmore(so); + sti->sti_faddr_valid = 0; mutex_exit(&so->so_lock); strsetwerror(SOTOV(so), 0, 0, sogetwrerr); dprintso(so, 1, @@ -2468,7 +2485,7 @@ strsock_proto(vnode_t *vp, mblk_t *mp, /* Compare just IP address and port */ struct sockaddr_in *sin1, *sin2; - sin1 = (struct sockaddr_in *)so->so_faddr_sa; + sin1 = (struct sockaddr_in *)sti->sti_faddr_sa; sin2 = (struct sockaddr_in *)addr; if (addrlen == sizeof (struct sockaddr_in) && sin1->sin_port == sin2->sin_port && @@ -2481,7 +2498,7 @@ strsock_proto(vnode_t *vp, mblk_t *mp, /* Compare just IP address and port. 
Not flow */ struct sockaddr_in6 *sin1, *sin2; - sin1 = (struct sockaddr_in6 *)so->so_faddr_sa; + sin1 = (struct sockaddr_in6 *)sti->sti_faddr_sa; sin2 = (struct sockaddr_in6 *)addr; if (addrlen == sizeof (struct sockaddr_in6) && sin1->sin6_port == sin2->sin6_port && @@ -2491,16 +2508,16 @@ strsock_proto(vnode_t *vp, mblk_t *mp, break; } case AF_UNIX: - faddr = &so->so_ux_faddr; + faddr = &sti->sti_ux_faddr; faddr_len = - (t_uscalar_t)sizeof (so->so_ux_faddr); + (t_uscalar_t)sizeof (sti->sti_ux_faddr); if (faddr_len == addrlen && bcmp(addr, faddr, addrlen) == 0) match = B_TRUE; break; default: - faddr = so->so_faddr_sa; - faddr_len = (t_uscalar_t)so->so_faddr_len; + faddr = sti->sti_faddr_sa; + faddr_len = (t_uscalar_t)sti->sti_faddr_len; if (faddr_len == addrlen && bcmp(addr, faddr, addrlen) == 0) match = B_TRUE; @@ -2512,11 +2529,10 @@ strsock_proto(vnode_t *vp, mblk_t *mp, dprintso(so, 0, ("sockfs: T_UDERR_IND mismatch: %s - ", pr_addr(so->so_family, - (struct sockaddr *)addr, - addrlen))); + (struct sockaddr *)addr, addrlen))); dprintso(so, 0, ("%s\n", - pr_addr(so->so_family, so->so_faddr_sa, - so->so_faddr_len))); + pr_addr(so->so_family, sti->sti_faddr_sa, + sti->sti_faddr_len))); #endif /* DEBUG */ mutex_exit(&so->so_lock); freemsg(mp); @@ -2545,8 +2561,8 @@ strsock_proto(vnode_t *vp, mblk_t *mp, } /* * If the application asked for delayed errors - * record the T_UDERROR_IND so_eaddr_mp and the reason in - * so_delayed_error for delayed error posting. If the reason + * record the T_UDERROR_IND sti_eaddr_mp and the reason in + * sti_delayed_error for delayed error posting. If the reason * is zero use ECONNRESET. 
* Note that delayed error indications do not make sense for * AF_UNIX sockets since sendto checks that the destination @@ -2557,15 +2573,15 @@ strsock_proto(vnode_t *vp, mblk_t *mp, freemsg(mp); return (NULL); } - if (so->so_eaddr_mp != NULL) - freemsg(so->so_eaddr_mp); + if (sti->sti_eaddr_mp != NULL) + freemsg(sti->sti_eaddr_mp); - so->so_eaddr_mp = mp; + sti->sti_eaddr_mp = mp; if (tudi->ERROR_type != 0) error = tudi->ERROR_type; else error = ECONNRESET; - so->so_delayed_error = (ushort_t)error; + sti->sti_delayed_error = (ushort_t)error; mutex_exit(&so->so_lock); return (NULL); } @@ -2700,8 +2716,10 @@ strsock_misc(vnode_t *vp, mblk_t *mp, strsigset_t *allmsgsigs, strpollset_t *pollwakeups) { struct sonode *so; + sotpi_info_t *sti; so = VTOSO(vp); + sti = SOTOTPI(so); dprintso(so, 1, ("strsock_misc(%p, %p, 0x%x)\n", (void *)vp, (void *)mp, DB_TYPE(mp))); @@ -2724,15 +2742,14 @@ strsock_misc(vnode_t *vp, mblk_t *mp, mutex_enter(&so->so_lock); dprintso(so, 1, ("SIGURG(%p): counts %d/%d state %s\n", - (void *)vp, so->so_oobsigcnt, - so->so_oobcnt, + (void *)vp, sti->sti_oobsigcnt, sti->sti_oobcnt, pr_state(so->so_state, so->so_mode))); so_oob_sig(so, 1, allmsgsigs, pollwakeups); dprintso(so, 1, ("after SIGURG(%p): counts %d/%d " " poll 0x%x sig 0x%x state %s\n", - (void *)vp, so->so_oobsigcnt, - so->so_oobcnt, *pollwakeups, *allmsgsigs, + (void *)vp, sti->sti_oobsigcnt, sti->sti_oobcnt, + *pollwakeups, *allmsgsigs, pr_state(so->so_state, so->so_mode))); mutex_exit(&so->so_lock); } @@ -2873,53 +2890,118 @@ bad: return (error); } +/* + * Wrapper for getmsg. If the socket has been converted to a stream + * pass the request to the stream head. + */ +int +sock_getmsg( + struct vnode *vp, + struct strbuf *mctl, + struct strbuf *mdata, + uchar_t *prip, + int *flagsp, + int fmode, + rval_t *rvp +) +{ + struct sonode *so; + + ASSERT(vp->v_type == VSOCK); + /* + * Use the stream head to find the real socket vnode. + * This is needed when namefs sits above sockfs. 
Some + * sockets (like SCTP) are not streams. + */ + if (!vp->v_stream) { + return (ENOSTR); + } + ASSERT(vp->v_stream->sd_vnode); + vp = vp->v_stream->sd_vnode; + ASSERT(vn_matchops(vp, socket_vnodeops)); + so = VTOSO(vp); + dprintso(so, 1, ("sock_getmsg(%p) %s\n", + (void *)so, pr_state(so->so_state, so->so_mode))); + + if (so->so_version == SOV_STREAM) { + /* The imaginary "sockmod" has been popped - act as a stream */ + return (strgetmsg(vp, mctl, mdata, prip, flagsp, fmode, rvp)); + } + eprintsoline(so, ENOSTR); + return (ENOSTR); +} /* - * Translate a TLI(/XTI) error into a system error as best we can. + * Wrapper for putmsg. If the socket has been converted to a stream + * pass the request to the stream head. + * + * Note that a while a regular socket (SOV_SOCKSTREAM) does support the + * streams ioctl set it does not support putmsg and getmsg. + * Allowing putmsg would prevent sockfs from tracking the state of + * the socket/transport and would also invalidate the locking in sockfs. 
*/ -static const int tli_errs[] = { - 0, /* no error */ - EADDRNOTAVAIL, /* TBADADDR */ - ENOPROTOOPT, /* TBADOPT */ - EACCES, /* TACCES */ - EBADF, /* TBADF */ - EADDRNOTAVAIL, /* TNOADDR */ - EPROTO, /* TOUTSTATE */ - ECONNABORTED, /* TBADSEQ */ - 0, /* TSYSERR - will never get */ - EPROTO, /* TLOOK - should never be sent by transport */ - EMSGSIZE, /* TBADDATA */ - EMSGSIZE, /* TBUFOVFLW */ - EPROTO, /* TFLOW */ - EWOULDBLOCK, /* TNODATA */ - EPROTO, /* TNODIS */ - EPROTO, /* TNOUDERR */ - EINVAL, /* TBADFLAG */ - EPROTO, /* TNOREL */ - EOPNOTSUPP, /* TNOTSUPPORT */ - EPROTO, /* TSTATECHNG */ - /* following represent error namespace expansion with XTI */ - EPROTO, /* TNOSTRUCTYPE - never sent by transport */ - EPROTO, /* TBADNAME - never sent by transport */ - EPROTO, /* TBADQLEN - never sent by transport */ - EADDRINUSE, /* TADDRBUSY */ - EBADF, /* TINDOUT */ - EBADF, /* TPROVMISMATCH */ - EBADF, /* TRESQLEN */ - EBADF, /* TRESADDR */ - EPROTO, /* TQFULL - never sent by transport */ - EPROTO, /* TPROTO */ -}; +int +sock_putmsg( + struct vnode *vp, + struct strbuf *mctl, + struct strbuf *mdata, + uchar_t pri, + int flag, + int fmode +) +{ + struct sonode *so; -static int -tlitosyserr(int terr) + ASSERT(vp->v_type == VSOCK); + /* + * Use the stream head to find the real socket vnode. + * This is needed when namefs sits above sockfs. + */ + if (!vp->v_stream) { + return (ENOSTR); + } + ASSERT(vp->v_stream->sd_vnode); + vp = vp->v_stream->sd_vnode; + ASSERT(vn_matchops(vp, socket_vnodeops)); + so = VTOSO(vp); + + dprintso(so, 1, ("sock_putmsg(%p) %s\n", + (void *)so, pr_state(so->so_state, so->so_mode))); + + if (so->so_version == SOV_STREAM) { + /* The imaginary "sockmod" has been popped - act as a stream */ + return (strputmsg(vp, mctl, mdata, pri, flag, fmode)); + } + eprintsoline(so, ENOSTR); + return (ENOSTR); +} + +/* + * Special function called only from f_getfl(). + * Returns FASYNC if the SS_ASYNC flag is set on a socket, else 0. 
+ * No locks are acquired here, so it is safe to use while uf_lock is held. + * This exists solely for BSD fcntl() FASYNC compatibility. + */ +int +sock_getfasync(vnode_t *vp) { - ASSERT(terr != TSYSERR); - if (terr >= (sizeof (tli_errs) / sizeof (tli_errs[0]))) - return (EPROTO); + struct sonode *so; + + ASSERT(vp->v_type == VSOCK); + /* + * For stream model, v_stream is used; For non-stream, v_stream always + * equals NULL + */ + if (vp->v_stream != NULL) + so = VTOSO(vp->v_stream->sd_vnode); else - return (tli_errs[terr]); + so = VTOSO(vp); + + if (so->so_version == SOV_STREAM || !(so->so_state & SS_ASYNC)) + return (0); + + return (FASYNC); } /* diff --git a/usr/src/uts/common/fs/sockfs/socksubr.c b/usr/src/uts/common/fs/sockfs/socksubr.c index 33a6841f16..b82adb1789 100644 --- a/usr/src/uts/common/fs/sockfs/socksubr.c +++ b/usr/src/uts/common/fs/sockfs/socksubr.c @@ -73,6 +73,9 @@ #include <c2/audit.h> #include <fs/sockfs/nl7c.h> +#include <fs/sockfs/sockcommon.h> +#include <fs/sockfs/socktpi.h> +#include <fs/sockfs/socktpi_impl.h> /* * Macros that operate on struct cmsghdr. @@ -88,18 +91,16 @@ ((uintptr_t)(cmsg) + (cmsg)->cmsg_len <= (uintptr_t)(end))) #define SO_LOCK_WAKEUP_TIME 3000 /* Wakeup time in milliseconds */ -static struct kmem_cache *socktpi_cache, *socktpi_unix_cache; -struct kmem_cache *socktpi_sod_cache; - dev_t sockdev; /* For fsid in getattr */ int sockfs_defer_nl7c_init = 0; -struct sockparams *sphead; -krwlock_t splist_lock; struct socklist socklist; +struct kmem_cache *socket_cache; + static int sockfs_update(kstat_t *, int); static int sockfs_snapshot(kstat_t *, void *, int); +extern smod_info_t *sotpi_smod_create(void); extern void sendfile_init(); @@ -124,7 +125,7 @@ struct k_sockinfo { * Translate from a device pathname (e.g. "/dev/tcp") to a vnode. * Returns with the vnode held. 
*/ -static int +int sogetvp(char *devpath, vnode_t **vpp, int uioflag) { struct snode *csp; @@ -133,6 +134,7 @@ sogetvp(char *devpath, vnode_t **vpp, int uioflag) int error; ASSERT(uioflag == UIO_SYSSPACE || uioflag == UIO_USERSPACE); + /* * Lookup the underlying filesystem vnode. */ @@ -179,382 +181,6 @@ sogetvp(char *devpath, vnode_t **vpp, int uioflag) } /* - * Add or delete (latter if devpath is NULL) an enter to the sockparams - * table. If devpathlen is zero the devpath with not be kmem_freed. Otherwise - * this routine assumes that the caller has kmem_alloced devpath/devpathlen - * for this routine to consume. - * The zero devpathlen could be used if the kernel wants to create entries - * itself by calling sockconfig(1,2,3, "/dev/tcp", 0); - */ -int -soconfig(int domain, int type, int protocol, - char *devpath, int devpathlen) -{ - struct sockparams **spp; - struct sockparams *sp; - int error = 0; - - dprint(0, ("soconfig(%d,%d,%d,%s,%d)\n", - domain, type, protocol, devpath, devpathlen)); - - if (sockfs_defer_nl7c_init) { - nl7c_init(); - sockfs_defer_nl7c_init = 0; - } - - /* - * Look for an existing match. 
- */ - rw_enter(&splist_lock, RW_WRITER); - for (spp = &sphead; (sp = *spp) != NULL; spp = &sp->sp_next) { - if (sp->sp_domain == domain && - sp->sp_type == type && - sp->sp_protocol == protocol) { - break; - } - } - if (devpath == NULL) { - ASSERT(devpathlen == 0); - - /* Delete existing entry */ - if (sp == NULL) { - error = ENXIO; - goto done; - } - /* Unlink and free existing entry */ - *spp = sp->sp_next; - ASSERT(sp->sp_vnode); - VN_RELE(sp->sp_vnode); - if (sp->sp_devpathlen != 0) - kmem_free(sp->sp_devpath, sp->sp_devpathlen); - kmem_free(sp, sizeof (*sp)); - } else { - vnode_t *vp; - - /* Add new entry */ - if (sp != NULL) { - error = EEXIST; - goto done; - } - - error = sogetvp(devpath, &vp, UIO_SYSSPACE); - if (error) { - dprint(0, ("soconfig: vp %s failed with %d\n", - devpath, error)); - goto done; - } - - dprint(0, ("soconfig: %s => vp %p, dev 0x%lx\n", - devpath, (void *)vp, vp->v_rdev)); - - sp = kmem_alloc(sizeof (*sp), KM_SLEEP); - sp->sp_domain = domain; - sp->sp_type = type; - sp->sp_protocol = protocol; - sp->sp_devpath = devpath; - sp->sp_devpathlen = devpathlen; - sp->sp_vnode = vp; - sp->sp_next = NULL; - *spp = sp; - } -done: - rw_exit(&splist_lock); - if (error) { - if (devpath != NULL) - kmem_free(devpath, devpathlen); -#ifdef SOCK_DEBUG - eprintline(error); -#endif /* SOCK_DEBUG */ - } - return (error); -} - -/* - * Lookup an entry in the sockparams list based on the triple. - * If no entry is found and devpath is not NULL translate devpath to a - * vnode. Note that devpath is a pointer to a user address! - * Returns with the vnode held. - * - * When this routine uses devpath it does not create an entry in the sockparams - * list since this routine can run on behalf of any user and one user - * should not be able to effect the transport used by another user. - * - * In order to return the correct error this routine has to do wildcard scans - * of the list. 
The errors are (in decreasing precedence): - * EAFNOSUPPORT - address family not in list - * EPROTONOSUPPORT - address family supported but not protocol. - * EPROTOTYPE - address family and protocol supported but not socket type. - */ -vnode_t * -solookup(int domain, int type, int protocol, char *devpath, int *errorp) -{ - struct sockparams *sp; - int error; - vnode_t *vp; - - rw_enter(&splist_lock, RW_READER); - for (sp = sphead; sp != NULL; sp = sp->sp_next) { - if (sp->sp_domain == domain && - sp->sp_type == type && - sp->sp_protocol == protocol) { - break; - } - } - if (sp == NULL) { - dprint(0, ("solookup(%d,%d,%d) not found\n", - domain, type, protocol)); - if (devpath == NULL) { - /* Determine correct error code */ - int found = 0; - - for (sp = sphead; sp != NULL; sp = sp->sp_next) { - if (sp->sp_domain == domain && found < 1) - found = 1; - if (sp->sp_domain == domain && - sp->sp_protocol == protocol && found < 2) - found = 2; - } - rw_exit(&splist_lock); - switch (found) { - case 0: - *errorp = EAFNOSUPPORT; - break; - case 1: - *errorp = EPROTONOSUPPORT; - break; - case 2: - *errorp = EPROTOTYPE; - break; - } - return (NULL); - } - rw_exit(&splist_lock); - - /* - * Return vp based on devpath. - * Do not enter into table to avoid random users - * modifying the sockparams list. - */ - error = sogetvp(devpath, &vp, UIO_USERSPACE); - if (error) { - dprint(0, ("solookup: vp %p failed with %d\n", - (void *)devpath, error)); - *errorp = EPROTONOSUPPORT; - return (NULL); - } - dprint(0, ("solookup: %p => vp %p, dev 0x%lx\n", - (void *)devpath, (void *)vp, vp->v_rdev)); - - return (vp); - } - dprint(0, ("solookup(%d,%d,%d) vp %p devpath %s\n", - domain, type, protocol, (void *)sp->sp_vnode, sp->sp_devpath)); - - vp = sp->sp_vnode; - VN_HOLD(vp); - rw_exit(&splist_lock); - return (vp); -} - -/* - * Return a socket vnode. - * - * Assumes that the caller is "passing" an VN_HOLD for accessvp i.e. - * when the socket is freed a VN_RELE will take place. 
- * - * Note that sockets assume that the driver will clone (either itself - * or by using the clone driver) i.e. a socket() call will always - * result in a new vnode being created. - */ -struct vnode * -makesockvp(struct vnode *accessvp, int domain, int type, int protocol) -{ - kmem_cache_t *cp; - struct sonode *so; - struct vnode *vp; - time_t now; - dev_t dev; - - cp = (domain == AF_UNIX) ? socktpi_unix_cache : socktpi_cache; - so = kmem_cache_alloc(cp, KM_SLEEP); - so->so_cache = cp; - so->so_obj = so; - vp = SOTOV(so); - now = gethrestime_sec(); - - so->so_flag = 0; - ASSERT(so->so_accessvp == NULL); - so->so_accessvp = accessvp; - dev = accessvp->v_rdev; - - /* - * Record in so_flag that it is a clone. - */ - if (getmajor(dev) == clone_major) { - so->so_flag |= SOCLONE; - } - so->so_dev = dev; - - so->so_state = 0; - so->so_mode = 0; - - so->so_fsid = sockdev; - so->so_atime = now; - so->so_mtime = now; - so->so_ctime = now; /* Never modified */ - so->so_count = 0; - - so->so_family = (short)domain; - so->so_type = (short)type; - so->so_protocol = (short)protocol; - so->so_pushcnt = 0; - - so->so_options = 0; - so->so_linger.l_onoff = 0; - so->so_linger.l_linger = 0; - so->so_sndbuf = 0; - so->so_rcvbuf = 0; - so->so_sndlowat = 0; - so->so_rcvlowat = 0; -#ifdef notyet - so->so_sndtimeo = 0; - so->so_rcvtimeo = 0; -#endif /* notyet */ - so->so_error = 0; - so->so_delayed_error = 0; - - ASSERT(so->so_oobmsg == NULL); - so->so_oobcnt = 0; - so->so_oobsigcnt = 0; - so->so_pgrp = 0; - so->so_provinfo = NULL; - - ASSERT(so->so_laddr_sa == NULL && so->so_faddr_sa == NULL); - so->so_laddr_len = so->so_faddr_len = 0; - so->so_laddr_maxlen = so->so_faddr_maxlen = 0; - so->so_eaddr_mp = NULL; - so->so_priv = NULL; - - so->so_peercred = NULL; - - ASSERT(so->so_ack_mp == NULL); - ASSERT(so->so_conn_ind_head == NULL); - ASSERT(so->so_conn_ind_tail == NULL); - ASSERT(so->so_ux_bound_vp == NULL); - ASSERT(so->so_unbind_mp == NULL); - - vn_reinit(vp); - vp->v_vfsp = rootvfs; 
- vp->v_type = VSOCK; - vp->v_rdev = so->so_dev; - vn_exists(vp); - - return (vp); -} - -void -sockfree(struct sonode *so) -{ - mblk_t *mp; - vnode_t *vp; - - ASSERT(so->so_count == 0); - ASSERT(so->so_accessvp); - ASSERT(so->so_discon_ind_mp == NULL); - - vp = so->so_accessvp; - VN_RELE(vp); - - /* - * Protect so->so_[lf]addr_sa so that sockfs_snapshot() can safely - * indirect them. It also uses so_accessvp as a validity test. - */ - mutex_enter(&so->so_lock); - - so->so_accessvp = NULL; - - if (so->so_laddr_sa) { - ASSERT((caddr_t)so->so_faddr_sa == - (caddr_t)so->so_laddr_sa + so->so_laddr_maxlen); - ASSERT(so->so_faddr_maxlen == so->so_laddr_maxlen); - so->so_state &= ~(SS_LADDR_VALID | SS_FADDR_VALID); - kmem_free(so->so_laddr_sa, so->so_laddr_maxlen * 2); - so->so_laddr_sa = NULL; - so->so_laddr_len = so->so_laddr_maxlen = 0; - so->so_faddr_sa = NULL; - so->so_faddr_len = so->so_faddr_maxlen = 0; - } - - mutex_exit(&so->so_lock); - - if ((mp = so->so_eaddr_mp) != NULL) { - freemsg(mp); - so->so_eaddr_mp = NULL; - so->so_delayed_error = 0; - } - if ((mp = so->so_ack_mp) != NULL) { - freemsg(mp); - so->so_ack_mp = NULL; - } - if ((mp = so->so_conn_ind_head) != NULL) { - mblk_t *mp1; - - while (mp) { - mp1 = mp->b_next; - mp->b_next = NULL; - freemsg(mp); - mp = mp1; - } - so->so_conn_ind_head = so->so_conn_ind_tail = NULL; - so->so_state &= ~SS_HASCONNIND; - } -#ifdef DEBUG - mutex_enter(&so->so_lock); - ASSERT(so_verify_oobstate(so)); - mutex_exit(&so->so_lock); -#endif /* DEBUG */ - if ((mp = so->so_oobmsg) != NULL) { - freemsg(mp); - so->so_oobmsg = NULL; - so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA); - } - - if ((mp = so->so_nl7c_rcv_mp) != NULL) { - so->so_nl7c_rcv_mp = NULL; - freemsg(mp); - } - so->so_nl7c_rcv_rval = 0; - if (so->so_nl7c_uri != NULL) { - nl7c_urifree(so); - /* urifree() cleared nl7c_uri */ - } - if (so->so_nl7c_flags) { - so->so_nl7c_flags = 0; - } - - if (so->so_direct != NULL) { - sodirect_t *sodp = so->so_direct; - - 
ASSERT(sodp->sod_uioafh == NULL); - - so->so_direct = NULL; - kmem_cache_free(socktpi_sod_cache, sodp); - } - - ASSERT(so->so_ux_bound_vp == NULL); - if ((mp = so->so_unbind_mp) != NULL) { - freemsg(mp); - so->so_unbind_mp = NULL; - } - vn_invalid(SOTOV(so)); - - if (so->so_peercred != NULL) - crfree(so->so_peercred); - - kmem_cache_free(so->so_cache, so->so_obj); -} - -/* * Update the accessed, updated, or changed times in an sonode * with the current time. * @@ -569,133 +195,20 @@ so_update_attrs(struct sonode *so, int flag) { time_t now = gethrestime_sec(); + if (SOCK_IS_NONSTR(so)) + return; + mutex_enter(&so->so_lock); so->so_flag |= flag; if (flag & SOACC) - so->so_atime = now; + SOTOTPI(so)->sti_atime = now; if (flag & SOMOD) - so->so_mtime = now; + SOTOTPI(so)->sti_mtime = now; mutex_exit(&so->so_lock); } -/*ARGSUSED*/ -static int -socktpi_constructor(void *buf, void *cdrarg, int kmflags) -{ - struct sonode *so = buf; - struct vnode *vp; - - vp = so->so_vnode = vn_alloc(kmflags); - if (vp == NULL) { - return (-1); - } - vn_setops(vp, socktpi_vnodeops); - vp->v_data = so; - - so->so_direct = NULL; - - so->so_nl7c_flags = 0; - so->so_nl7c_uri = NULL; - so->so_nl7c_rcv_mp = NULL; - - so->so_oobmsg = NULL; - so->so_ack_mp = NULL; - so->so_conn_ind_head = NULL; - so->so_conn_ind_tail = NULL; - so->so_discon_ind_mp = NULL; - so->so_ux_bound_vp = NULL; - so->so_unbind_mp = NULL; - so->so_accessvp = NULL; - so->so_laddr_sa = NULL; - so->so_faddr_sa = NULL; - so->so_ops = &sotpi_sonodeops; - - mutex_init(&so->so_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&so->so_plumb_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&so->so_state_cv, NULL, CV_DEFAULT, NULL); - cv_init(&so->so_ack_cv, NULL, CV_DEFAULT, NULL); - cv_init(&so->so_connind_cv, NULL, CV_DEFAULT, NULL); - cv_init(&so->so_want_cv, NULL, CV_DEFAULT, NULL); - - return (0); -} - -/*ARGSUSED1*/ -static void -socktpi_destructor(void *buf, void *cdrarg) -{ - struct sonode *so = buf; - struct vnode *vp = SOTOV(so); 
- - ASSERT(so->so_direct == NULL); - - ASSERT(so->so_nl7c_flags == 0); - ASSERT(so->so_nl7c_uri == NULL); - ASSERT(so->so_nl7c_rcv_mp == NULL); - - ASSERT(so->so_oobmsg == NULL); - ASSERT(so->so_ack_mp == NULL); - ASSERT(so->so_conn_ind_head == NULL); - ASSERT(so->so_conn_ind_tail == NULL); - ASSERT(so->so_discon_ind_mp == NULL); - ASSERT(so->so_ux_bound_vp == NULL); - ASSERT(so->so_unbind_mp == NULL); - ASSERT(so->so_ops == &sotpi_sonodeops); - - ASSERT(vn_matchops(vp, socktpi_vnodeops)); - ASSERT(vp->v_data == so); - - vn_free(vp); - - mutex_destroy(&so->so_lock); - mutex_destroy(&so->so_plumb_lock); - cv_destroy(&so->so_state_cv); - cv_destroy(&so->so_ack_cv); - cv_destroy(&so->so_connind_cv); - cv_destroy(&so->so_want_cv); -} - -static int -socktpi_unix_constructor(void *buf, void *cdrarg, int kmflags) -{ - int retval; - - if ((retval = socktpi_constructor(buf, cdrarg, kmflags)) == 0) { - struct sonode *so = (struct sonode *)buf; - - mutex_enter(&socklist.sl_lock); - - so->so_next = socklist.sl_list; - so->so_prev = NULL; - if (so->so_next != NULL) - so->so_next->so_prev = so; - socklist.sl_list = so; - - mutex_exit(&socklist.sl_lock); - - } - return (retval); -} - -static void -socktpi_unix_destructor(void *buf, void *cdrarg) -{ - struct sonode *so = (struct sonode *)buf; - - mutex_enter(&socklist.sl_lock); - - if (so->so_next != NULL) - so->so_next->so_prev = so->so_prev; - if (so->so_prev != NULL) - so->so_prev->so_next = so->so_next; - else - socklist.sl_list = so->so_next; - - mutex_exit(&socklist.sl_lock); - - socktpi_destructor(buf, cdrarg); -} - +extern so_create_func_t sock_comm_create_function; +extern so_destroy_func_t sock_comm_destroy_function; /* * Init function called when sockfs is loaded. 
*/ @@ -716,21 +229,20 @@ sockinit(int fstype, char *name) return (error); } - error = vn_make_ops(name, socktpi_vnodeops_template, &socktpi_vnodeops); + error = vn_make_ops(name, socket_vnodeops_template, + &socket_vnodeops); if (error != 0) { - err_str = "sockinit: bad sock vnode ops template"; + err_str = "sockinit: bad socket vnode ops template"; /* vn_make_ops() does not reset socktpi_vnodeops on failure. */ - socktpi_vnodeops = NULL; + socket_vnodeops = NULL; goto failure; } - error = sosctp_init(); - if (error != 0) { - err_str = NULL; - goto failure; - } + socket_cache = kmem_cache_create("socket_cache", + sizeof (struct sonode), 0, sonode_constructor, + sonode_destructor, NULL, NULL, NULL, 0); - error = sosdp_init(); + error = socktpi_init(); if (error != 0) { err_str = NULL; goto failure; @@ -743,21 +255,18 @@ sockinit(int fstype, char *name) } /* - * Create sonode caches. We create a special one for AF_UNIX so - * that we can track them for netstat(1m). + * Set up the default create and destroy functions */ - socktpi_cache = kmem_cache_create("socktpi_cache", - sizeof (struct sonode), 0, socktpi_constructor, - socktpi_destructor, NULL, NULL, NULL, 0); - - socktpi_unix_cache = kmem_cache_create("socktpi_unix_cache", - sizeof (struct sonode), 0, socktpi_unix_constructor, - socktpi_unix_destructor, NULL, NULL, NULL, 0); + sock_comm_create_function = socket_sonode_create; + sock_comm_destroy_function = socket_sonode_destroy; /* * Build initial list mapping socket parameters to vnode. 
*/ - rw_init(&splist_lock, NULL, RW_DEFAULT, NULL); + smod_init(); + smod_add(sotpi_smod_create()); + + sockparams_init(); /* * If sockets are needed before init runs /sbin/soconfig @@ -786,8 +295,8 @@ sockinit(int fstype, char *name) failure: (void) vfs_freevfsops_by_type(fstype); - if (socktpi_vnodeops != NULL) - vn_freevnodeops(socktpi_vnodeops); + if (socket_vnodeops != NULL) + vn_freevnodeops(socket_vnodeops); if (err_str != NULL) zcmn_err(GLOBAL_ZONEID, CE_WARN, err_str); return (error); @@ -820,15 +329,18 @@ so_unlock_single(struct sonode *so, int flag) ASSERT(flag & (SOLOCKED|SOASYNC_UNBIND)); ASSERT((flag & ~(SOLOCKED|SOASYNC_UNBIND)) == 0); ASSERT(so->so_flag & flag); - /* - * Process the T_DISCON_IND on so_discon_ind_mp. + * Process the T_DISCON_IND on sti_discon_ind_mp. * * Call to so_drain_discon_ind will result in so_lock * being dropped and re-acquired later. */ - if (so->so_discon_ind_mp != NULL) - so_drain_discon_ind(so); + if (!SOCK_IS_NONSTR(so)) { + sotpi_info_t *sti = SOTOTPI(so); + + if (sti->sti_discon_ind_mp != NULL) + so_drain_discon_ind(so); + } if (so->so_flag & SOWANT) cv_broadcast(&so->so_want_cv); @@ -1076,7 +588,7 @@ so_addr_verify(struct sonode *so, const struct sockaddr *name, break; } case AF_UNIX: - if (so->so_state & SS_FADDR_NOXLATE) { + if (SOTOTPI(so)->sti_faddr_noxlate) { return (0); } if (namelen < (socklen_t)sizeof (short)) { @@ -1122,13 +634,14 @@ so_ux_addr_xlate(struct sonode *so, struct sockaddr *name, vnode_t *vp; void *addr; socklen_t addrlen; + sotpi_info_t *sti = SOTOTPI(so); dprintso(so, 1, ("so_ux_addr_xlate(%p, %p, %d, %d)\n", (void *)so, (void *)name, namelen, checkaccess)); ASSERT(name != NULL); ASSERT(so->so_family == AF_UNIX); - ASSERT(!(so->so_state & SS_FADDR_NOXLATE)); + ASSERT(!sti->sti_faddr_noxlate); ASSERT(namelen >= (socklen_t)sizeof (short)); ASSERT(name->sa_family == AF_UNIX); soun = (struct sockaddr_un *)name; @@ -1147,10 +660,10 @@ so_ux_addr_xlate(struct sonode *so, struct sockaddr *name, * 
closed by the time the T_CONN_REQ or T_UNIDATA_REQ reaches the * transport the message will get an error or be dropped. */ - so->so_ux_faddr.soua_vp = vp; - so->so_ux_faddr.soua_magic = SOU_MAGIC_EXPLICIT; - addr = &so->so_ux_faddr; - addrlen = (socklen_t)sizeof (so->so_ux_faddr); + sti->sti_ux_faddr.soua_vp = vp; + sti->sti_ux_faddr.soua_magic = SOU_MAGIC_EXPLICIT; + addr = &sti->sti_ux_faddr; + addrlen = (socklen_t)sizeof (sti->sti_ux_faddr); dprintso(so, 1, ("ux_xlate UNIX: addrlen %d, vp %p\n", addrlen, (void *)vp)); VN_RELE(vp); @@ -2007,8 +1520,6 @@ pr_state(uint_t state, uint_t mode) (void) strcat(buf, "ASYNC "); if (state & SS_ACCEPTCONN) (void) strcat(buf, "ACCEPTCONN "); - if (state & SS_HASCONNIND) - (void) strcat(buf, "HASCONNIND "); if (state & SS_SAVEDEOR) (void) strcat(buf, "SAVEDEOR "); @@ -2021,9 +1532,6 @@ pr_state(uint_t state, uint_t mode) if (state & SS_HADOOBDATA) (void) strcat(buf, "HADOOBDATA "); - if (state & SS_FADDR_NOXLATE) - (void) strcat(buf, "FADDR_NOXLATE "); - if (mode & SM_PRIV) (void) strcat(buf, "PRIV "); if (mode & SM_ATOMIC) @@ -2102,6 +1610,8 @@ pr_addr(int family, struct sockaddr *addr, t_uscalar_t addrlen) int so_verify_oobstate(struct sonode *so) { + boolean_t havemark; + ASSERT(MUTEX_HELD(&so->so_lock)); /* @@ -2120,28 +1630,29 @@ so_verify_oobstate(struct sonode *so) case SS_HADOOBDATA: break; default: - printf("Bad oob state 1 (%p): counts %d/%d state %s\n", - (void *)so, so->so_oobsigcnt, - so->so_oobcnt, pr_state(so->so_state, so->so_mode)); + printf("Bad oob state 1 (%p): state %s\n", + (void *)so, pr_state(so->so_state, so->so_mode)); return (0); } /* SS_RCVATMARK should only be set when SS_OOBPEND is set */ if ((so->so_state & (SS_RCVATMARK|SS_OOBPEND)) == SS_RCVATMARK) { - printf("Bad oob state 2 (%p): counts %d/%d state %s\n", - (void *)so, so->so_oobsigcnt, - so->so_oobcnt, pr_state(so->so_state, so->so_mode)); + printf("Bad oob state 2 (%p): state %s\n", + (void *)so, pr_state(so->so_state, so->so_mode)); return 
(0); } /* - * (so_oobsigcnt != 0 or SS_RCVATMARK) iff SS_OOBPEND + * (havemark != 0 or SS_RCVATMARK) iff SS_OOBPEND + * For TPI, the presence of a "mark" is indicated by sti_oobsigcnt. */ - if (!EQUIV((so->so_oobsigcnt != 0) || (so->so_state & SS_RCVATMARK), + havemark = (SOCK_IS_NONSTR(so)) ? so->so_oobmark > 0 : + SOTOTPI(so)->sti_oobsigcnt > 0; + + if (!EQUIV(havemark || (so->so_state & SS_RCVATMARK), so->so_state & SS_OOBPEND)) { - printf("Bad oob state 3 (%p): counts %d/%d state %s\n", - (void *)so, so->so_oobsigcnt, - so->so_oobcnt, pr_state(so->so_state, so->so_mode)); + printf("Bad oob state 3 (%p): state %s\n", + (void *)so, pr_state(so->so_state, so->so_mode)); return (0); } @@ -2150,21 +1661,23 @@ so_verify_oobstate(struct sonode *so) */ if (!(so->so_options & SO_OOBINLINE) && !EQUIV(so->so_oobmsg != NULL, so->so_state & SS_HAVEOOBDATA)) { - printf("Bad oob state 4 (%p): counts %d/%d state %s\n", - (void *)so, so->so_oobsigcnt, - so->so_oobcnt, pr_state(so->so_state, so->so_mode)); + printf("Bad oob state 4 (%p): state %s\n", + (void *)so, pr_state(so->so_state, so->so_mode)); return (0); } - if (so->so_oobsigcnt < so->so_oobcnt) { + + if (!SOCK_IS_NONSTR(so) && + SOTOTPI(so)->sti_oobsigcnt < SOTOTPI(so)->sti_oobcnt) { printf("Bad oob state 5 (%p): counts %d/%d state %s\n", - (void *)so, so->so_oobsigcnt, - so->so_oobcnt, pr_state(so->so_state, so->so_mode)); + (void *)so, SOTOTPI(so)->sti_oobsigcnt, + SOTOTPI(so)->sti_oobcnt, + pr_state(so->so_state, so->so_mode)); return (0); } + return (1); } #undef EQUIV - #endif /* DEBUG */ /* initialize sockfs zone specific kstat related items */ @@ -2224,8 +1737,8 @@ sockfs_update(kstat_t *ksp, int rw) return (EACCES); } - for (so = socklist.sl_list; so != NULL; so = so->so_next) { - if (so->so_accessvp != NULL && so->so_zoneid == myzoneid) { + for (so = socklist.sl_list; so != NULL; so = SOTOTPI(so)->sti_next_so) { + if (so->so_count != 0 && so->so_zoneid == myzoneid) { nactive++; } } @@ -2243,6 +1756,7 @@ 
sockfs_snapshot(kstat_t *ksp, void *buf, int rw) struct k_sockinfo *pksi; /* where we put sockinfo data */ t_uscalar_t sn_len; /* soa_len */ zoneid_t myzoneid = (zoneid_t)(uintptr_t)ksp->ks_private; + sotpi_info_t *sti; ASSERT((zoneid_t)(uintptr_t)ksp->ks_private == getzoneid()); @@ -2257,9 +1771,10 @@ sockfs_snapshot(kstat_t *ksp, void *buf, int rw) * info into buf, in k_sockinfo format. */ pksi = (struct k_sockinfo *)buf; - for (ns = 0, so = socklist.sl_list; so != NULL; so = so->so_next) { + ns = 0; + for (so = socklist.sl_list; so != NULL; so = SOTOTPI(so)->sti_next_so) { /* only stuff active sonodes and the same zone: */ - if (so->so_accessvp == NULL || so->so_zoneid != myzoneid) { + if (so->so_count == 0 || so->so_zoneid != myzoneid) { continue; } @@ -2271,50 +1786,54 @@ sockfs_snapshot(kstat_t *ksp, void *buf, int rw) break; } + sti = SOTOTPI(so); /* copy important info into buf: */ pksi->ks_si.si_size = sizeof (struct k_sockinfo); pksi->ks_si.si_family = so->so_family; pksi->ks_si.si_type = so->so_type; pksi->ks_si.si_flag = so->so_flag; pksi->ks_si.si_state = so->so_state; - pksi->ks_si.si_serv_type = so->so_serv_type; - pksi->ks_si.si_ux_laddr_sou_magic = so->so_ux_laddr.soua_magic; - pksi->ks_si.si_ux_faddr_sou_magic = so->so_ux_faddr.soua_magic; - pksi->ks_si.si_laddr_soa_len = so->so_laddr.soa_len; - pksi->ks_si.si_faddr_soa_len = so->so_faddr.soa_len; + pksi->ks_si.si_serv_type = sti->sti_serv_type; + pksi->ks_si.si_ux_laddr_sou_magic = + sti->sti_ux_laddr.soua_magic; + pksi->ks_si.si_ux_faddr_sou_magic = + sti->sti_ux_faddr.soua_magic; + pksi->ks_si.si_laddr_soa_len = sti->sti_laddr.soa_len; + pksi->ks_si.si_faddr_soa_len = sti->sti_faddr.soa_len; pksi->ks_si.si_szoneid = so->so_zoneid; + pksi->ks_si.si_faddr_noxlate = sti->sti_faddr_noxlate; mutex_enter(&so->so_lock); - if (so->so_laddr_sa != NULL) { - ASSERT(so->so_laddr_sa->sa_data != NULL); - sn_len = so->so_laddr_len; + if (sti->sti_laddr_sa != NULL) { + ASSERT(sti->sti_laddr_sa->sa_data != 
NULL); + sn_len = sti->sti_laddr_len; ASSERT(sn_len <= sizeof (short) + sizeof (pksi->ks_si.si_laddr_sun_path)); pksi->ks_si.si_laddr_family = - so->so_laddr_sa->sa_family; + sti->sti_laddr_sa->sa_family; if (sn_len != 0) { /* AF_UNIX socket names are NULL terminated */ (void) strncpy(pksi->ks_si.si_laddr_sun_path, - so->so_laddr_sa->sa_data, + sti->sti_laddr_sa->sa_data, sizeof (pksi->ks_si.si_laddr_sun_path)); sn_len = strlen(pksi->ks_si.si_laddr_sun_path); } pksi->ks_si.si_laddr_sun_path[sn_len] = 0; } - if (so->so_faddr_sa != NULL) { - ASSERT(so->so_faddr_sa->sa_data != NULL); - sn_len = so->so_faddr_len; + if (sti->sti_faddr_sa != NULL) { + ASSERT(sti->sti_faddr_sa->sa_data != NULL); + sn_len = sti->sti_faddr_len; ASSERT(sn_len <= sizeof (short) + sizeof (pksi->ks_si.si_faddr_sun_path)); pksi->ks_si.si_faddr_family = - so->so_faddr_sa->sa_family; + sti->sti_faddr_sa->sa_family; if (sn_len != 0) { (void) strncpy(pksi->ks_si.si_faddr_sun_path, - so->so_faddr_sa->sa_data, + sti->sti_faddr_sa->sa_data, sizeof (pksi->ks_si.si_faddr_sun_path)); sn_len = strlen(pksi->ks_si.si_faddr_sun_path); } @@ -2325,9 +1844,9 @@ sockfs_snapshot(kstat_t *ksp, void *buf, int rw) (void) sprintf(pksi->ks_straddr[0], "%p", (void *)so); (void) sprintf(pksi->ks_straddr[1], "%p", - (void *)so->so_ux_laddr.soua_vp); + (void *)sti->sti_ux_laddr.soua_vp); (void) sprintf(pksi->ks_straddr[2], "%p", - (void *)so->so_ux_faddr.soua_vp); + (void *)sti->sti_ux_faddr.soua_vp); ns++; pksi++; @@ -2389,3 +1908,23 @@ out: return (cnt); } } + +int +so_copyin(const void *from, void *to, size_t size, int fromkernel) +{ + if (fromkernel) { + bcopy(from, to, size); + return (0); + } + return (xcopyin(from, to, size)); +} + +int +so_copyout(const void *from, void *to, size_t size, int tokernel) +{ + if (tokernel) { + bcopy(from, to, size); + return (0); + } + return (xcopyout(from, to, size)); +} diff --git a/usr/src/uts/common/fs/sockfs/socksyscalls.c b/usr/src/uts/common/fs/sockfs/socksyscalls.c index 
95f4f5738d..4d0929f39b 100644 --- a/usr/src/uts/common/fs/sockfs/socksyscalls.c +++ b/usr/src/uts/common/fs/sockfs/socksyscalls.c @@ -64,7 +64,10 @@ #include <vm/seg.h> #include <vm/seg_map.h> #include <vm/seg_kpm.h> + #include <fs/sockfs/nl7c.h> +#include <fs/sockfs/sockcommon.h> +#include <fs/sockfs/socktpi.h> #ifdef SOCK_TEST int do_useracc = 1; /* Controlled by setting SO_DEBUG to 4 */ @@ -90,115 +93,39 @@ extern int xnet_truncate_print; * devpath for the kernel to use. */ int -so_socket(int domain, int type, int protocol, char *devpath, int version) +so_socket(int family, int type, int protocol, char *devpath, int version) { - vnode_t *accessvp; struct sonode *so; vnode_t *vp; struct file *fp; int fd; int error; - boolean_t wildcard = B_FALSE; - int saved_error = 0; - int sdomain = domain; - - dprint(1, ("so_socket(%d,%d,%d,%p,%d)\n", - domain, type, protocol, (void *)devpath, version)); - - if (domain == AF_NCA) { - /* - * The request is for an NCA socket so for NL7C use the - * INET domain instead and mark NL7C_AF_NCA below. - */ - domain = AF_INET; - /* - * NL7C is not supported in non-global zones, - * we enforce this restriction here. - */ - if (getzoneid() != GLOBAL_ZONEID) { - return (set_errno(ENOTSUP)); - } - } - - accessvp = solookup(domain, type, protocol, devpath, &error); - if (accessvp == NULL) { - /* - * If there is either an EPROTONOSUPPORT or EPROTOTYPE error - * it makes sense doing the wildcard lookup since the - * protocol might not be in the table. - */ - if (devpath != NULL || protocol == 0 || - !(error == EPROTONOSUPPORT || error == EPROTOTYPE)) - return (set_errno(error)); - saved_error = error; + if (devpath != NULL) { + char *buf; + size_t kdevpathlen = 0; - /* - * Try wildcard lookup. Never use devpath for wildcards. 
- */ - accessvp = solookup(domain, type, 0, NULL, &error); - if (accessvp == NULL) { - /* - * Can't find in kernel table - have library - * fall back to /etc/netconfig and tell us - * the devpath (The library will do this if it didn't - * already pass in a devpath). - */ - if (saved_error != 0) - error = saved_error; + buf = kmem_alloc(MAXPATHLEN, KM_SLEEP); + if ((error = copyinstr(devpath, buf, + MAXPATHLEN, &kdevpathlen)) != 0) { + kmem_free(buf, MAXPATHLEN); return (set_errno(error)); } - wildcard = B_TRUE; - } - - /* Check the device policy */ - if ((error = secpolicy_spec_open(CRED(), - accessvp, FREAD|FWRITE)) != 0) { - return (set_errno(error)); - } - - if (protocol == IPPROTO_SCTP) { - so = sosctp_create(accessvp, domain, type, protocol, version, - NULL, &error); - } else if (protocol == PROTO_SDP) { - so = sosdp_create(accessvp, domain, type, protocol, version, - NULL, &error); + so = socket_create(family, type, protocol, buf, NULL, + SOCKET_SLEEP, version, CRED(), &error); + kmem_free(buf, MAXPATHLEN); } else { - so = sotpi_create(accessvp, domain, type, protocol, version, - NULL, &error); + so = socket_create(family, type, protocol, NULL, NULL, + SOCKET_SLEEP, version, CRED(), &error); } - if (so == NULL) { + if (so == NULL) return (set_errno(error)); - } - if (sdomain == AF_NCA && domain == AF_INET) { - so->so_nl7c_flags = NL7C_AF_NCA; - } - vp = SOTOV(so); - if (wildcard) { - /* - * Issue SO_PROTOTYPE setsockopt. - */ - error = SOP_SETSOCKOPT(so, SOL_SOCKET, SO_PROTOTYPE, - &protocol, - (t_uscalar_t)sizeof (protocol)); - if (error) { - (void) VOP_CLOSE(vp, 0, 1, 0, CRED(), NULL); - VN_RELE(vp); - /* - * Setsockopt often fails with ENOPROTOOPT but socket() - * should fail with EPROTONOSUPPORT/EPROTOTYPE. 
- */ - if (saved_error != 0 && error == ENOPROTOOPT) - error = saved_error; - else - error = EPROTONOSUPPORT; - return (set_errno(error)); - } - } + /* Allocate a file descriptor for the socket */ + vp = SOTOV(so); if (error = falloc(vp, FWRITE|FREAD, &fp, &fd)) { - (void) VOP_CLOSE(vp, 0, 1, 0, CRED(), NULL); - VN_RELE(vp); + (void) socket_close(so, 0, CRED()); + socket_destroy(so); return (set_errno(error)); } @@ -402,6 +329,8 @@ so_socketpair(int sv[2]) int error; struct sockaddr_ux *name; size_t namelen; + sotpi_info_t *sti1; + sotpi_info_t *sti2; dprint(1, ("so_socketpair(%p)\n", (void *)sv)); @@ -425,6 +354,9 @@ so_socketpair(int sv[2]) goto done; } + sti1 = SOTOTPI(so1); + sti2 = SOTOTPI(so2); + /* * The code below makes assumptions about the "sockfs" implementation. * So make sure that the correct implementation is really used. @@ -437,12 +369,12 @@ so_socketpair(int sv[2]) * Bind both sockets and connect them with each other. * Need to allocate name/namelen for soconnect. */ - error = SOP_BIND(so1, NULL, 0, _SOBIND_UNSPEC); + error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC, CRED()); if (error) { eprintsoline(so1, error); goto done; } - error = SOP_BIND(so2, NULL, 0, _SOBIND_UNSPEC); + error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED()); if (error) { eprintsoline(so2, error); goto done; @@ -450,21 +382,21 @@ so_socketpair(int sv[2]) namelen = sizeof (struct sockaddr_ux); name = kmem_alloc(namelen, KM_SLEEP); name->sou_family = AF_UNIX; - name->sou_addr = so2->so_ux_laddr; - error = SOP_CONNECT(so1, + name->sou_addr = sti2->sti_ux_laddr; + error = socket_connect(so1, (struct sockaddr *)name, (socklen_t)namelen, - 0, _SOCONNECT_NOXLATE); + 0, _SOCONNECT_NOXLATE, CRED()); if (error) { kmem_free(name, namelen); eprintsoline(so1, error); goto done; } - name->sou_addr = so1->so_ux_laddr; - error = SOP_CONNECT(so2, + name->sou_addr = sti1->sti_ux_laddr; + error = socket_connect(so2, (struct sockaddr *)name, (socklen_t)namelen, - 0, _SOCONNECT_NOXLATE); + 0, 
_SOCONNECT_NOXLATE, CRED()); kmem_free(name, namelen); if (error) { eprintsoline(so2, error); @@ -487,17 +419,18 @@ so_socketpair(int sv[2]) int nfd; /* - * We could simply call SOP_LISTEN() here (which would do the + * We could simply call socket_listen() here (which would do the * binding automatically) if the code didn't rely on passing - * _SOBIND_NOXLATE to the TPI implementation of SOP_BIND(). + * _SOBIND_NOXLATE to the TPI implementation of socket_bind(). */ - error = SOP_BIND(so1, NULL, 0, _SOBIND_UNSPEC|_SOBIND_NOXLATE| - _SOBIND_LISTEN|_SOBIND_SOCKETPAIR); + error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC| + _SOBIND_NOXLATE|_SOBIND_LISTEN|_SOBIND_SOCKETPAIR, + CRED()); if (error) { eprintsoline(so1, error); goto done; } - error = SOP_BIND(so2, NULL, 0, _SOBIND_UNSPEC); + error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED()); if (error) { eprintsoline(so2, error); goto done; @@ -506,20 +439,19 @@ so_socketpair(int sv[2]) namelen = sizeof (struct sockaddr_ux); name = kmem_alloc(namelen, KM_SLEEP); name->sou_family = AF_UNIX; - name->sou_addr = so1->so_ux_laddr; - error = SOP_CONNECT(so2, + name->sou_addr = sti1->sti_ux_laddr; + error = socket_connect(so2, (struct sockaddr *)name, (socklen_t)namelen, - FNONBLOCK, _SOCONNECT_NOXLATE); + FNONBLOCK, _SOCONNECT_NOXLATE, CRED()); kmem_free(name, namelen); if (error) { if (error != EINPROGRESS) { - eprintsoline(so2, error); - goto done; + eprintsoline(so2, error); goto done; } } - error = SOP_ACCEPT(so1, 0, &nso); + error = socket_accept(so1, 0, CRED(), &nso); if (error) { eprintsoline(so1, error); goto done; @@ -529,17 +461,17 @@ so_socketpair(int sv[2]) mutex_enter(&so2->so_lock); error = sowaitconnected(so2, 0, 1); mutex_exit(&so2->so_lock); - nvp = SOTOV(nso); if (error != 0) { - (void) VOP_CLOSE(nvp, 0, 1, 0, CRED(), NULL); - VN_RELE(nvp); + (void) socket_close(nso, 0, CRED()); + socket_destroy(nso); eprintsoline(so2, error); goto done; } + nvp = SOTOV(nso); if (error = falloc(nvp, FWRITE|FREAD, &nfp, 
&nfd)) { - (void) VOP_CLOSE(nvp, 0, 1, 0, CRED(), NULL); - VN_RELE(nvp); + (void) socket_close(nso, 0, CRED()); + socket_destroy(nso); eprintsoline(nso, error); goto done; } @@ -603,13 +535,13 @@ bind(int sock, struct sockaddr *name, socklen_t namelen, int version) switch (version) { default: - error = SOP_BIND(so, name, namelen, 0); + error = socket_bind(so, name, namelen, 0, CRED()); break; case SOV_XPG4_2: - error = SOP_BIND(so, name, namelen, _SOBIND_XPG4_2); + error = socket_bind(so, name, namelen, _SOBIND_XPG4_2, CRED()); break; case SOV_SOCKBSD: - error = SOP_BIND(so, name, namelen, _SOBIND_SOCKBSD); + error = socket_bind(so, name, namelen, _SOBIND_SOCKBSD, CRED()); break; } done: @@ -635,7 +567,7 @@ listen(int sock, int backlog, int version) if ((so = getsonode(sock, &error, NULL)) == NULL) return (set_errno(error)); - error = SOP_LISTEN(so, backlog); + error = socket_listen(so, backlog, CRED()); releasef(sock); if (error) @@ -655,6 +587,8 @@ accept(int sock, struct sockaddr *name, socklen_t *namelenp, int version) struct vnode *nvp; struct file *nfp; int nfd; + struct sockaddr *addrp; + socklen_t addrlen; dprint(1, ("accept(%d, %p, %p)\n", sock, (void *)name, (void *)namelenp)); @@ -681,15 +615,15 @@ accept(int sock, struct sockaddr *name, socklen_t *namelenp, int version) } /* - * Allocate the user fd before SOP_ACCEPT() in order to - * catch EMFILE errors before calling SOP_ACCEPT(). + * Allocate the user fd before socket_accept() in order to + * catch EMFILE errors before calling socket_accept(). */ if ((nfd = ufalloc(0)) == -1) { eprintsoline(so, EMFILE); releasef(sock); return (set_errno(EMFILE)); } - error = SOP_ACCEPT(so, fp->f_flag, &nso); + error = socket_accept(so, fp->f_flag, CRED(), &nso); releasef(sock); if (error) { setf(nfd, NULL); @@ -698,34 +632,32 @@ accept(int sock, struct sockaddr *name, socklen_t *namelenp, int version) nvp = SOTOV(nso); - /* - * so_faddr_sa can not go away even though we are not holding so_lock. 
- * However, in theory its content could change from underneath us. - * But this is not possible in practice since it can only - * change due to either some socket system call - * or due to a T_CONN_CON being received from the stream head. - * Since the falloc/setf have not yet been done no thread - * can do any system call on nso and T_CONN_CON can not arrive - * on a socket that is already connected. - * Thus there is no reason to hold so_lock here. - * - * SOP_ACCEPT() is required to have set the valid bit for the faddr, - * but it could be instantly cleared by a disconnect from the transport. - * For that reason we ignore it here. - */ ASSERT(MUTEX_NOT_HELD(&nso->so_lock)); - error = copyout_name(name, namelen, namelenp, - nso->so_faddr_sa, (socklen_t)nso->so_faddr_len); + if (namelen != 0) { + addrlen = so->so_max_addr_len; + addrp = (struct sockaddr *)kmem_alloc(addrlen, KM_SLEEP); + + if ((error = socket_getpeername(nso, (struct sockaddr *)addrp, + &addrlen, B_TRUE, CRED())) == 0) { + error = copyout_name(name, namelen, namelenp, + addrp, addrlen); + } else { + ASSERT(error == EINVAL || error == ENOTCONN); + error = ECONNABORTED; + } + kmem_free(addrp, so->so_max_addr_len); + } + if (error) { setf(nfd, NULL); - (void) VOP_CLOSE(nvp, 0, 1, 0, CRED(), NULL); - VN_RELE(nvp); + (void) socket_close(nso, 0, CRED()); + socket_destroy(nso); return (set_errno(error)); } if (error = falloc(NULL, FWRITE|FREAD, &nfp, NULL)) { setf(nfd, NULL); - (void) VOP_CLOSE(nvp, 0, 1, 0, CRED(), NULL); - VN_RELE(nvp); + (void) socket_close(nso, 0, CRED()); + socket_destroy(nso); eprintsoline(so, error); return (set_errno(error)); } @@ -790,8 +722,8 @@ connect(int sock, struct sockaddr *name, socklen_t namelen, int version) } else name = NULL; - error = SOP_CONNECT(so, name, namelen, fp->f_flag, - (version != SOV_XPG4_2) ? 0 : _SOCONNECT_XPG4_2); + error = socket_connect(so, name, namelen, fp->f_flag, + (version != SOV_XPG4_2) ? 
0 : _SOCONNECT_XPG4_2, CRED()); releasef(sock); if (name) kmem_free(name, (size_t)namelen); @@ -813,7 +745,7 @@ shutdown(int sock, int how, int version) if ((so = getsonode(sock, &error, NULL)) == NULL) return (set_errno(error)); - error = SOP_SHUTDOWN(so, how); + error = socket_shutdown(so, how, CRED()); releasef(sock); if (error) @@ -857,13 +789,12 @@ recvit(int sock, msg->msg_flags = flags & (MSG_OOB | MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_XPG4_2); - error = SOP_RECVMSG(so, msg, uiop); + error = socket_recvmsg(so, msg, uiop, CRED()); if (error) { releasef(sock); return (set_errno(error)); } lwp_stat_update(LWP_STAT_MSGRCV, 1); - so_update_attrs(so, SOACC); releasef(sock); error = copyout_name(name, namelen, namelenp, @@ -1198,7 +1129,7 @@ sendit(int sock, struct nmsghdr *msg, struct uio *uiop, int flags) len = uiop->uio_resid; msg->msg_flags = flags; - error = SOP_SENDMSG(so, msg, uiop); + error = socket_sendmsg(so, msg, uiop, CRED()); done1: if (control != NULL) kmem_free(control, controllen); @@ -1211,7 +1142,6 @@ done3: return (set_errno(error)); } lwp_stat_update(LWP_STAT_MSGSND, 1); - so_update_attrs(so, SOMOD); releasef(sock); return (len - uiop->uio_resid); } @@ -1413,12 +1343,8 @@ getpeername(int sock, struct sockaddr *name, socklen_t *namelenp, int version) struct sonode *so; int error; socklen_t namelen; - union { - struct sockaddr_in sin; - struct sockaddr_in6 sin6; - } sin; /* Temporary buffer, common case */ - void *addr; /* Temporary buffer, uncommon case */ - socklen_t addrlen, size; + socklen_t sock_addrlen; + struct sockaddr *sock_addrp; dprint(1, ("getpeername(%d, %p, %p)\n", sock, (void *)name, (void *)namelenp)); @@ -1432,44 +1358,16 @@ getpeername(int sock, struct sockaddr *name, socklen_t *namelenp, int version) error = EFAULT; goto rel_out; } - /* - * If a connect or accept has been done, unless we're an Xnet socket, - * the remote address has already been updated in so_faddr_sa. 
- */ - if (so->so_version != SOV_SOCKSTREAM && so->so_version != SOV_SOCKBSD || - !(so->so_state & SS_FADDR_VALID)) { - if ((error = SOP_GETPEERNAME(so)) != 0) - goto rel_out; - } + sock_addrlen = so->so_max_addr_len; + sock_addrp = (struct sockaddr *)kmem_alloc(sock_addrlen, KM_SLEEP); - if (so->so_faddr_maxlen <= sizeof (sin)) { - size = 0; - addr = &sin; - } else { - /* - * Allocate temporary to avoid holding so_lock across - * copyout - */ - size = so->so_faddr_maxlen; - addr = kmem_alloc(size, KM_SLEEP); + if ((error = socket_getpeername(so, sock_addrp, &sock_addrlen, + B_FALSE, CRED())) == 0) { + ASSERT(sock_addrlen <= so->so_max_addr_len); + error = copyout_name(name, namelen, namelenp, + (void *)sock_addrp, sock_addrlen); } - /* Prevent so_faddr_sa/len from changing while accessed */ - mutex_enter(&so->so_lock); - if (!(so->so_state & SS_ISCONNECTED)) { - mutex_exit(&so->so_lock); - error = ENOTCONN; - goto free_out; - } - addrlen = so->so_faddr_len; - bcopy(so->so_faddr_sa, addr, addrlen); - mutex_exit(&so->so_lock); - - ASSERT(MUTEX_NOT_HELD(&so->so_lock)); - error = copyout_name(name, namelen, namelenp, addr, - (so->so_state & SS_FADDR_NOXLATE) ? 0 : addrlen); -free_out: - if (size != 0) - kmem_free(addr, size); + kmem_free(sock_addrp, so->so_max_addr_len); rel_out: releasef(sock); bad: return (error != 0 ? 
set_errno(error) : 0); @@ -1482,13 +1380,8 @@ getsockname(int sock, struct sockaddr *name, { struct sonode *so; int error; - socklen_t namelen; - union { - struct sockaddr_in sin; - struct sockaddr_in6 sin6; - } sin; /* Temporary buffer, common case */ - void *addr; /* Temporary buffer, uncommon case */ - socklen_t addrlen, size; + socklen_t namelen, sock_addrlen; + struct sockaddr *sock_addrp; dprint(1, ("getsockname(%d, %p, %p)\n", sock, (void *)name, (void *)namelenp)); @@ -1503,39 +1396,16 @@ getsockname(int sock, struct sockaddr *name, goto rel_out; } - /* - * If a bind or accept has been done, unless we're an Xnet endpoint, - * the local address has already been updated in so_laddr_sa. - */ - if ((so->so_version != SOV_SOCKSTREAM && - so->so_version != SOV_SOCKBSD) || - !(so->so_state & SS_LADDR_VALID)) { - if ((error = SOP_GETSOCKNAME(so)) != 0) - goto rel_out; - } - - if (so->so_laddr_maxlen <= sizeof (sin)) { - size = 0; - addr = &sin; - } else { - /* - * Allocate temporary to avoid holding so_lock across - * copyout - */ - size = so->so_laddr_maxlen; - addr = kmem_alloc(size, KM_SLEEP); + sock_addrlen = so->so_max_addr_len; + sock_addrp = (struct sockaddr *)kmem_alloc(sock_addrlen, KM_SLEEP); + if ((error = socket_getsockname(so, sock_addrp, &sock_addrlen, + CRED())) == 0) { + ASSERT(MUTEX_NOT_HELD(&so->so_lock)); + ASSERT(sock_addrlen <= so->so_max_addr_len); + error = copyout_name(name, namelen, namelenp, + (void *)sock_addrp, sock_addrlen); } - /* Prevent so_laddr_sa/len from changing while accessed */ - mutex_enter(&so->so_lock); - addrlen = so->so_laddr_len; - bcopy(so->so_laddr_sa, addr, addrlen); - mutex_exit(&so->so_lock); - - ASSERT(MUTEX_NOT_HELD(&so->so_lock)); - error = copyout_name(name, namelen, namelenp, - addr, addrlen); - if (size != 0) - kmem_free(addr, size); + kmem_free(sock_addrp, so->so_max_addr_len); rel_out: releasef(sock); bad: return (error != 0 ? 
set_errno(error) : 0); @@ -1577,8 +1447,9 @@ getsockopt(int sock, } optval = kmem_alloc(optlen, KM_SLEEP); optlen_res = optlen; - error = SOP_GETSOCKOPT(so, level, option_name, optval, - &optlen_res, (version != SOV_XPG4_2) ? 0 : _SOGETSOCKOPT_XPG4_2); + error = socket_getsockopt(so, level, option_name, optval, + &optlen_res, (version != SOV_XPG4_2) ? 0 : _SOGETSOCKOPT_XPG4_2, + CRED()); releasef(sock); if (error) { kmem_free(optval, optlen); @@ -1633,8 +1504,8 @@ setsockopt(int sock, } else option_len = 0; - error = SOP_SETSOCKOPT(so, level, option_name, optval, - (t_uscalar_t)option_len); + error = socket_setsockopt(so, level, option_name, optval, + (t_uscalar_t)option_len, CRED()); done1: if (optval != buffer) kmem_free(optval, (size_t)option_len); @@ -1646,51 +1517,140 @@ done2: } /* - * Add config info when devpath is non-NULL; delete info when devpath is NULL. - * devpath is a user address. + * Add config info when name is non-NULL; delete info when name is NULL. + * name could be a device name or a module name and are user address. */ int -sockconfig(int domain, int type, int protocol, char *devpath) +sockconfig(int family, int type, int protocol, char *name) { - char *kdevpath; /* Copied in devpath string */ - size_t kdevpathlen; + char *kdevpath = NULL; /* Copied in devpath string */ + char *kmodule = NULL; + size_t pathlen = 0; int error = 0; dprint(1, ("sockconfig(%d, %d, %d, %p)\n", - domain, type, protocol, (void *)devpath)); + family, type, protocol, (void *)name)); if (secpolicy_net_config(CRED(), B_FALSE) != 0) return (set_errno(EPERM)); - if (devpath == NULL) { - /* Deleting an entry */ - kdevpath = NULL; - kdevpathlen = 0; - } else { + /* + * By default set the kdevpath and kmodule to NULL to delete an entry. + * Otherwise when name is not NULL, set the kdevpath or kmodule + * value to add an entry. + */ + if (name != NULL) { /* * Adding an entry. - * Copyin the devpath. + * Copyin the name. 
* This also makes it possible to check for too long pathnames. - * Compress the space needed for the devpath before passing it + * Compress the space needed for the name before passing it * to soconfig - soconfig will store the string until * the configuration is removed. */ char *buf; - buf = kmem_alloc(MAXPATHLEN, KM_SLEEP); - if ((error = copyinstr(devpath, buf, MAXPATHLEN, - &kdevpathlen)) != 0) { + if ((error = copyinstr(name, buf, MAXPATHLEN, &pathlen)) != 0) { kmem_free(buf, MAXPATHLEN); goto done; } + if (strncmp(buf, "/dev", strlen("/dev")) == 0) { + /* For device */ - kdevpath = kmem_alloc(kdevpathlen, KM_SLEEP); - bcopy(buf, kdevpath, kdevpathlen); - kdevpath[kdevpathlen - 1] = '\0'; + /* + * Special handling for NCA: + * + * DEV_NCA is never opened even if an application + * requests for AF_NCA. The device opened is instead a + * predefined AF_INET transport (NCA_INET_DEV). + * + * Prior to Volo (PSARC/2007/587) NCA would determine + * the device using a lookup, which worked then because + * all protocols were based on TPI. Since TPI is no + * longer the default, we have to explicitly state + * which device to use. 
+ */ + if (strcmp(buf, NCA_DEV) == 0) { + /* only support entry <28, 2, 0> */ + if (family != AF_NCA || type != SOCK_STREAM || + protocol != 0) { + kmem_free(buf, MAXPATHLEN); + error = EINVAL; + goto done; + } + + pathlen = strlen(NCA_INET_DEV) + 1; + kdevpath = kmem_alloc(pathlen, KM_SLEEP); + bcopy(NCA_INET_DEV, kdevpath, pathlen); + kdevpath[pathlen - 1] = '\0'; + } else { + kdevpath = kmem_alloc(pathlen, KM_SLEEP); + bcopy(buf, kdevpath, pathlen); + kdevpath[pathlen - 1] = '\0'; + } + } else { + /* For socket module */ + kmodule = kmem_alloc(pathlen, KM_SLEEP); + bcopy(buf, kmodule, pathlen); + kmodule[pathlen - 1] = '\0'; + + pathlen = 0; + if (strcmp(kmodule, "tcp") == 0) { + /* Get the tcp device name for fallback */ + if (family == 2) { + pathlen = strlen("/dev/tcp") + 1; + kdevpath = kmem_alloc(pathlen, + KM_SLEEP); + bcopy("/dev/tcp", kdevpath, + pathlen); + kdevpath[pathlen - 1] = '\0'; + } else { + ASSERT(family == 26); + pathlen = strlen("/dev/tcp6") + 1; + kdevpath = kmem_alloc(pathlen, + KM_SLEEP); + bcopy("/dev/tcp6", kdevpath, pathlen); + kdevpath[pathlen - 1] = '\0'; + } + } else if (strcmp(kmodule, "udp") == 0) { + /* Get the udp device name for fallback */ + if (family == 2) { + pathlen = strlen("/dev/udp") + 1; + kdevpath = kmem_alloc(pathlen, + KM_SLEEP); + bcopy("/dev/udp", kdevpath, pathlen); + kdevpath[pathlen - 1] = '\0'; + } else { + ASSERT(family == 26); + pathlen = strlen("/dev/udp6") + 1; + kdevpath = kmem_alloc(pathlen, + KM_SLEEP); + bcopy("/dev/udp6", kdevpath, pathlen); + kdevpath[pathlen - 1] = '\0'; + } + } else if (strcmp(kmodule, "icmp") == 0) { + /* Get the icmp device name for fallback */ + if (family == 2) { + pathlen = strlen("/dev/rawip") + 1; + kdevpath = kmem_alloc(pathlen, + KM_SLEEP); + bcopy("/dev/rawip", kdevpath, pathlen); + kdevpath[pathlen - 1] = '\0'; + } else { + ASSERT(family == 26); + pathlen = strlen("/dev/rawip6") + 1; + kdevpath = kmem_alloc(pathlen, + KM_SLEEP); + bcopy("/dev/rawip6", kdevpath, pathlen); 
+ kdevpath[pathlen - 1] = '\0'; + } + } + } kmem_free(buf, MAXPATHLEN); } - error = soconfig(domain, type, protocol, kdevpath, (int)kdevpathlen); + error = soconfig(family, type, protocol, kdevpath, (int)pathlen, + kmodule); done: if (error) { eprintline(error); @@ -1961,9 +1921,15 @@ snf_async_read(snf_req_t *sr) */ so = VTOSO(vp); stp = vp->v_stream; - wroff = (int)(stp->sd_wroff); - maxblk = (int)(stp->sd_maxblk); - extra = wroff + (int)(stp->sd_tail); + if (stp == NULL) { + wroff = so->so_proto_props.sopp_wroff; + maxblk = so->so_proto_props.sopp_maxblk; + extra = wroff + so->so_proto_props.sopp_tail; + } else { + wroff = (int)(stp->sd_wroff); + maxblk = (int)(stp->sd_maxblk); + extra = wroff + (int)(stp->sd_tail); + } } while ((size != 0) && (sr->sr_write_error == 0)) { @@ -1975,7 +1941,8 @@ snf_async_read(snf_req_t *sr) * need to adjust the size to the maximum * SSL record size set in the stream head. */ - if (vp->v_type == VSOCK && so->so_kssl_ctx != NULL) + if (vp->v_type == VSOCK && !SOCK_IS_NONSTR(so) && + SOTOTPI(so)->sti_kssl_ctx != NULL) iosize = (int)MIN(iosize, maxblk); if ((mp = allocb(iosize + extra, BPRI_MED)) == NULL) { @@ -2066,7 +2033,7 @@ create_thread(int operation, struct vnode *vp, file_t *fp, * store sd_qn_maxpsz into sr_maxpsz while we have stream head. * stream might be closed before thread returns from snf_async_read. 
*/ - if (stp->sd_qn_maxpsz > 0) { + if (stp != NULL && stp->sd_qn_maxpsz > 0) { sr->sr_maxpsz = MIN(MAXBSIZE, stp->sd_qn_maxpsz); } else { sr->sr_maxpsz = MAXBSIZE; @@ -2115,9 +2082,11 @@ snf_direct_io(file_t *fp, file_t *rfp, u_offset_t fileoff, u_offset_t size, short fflag; struct vnode *vp; int ksize; + struct nmsghdr msg; ksize = 0; *count = 0; + bzero(&msg, sizeof (msg)); vp = fp->f_vnode; fflag = fp->f_flag; @@ -2138,8 +2107,11 @@ snf_direct_io(file_t *fp, file_t *rfp, u_offset_t fileoff, u_offset_t size, } iosize = MBLKL(mp); - if ((error = kstrwritemp(vp, mp, fflag)) != 0) { - freeb(mp); + error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp); + + if (error != 0) { + if (mp != NULL) + freeb(mp); break; } ksize += iosize; @@ -2233,10 +2205,13 @@ snf_segmap(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t size, snf_smap_desbinfo *snfi; struct vattr va; boolean_t dowait = B_FALSE; + struct nmsghdr msg; vp = fp->f_vnode; fflag = fp->f_flag; ksize = 0; + bzero(&msg, sizeof (msg)); + for (;;) { if (ISSIG(curthread, JUSTLOOKING)) { error = EINTR; @@ -2307,9 +2282,11 @@ snf_segmap(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t size, mp->b_datap->db_struioflag |= STRUIO_ZCNOTIFY; } VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL); - if ((error = kstrwritemp(vp, mp, fflag)) != 0) { + error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp); + if (error != 0) { *count = ksize; - freemsg(mp); + if (mp != NULL) + freemsg(mp); return (error); } ksize += iosize; @@ -2335,16 +2312,22 @@ done: stdata_t *stp; stp = vp->v_stream; - mutex_enter(&stp->sd_lock); - while (!(stp->sd_flag & STZCNOTIFY)) { - if (cv_wait_sig(&stp->sd_zcopy_wait, - &stp->sd_lock) == 0) { - error = EINTR; - break; + if (stp == NULL) { + struct sonode *so; + so = VTOSO(vp); + error = so_zcopy_wait(so); + } else { + mutex_enter(&stp->sd_lock); + while (!(stp->sd_flag & STZCNOTIFY)) { + if (cv_wait_sig(&stp->sd_zcopy_wait, + &stp->sd_lock) == 0) { + error = EINTR; + break; + } } + 
stp->sd_flag &= ~STZCNOTIFY; + mutex_exit(&stp->sd_lock); } - stp->sd_flag &= ~STZCNOTIFY; - mutex_exit(&stp->sd_lock); } return (error); } @@ -2367,6 +2350,7 @@ snf_cache(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t size, int maxblk = 0; int wroff = 0; struct sonode *so; + struct nmsghdr msg; vp = fp->f_vnode; if (vp->v_type == VSOCK) { @@ -2377,11 +2361,17 @@ snf_cache(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t size, */ so = VTOSO(vp); stp = vp->v_stream; - wroff = (int)(stp->sd_wroff); - maxblk = (int)(stp->sd_maxblk); - extra = wroff + (int)(stp->sd_tail); + if (stp == NULL) { + wroff = so->so_proto_props.sopp_wroff; + maxblk = so->so_proto_props.sopp_maxblk; + extra = wroff + so->so_proto_props.sopp_tail; + } else { + wroff = (int)(stp->sd_wroff); + maxblk = (int)(stp->sd_maxblk); + extra = wroff + (int)(stp->sd_tail); + } } - + bzero(&msg, sizeof (msg)); fflag = fp->f_flag; ksize = 0; auio.uio_iov = &aiov; @@ -2406,7 +2396,8 @@ snf_cache(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t size, * need to adjust the size to the maximum * SSL record size set in the stream head. 
*/ - if (vp->v_type == VSOCK && so->so_kssl_ctx != NULL) + if (vp->v_type == VSOCK && !SOCK_IS_NONSTR(so) && + SOTOTPI(so)->sti_kssl_ctx != NULL) iosize = (int)MIN(iosize, maxblk); if ((mp = allocb(iosize + extra, BPRI_MED)) == NULL) { @@ -2434,9 +2425,13 @@ snf_cache(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t size, mp->b_wptr = mp->b_rptr + iosize; VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL); - if ((error = kstrwritemp(vp, mp, fflag)) != 0) { + + error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp); + + if (error != 0) { *count = ksize; - freeb(mp); + if (mp != NULL) + freeb(mp); return (error); } ksize += iosize; @@ -2540,14 +2535,17 @@ sosendfile64(file_t *fp, file_t *rfp, const struct ksendfilevec64 *sfv, if (sfv_len >= MAXBSIZE && (sfv_len >= (va_size >> 1) || (sfv->sfv_flag & SFV_NOWAIT) || sfv_len >= 0x1000000) && !vn_has_flocks(fvp) && !(fvp->v_flag & VNOMAP)) { - if ((stp->sd_copyflag & (STZCVMSAFE|STZCVMUNSAFE)) == 0) { + uint_t copyflag; + copyflag = stp != NULL ? 
stp->sd_copyflag : + VTOSO(vp)->so_proto_props.sopp_zcopyflag; + if ((copyflag & (STZCVMSAFE|STZCVMUNSAFE)) == 0) { int on = 1; - if (SOP_SETSOCKOPT(VTOSO(vp), SOL_SOCKET, - SO_SND_COPYAVOID, &on, sizeof (on)) == 0) + if (socket_setsockopt(VTOSO(vp), SOL_SOCKET, + SO_SND_COPYAVOID, &on, sizeof (on), CRED()) == 0) dozcopy = B_TRUE; } else { - dozcopy = (stp->sd_copyflag & STZCVMSAFE); + dozcopy = copyflag & STZCVMSAFE; } } if (dozcopy) { @@ -2555,10 +2553,19 @@ sosendfile64(file_t *fp, file_t *rfp, const struct ksendfilevec64 *sfv, error = snf_segmap(fp, fvp, sfv_off, (u_offset_t)sfv_len, &count, ((sfv->sfv_flag & SFV_NOWAIT) != 0)); } else { - if (stp->sd_qn_maxpsz == INFPSZ) + if (vp->v_type == VSOCK && stp == NULL) { + sonode_t *so = VTOSO(vp); + maxpsz = so->so_proto_props.sopp_maxpsz; + } else if (stp != NULL) { + maxpsz = stp->sd_qn_maxpsz; + } else { + maxpsz = maxphys; + } + + if (maxpsz == INFPSZ) maxpsz = maxphys; else - maxpsz = roundup(stp->sd_qn_maxpsz, MAXBSIZE); + maxpsz = roundup(maxpsz, MAXBSIZE); sf_stats.ss_file_cached++; error = snf_cache(fp, fvp, sfv_off, (u_offset_t)sfv_len, maxpsz, &count); @@ -2613,7 +2620,7 @@ sendto32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags, int soaccept(struct sonode *so, int fflag, struct sonode **nsop) { - return (SOP_ACCEPT(so, fflag, nsop)); + return (socket_accept(so, fflag, CRED(), nsop)); } int @@ -2622,9 +2629,9 @@ sobind(struct sonode *so, struct sockaddr *name, socklen_t namelen, { int error; - error = SOP_BIND(so, name, namelen, flags); + error = socket_bind(so, name, namelen, flags, CRED()); if (error == 0 && backlog != 0) - return (SOP_LISTEN(so, backlog)); + return (socket_listen(so, backlog, CRED())); return (error); } @@ -2632,59 +2639,48 @@ sobind(struct sonode *so, struct sockaddr *name, socklen_t namelen, int solisten(struct sonode *so, int backlog) { - return (SOP_LISTEN(so, backlog)); + return (socket_listen(so, backlog, CRED())); } int soconnect(struct sonode *so, const struct 
sockaddr *name, socklen_t namelen, int fflag, int flags) { - return (SOP_CONNECT(so, name, namelen, fflag, flags)); + return (socket_connect(so, name, namelen, fflag, flags, CRED())); } int sorecvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) { - return (SOP_RECVMSG(so, msg, uiop)); + return (socket_recvmsg(so, msg, uiop, CRED())); } int sosendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) { - return (SOP_SENDMSG(so, msg, uiop)); -} - -int -sogetpeername(struct sonode *so) -{ - return (SOP_GETPEERNAME(so)); -} - -int -sogetsockname(struct sonode *so) -{ - return (SOP_GETSOCKNAME(so)); + return (socket_sendmsg(so, msg, uiop, CRED())); } int soshutdown(struct sonode *so, int how) { - return (SOP_SHUTDOWN(so, how)); + return (socket_shutdown(so, how, CRED())); } int sogetsockopt(struct sonode *so, int level, int option_name, void *optval, socklen_t *optlenp, int flags) { - return (SOP_GETSOCKOPT(so, level, option_name, optval, optlenp, - flags)); + return (socket_getsockopt(so, level, option_name, optval, optlenp, + flags, CRED())); } int sosetsockopt(struct sonode *so, int level, int option_name, const void *optval, t_uscalar_t optlen) { - return (SOP_SETSOCKOPT(so, level, option_name, optval, optlen)); + return (socket_setsockopt(so, level, option_name, optval, optlen, + CRED())); } /* @@ -2692,9 +2688,25 @@ sosetsockopt(struct sonode *so, int level, int option_name, const void *optval, * able to handle the creation of TPI sockfs sockets. 
*/ struct sonode * -socreate(vnode_t *accessvp, int domain, int type, int protocol, int version, - struct sonode *tso, int *errorp) +socreate(struct sockparams *sp, int family, int type, int protocol, int version, + int *errorp) { - return (sotpi_create(accessvp, domain, type, protocol, version, tso, - errorp)); + struct sonode *so; + + ASSERT(sp != NULL); + + so = sp->sp_smod_info->smod_sock_create_func(sp, family, type, protocol, + version, SOCKET_SLEEP, errorp, CRED()); + if (so == NULL) { + SOCKPARAMS_DEC_REF(sp); + } else { + if ((*errorp = SOP_INIT(so, NULL, CRED(), SOCKET_SLEEP)) == 0) { + /* Cannot fail, only bumps so_count */ + (void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, CRED(), NULL); + } else { + socket_destroy(so); + so = NULL; + } + } + return (so); } diff --git a/usr/src/uts/common/fs/sockfs/socktpi.c b/usr/src/uts/common/fs/sockfs/socktpi.c index f27c34578b..01873727f8 100644 --- a/usr/src/uts/common/fs/sockfs/socktpi.c +++ b/usr/src/uts/common/fs/sockfs/socktpi.c @@ -32,6 +32,7 @@ #include <sys/conf.h> #include <sys/cred.h> #include <sys/kmem.h> +#include <sys/kmem_impl.h> #include <sys/sysmacros.h> #include <sys/vfs.h> #include <sys/vnode.h> @@ -45,6 +46,7 @@ #include <sys/stream.h> #include <sys/strsubr.h> #include <sys/strsun.h> +#include <sys/suntpi.h> #include <sys/ddi.h> #include <sys/esunddi.h> #include <sys/flock.h> @@ -81,6 +83,10 @@ #include <inet/kssl/ksslapi.h> +#include <fs/sockfs/sockcommon.h> +#include <fs/sockfs/socktpi.h> +#include <fs/sockfs/socktpi_impl.h> + /* * Possible failures when memory can't be allocated. 
The documented behavior: * @@ -170,13 +176,29 @@ int xnet_skip_checks = 0; int xnet_check_print = 0; int xnet_truncate_print = 0; +static void sotpi_destroy(struct sonode *); +static struct sonode *sotpi_create(struct sockparams *, int, int, int, int, + int, int *, cred_t *cr); + +static boolean_t sotpi_info_create(struct sonode *, int); +static void sotpi_info_init(struct sonode *); +static void sotpi_info_fini(struct sonode *); +static void sotpi_info_destroy(struct sonode *); + +/* + * Do direct function call to the transport layer below; this would + * also allow the transport to utilize read-side synchronous stream + * interface if necessary. This is a /etc/system tunable that must + * not be modified on a running system. By default this is enabled + * for performance reasons and may be disabled for debugging purposes. + */ +boolean_t socktpi_direct = B_TRUE; + +static struct kmem_cache *socktpi_cache, *socktpi_unix_cache; + extern void sigintr(k_sigset_t *, int); extern void sigunintr(k_sigset_t *); -extern void *nl7c_lookup_addr(void *, t_uscalar_t); -extern void *nl7c_add_addr(void *, t_uscalar_t); -extern void nl7c_listener_addr(void *, struct sonode *); - /* Sockets acting as an in-kernel SSL proxy */ extern mblk_t *strsock_kssl_input(vnode_t *, mblk_t *, strwakeup_t *, strsigset_t *, strsigset_t *, strpollset_t *); @@ -189,62 +211,198 @@ extern int sodput(sodirect_t *, mblk_t *); extern void sodwakeup(sodirect_t *); /* TPI sockfs sonode operations */ -static int sotpi_accept(struct sonode *, int, struct sonode **); -static int sotpi_bind(struct sonode *, struct sockaddr *, socklen_t, +int sotpi_init(struct sonode *, struct sonode *, struct cred *, int); +static int sotpi_accept(struct sonode *, int, struct cred *, + struct sonode **); +static int sotpi_bind(struct sonode *, struct sockaddr *, socklen_t, + int, struct cred *); +static int sotpi_listen(struct sonode *, int, struct cred *); static int sotpi_connect(struct sonode *, const struct sockaddr *, 
- socklen_t, int, int); -static int sotpi_listen(struct sonode *, int); + socklen_t, int, int, struct cred *); +extern int sotpi_recvmsg(struct sonode *, struct nmsghdr *, + struct uio *, struct cred *); static int sotpi_sendmsg(struct sonode *, struct nmsghdr *, - struct uio *); -static int sotpi_shutdown(struct sonode *, int); -static int sotpi_getsockname(struct sonode *); + struct uio *, struct cred *); +static int sotpi_sendmblk(struct sonode *, struct nmsghdr *, int, + struct cred *, mblk_t **); static int sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t, struct uio *, void *, t_uscalar_t, int); static int sodgram_direct(struct sonode *, struct sockaddr *, socklen_t, struct uio *, int); +extern int sotpi_getpeername(struct sonode *, struct sockaddr *, + socklen_t *, boolean_t, struct cred *); +static int sotpi_getsockname(struct sonode *, struct sockaddr *, + socklen_t *, struct cred *); +static int sotpi_shutdown(struct sonode *, int, struct cred *); +extern int sotpi_getsockopt(struct sonode *, int, int, void *, + socklen_t *, int, struct cred *); +extern int sotpi_setsockopt(struct sonode *, int, int, const void *, + socklen_t, struct cred *); +static int sotpi_ioctl(struct sonode *, int, intptr_t, int, struct cred *, + int32_t *); +static int socktpi_plumbioctl(struct vnode *, int, intptr_t, int, + struct cred *, int32_t *); +static int sotpi_poll(struct sonode *, short, int, short *, + struct pollhead **); +static int sotpi_close(struct sonode *, int, struct cred *); + +static int i_sotpi_info_constructor(sotpi_info_t *); +static void i_sotpi_info_destructor(sotpi_info_t *); sonodeops_t sotpi_sonodeops = { + sotpi_init, /* sop_init */ sotpi_accept, /* sop_accept */ sotpi_bind, /* sop_bind */ sotpi_listen, /* sop_listen */ sotpi_connect, /* sop_connect */ sotpi_recvmsg, /* sop_recvmsg */ sotpi_sendmsg, /* sop_sendmsg */ + sotpi_sendmblk, /* sop_sendmblk */ sotpi_getpeername, /* sop_getpeername */ sotpi_getsockname, /* sop_getsockname */ 
sotpi_shutdown, /* sop_shutdown */ sotpi_getsockopt, /* sop_getsockopt */ - sotpi_setsockopt /* sop_setsockopt */ + sotpi_setsockopt, /* sop_setsockopt */ + sotpi_ioctl, /* sop_ioctl */ + sotpi_poll, /* sop_poll */ + sotpi_close, /* sop_close */ }; /* + * Return a TPI socket vnode. + * + * Note that sockets assume that the driver will clone (either itself + * or by using the clone driver) i.e. a socket() call will always + * result in a new vnode being created. + */ + +/* * Common create code for socket and accept. If tso is set the values * from that node is used instead of issuing a T_INFO_REQ. - * - * Assumes that the caller has a VN_HOLD on accessvp. - * The VN_RELE will occur either when sotpi_create() fails or when - * the returned sonode is freed. */ -struct sonode * -sotpi_create(vnode_t *accessvp, int domain, int type, int protocol, int version, - struct sonode *tso, int *errorp) + +/* ARGSUSED */ +static struct sonode * +sotpi_create(struct sockparams *sp, int family, int type, int protocol, + int version, int sflags, int *errorp, cred_t *cr) { struct sonode *so; - vnode_t *vp; - int flags, error; + kmem_cache_t *cp; + int sfamily = family; - ASSERT(accessvp != NULL); - vp = makesockvp(accessvp, domain, type, protocol); - ASSERT(vp != NULL); - so = VTOSO(vp); + ASSERT(sp->sp_sdev_info.sd_vnode != NULL); + + if (family == AF_NCA) { + /* + * The request is for an NCA socket so for NL7C use the + * INET domain instead and mark NL7C_AF_NCA below. + */ + family = AF_INET; + /* + * NL7C is not supported in the non-global zone, + * we enforce this restriction here. + */ + if (getzoneid() != GLOBAL_ZONEID) { + *errorp = ENOTSUP; + return (NULL); + } + } + + /* + * to be compatible with old tpi socket implementation ignore + * sleep flag (sflags) passed in + */ + cp = (family == AF_UNIX) ? 
socktpi_unix_cache : socktpi_cache; + so = kmem_cache_alloc(cp, KM_SLEEP); + if (so == NULL) { + *errorp = ENOMEM; + return (NULL); + } + + sonode_init(so, sp, family, type, protocol, &sotpi_sonodeops); + sotpi_info_init(so); + + if (sfamily == AF_NCA) { + SOTOTPI(so)->sti_nl7c_flags = NL7C_AF_NCA; + } + + if (version == SOV_DEFAULT) + version = so_default_version; + + so->so_version = (short)version; + *errorp = 0; + + return (so); +} + +static void +sotpi_destroy(struct sonode *so) +{ + kmem_cache_t *cp; + struct sockparams *origsp; + + /* + * If there is a new dealloc function (ie. smod_destroy_func), + * then it should check the correctness of the ops. + */ + + ASSERT(so->so_ops == &sotpi_sonodeops); + + origsp = SOTOTPI(so)->sti_orig_sp; + + sotpi_info_fini(so); + + if (so->so_state & SS_FALLBACK_COMP) { + /* + * A fallback happend, which means that a sotpi_info_t struct + * was allocated (as opposed to being allocated from the TPI + * sonode cache. Therefore we explicitly free the struct + * here. + */ + sotpi_info_destroy(so); + ASSERT(origsp != NULL); + + origsp->sp_smod_info->smod_sock_destroy_func(so); + SOCKPARAMS_DEC_REF(origsp); + } else { + sonode_fini(so); + cp = (so->so_family == AF_UNIX) ? socktpi_unix_cache : + socktpi_cache; + kmem_cache_free(cp, so); + } +} + +/* ARGSUSED1 */ +int +sotpi_init(struct sonode *so, struct sonode *tso, struct cred *cr, int flags) +{ + major_t maj; + dev_t newdev; + struct vnode *vp; + int error = 0; + struct stdata *stp; + + sotpi_info_t *sti = SOTOTPI(so); + + dprint(1, ("sotpi_init()\n")); + + /* + * over write the sleep flag passed in but that is ok + * as tpi socket does not honor sleep flag. + */ + flags |= FREAD|FWRITE; - flags = FREAD|FWRITE; + /* + * Record in so_flag that it is a clone. 
+ */ + if (getmajor(sti->sti_dev) == clone_major) + so->so_flag |= SOCLONE; - if ((type == SOCK_STREAM || type == SOCK_DGRAM) && - (domain == AF_INET || domain == AF_INET6) && - (protocol == IPPROTO_TCP || protocol == IPPROTO_UDP || - protocol == IPPROTO_IP)) { + if ((so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM) && + (so->so_family == AF_INET || so->so_family == AF_INET6) && + (so->so_protocol == IPPROTO_TCP || so->so_protocol == IPPROTO_UDP || + so->so_protocol == IPPROTO_IP)) { /* Tell tcp or udp that it's talking to sockets */ flags |= SO_SOCKSTR; @@ -253,25 +411,25 @@ sotpi_create(vnode_t *accessvp, int domain, int type, int protocol, int version, * make direct calls between sockfs and transport. * The final decision is left to socktpi_open(). */ - so->so_state |= SS_DIRECT; + sti->sti_direct = 1; ASSERT(so->so_type != SOCK_DGRAM || tso == NULL); if (so->so_type == SOCK_STREAM && tso != NULL) { - if (tso->so_state & SS_DIRECT) { + if (SOTOTPI(tso)->sti_direct) { /* - * Inherit SS_DIRECT from listener and pass + * Inherit sti_direct from listener and pass * SO_ACCEPTOR open flag to tcp, indicating * that this is an accept fast-path instance. */ flags |= SO_ACCEPTOR; } else { /* - * SS_DIRECT is not set on listener, meaning + * sti_direct is not set on listener, meaning * that the listener has been converted from * a socket to a stream. Ensure that the * acceptor inherits these settings. 
*/ - so->so_state &= ~SS_DIRECT; + sti->sti_direct = 0; flags &= ~SO_SOCKSTR; } } @@ -284,30 +442,157 @@ sotpi_create(vnode_t *accessvp, int domain, int type, int protocol, int version, flags |= SO_SOCKSTR; } - /* Initialize the kernel SSL proxy fields */ - so->so_kssl_type = KSSL_NO_PROXY; - so->so_kssl_ent = NULL; - so->so_kssl_ctx = NULL; + vp = SOTOV(so); + newdev = vp->v_rdev; + maj = getmajor(newdev); + ASSERT(STREAMSTAB(maj)); - if (error = socktpi_open(&vp, flags, CRED(), NULL)) { - VN_RELE(vp); - *errorp = error; - return (NULL); - } + error = stropen(vp, &newdev, flags, cr); - if (error = so_strinit(so, tso)) { - (void) VOP_CLOSE(vp, 0, 1, 0, CRED(), NULL); - VN_RELE(vp); - *errorp = error; - return (NULL); - } + stp = vp->v_stream; + if (error == 0) { + if (so->so_flag & SOCLONE) + ASSERT(newdev != vp->v_rdev); + mutex_enter(&so->so_lock); + sti->sti_dev = newdev; + vp->v_rdev = newdev; + mutex_exit(&so->so_lock); - if (version == SOV_DEFAULT) - version = so_default_version; + if (stp->sd_flag & STRISTTY) { + /* + * this is a post SVR4 tty driver - a socket can not + * be a controlling terminal. Fail the open. + */ + (void) sotpi_close(so, flags, cr); + return (ENOTTY); /* XXX */ + } - so->so_version = (short)version; + ASSERT(stp->sd_wrq != NULL); + sti->sti_provinfo = tpi_findprov(stp->sd_wrq); - return (so); + /* + * If caller is interested in doing direct function call + * interface to/from transport module, probe the module + * directly beneath the streamhead to see if it qualifies. + * + * We turn off the direct interface when qualifications fail. + * In the acceptor case, we simply turn off the sti_direct + * flag on the socket. We do the fallback after the accept + * has completed, before the new socket is returned to the + * application. + */ + if (sti->sti_direct) { + queue_t *tq = stp->sd_wrq->q_next; + + /* + * sti_direct is currently supported and tested + * only for tcp/udp; this is the main reason to + * have the following assertions. 
+ */ + ASSERT(so->so_family == AF_INET || + so->so_family == AF_INET6); + ASSERT(so->so_protocol == IPPROTO_UDP || + so->so_protocol == IPPROTO_TCP || + so->so_protocol == IPPROTO_IP); + ASSERT(so->so_type == SOCK_DGRAM || + so->so_type == SOCK_STREAM); + + /* + * Abort direct call interface if the module directly + * underneath the stream head is not defined with the + * _D_DIRECT flag. This could happen in the tcp or + * udp case, when some other module is autopushed + * above it, or for some reasons the expected module + * isn't purely D_MP (which is the main requirement). + * + * Else, SS_DIRECT is valid. If the read-side Q has + * _QSODIRECT set then and uioasync is enabled then + * set SS_SODIRECT to enable sodirect. + */ + if (!socktpi_direct || !(tq->q_flag & _QDIRECT) || + !(_OTHERQ(tq)->q_flag & _QDIRECT)) { + int rval; + + /* Continue on without direct calls */ + sti->sti_direct = 0; + + /* + * Cannot issue ioctl on fallback socket since + * there is no conn associated with the queue. + * The fallback downcall will notify the proto + * of the change. + */ + if (!(flags & SO_ACCEPTOR) && + !(flags & SO_FALLBACK)) { + if ((error = strioctl(vp, + _SIOCSOCKFALLBACK, 0, 0, K_TO_K, + cr, &rval)) != 0) { + (void) sotpi_close(so, flags, + cr); + return (error); + } + } + } else if ((_OTHERQ(tq)->q_flag & _QSODIRECT) && + uioasync.enabled) { + /* Enable sodirect */ + so->so_state |= SS_SODIRECT; + } + } + + if (flags & SO_FALLBACK) { + /* + * The stream created does not have a conn. + * do stream set up after conn has been assigned + */ + return (error); + } + if (error = so_strinit(so, tso)) { + (void) sotpi_close(so, flags, cr); + return (error); + } + + /* Wildcard */ + if (so->so_protocol != so->so_sockparams->sp_protocol) { + int protocol = so->so_protocol; + /* + * Issue SO_PROTOTYPE setsockopt. 
+ */ + error = sotpi_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE, + &protocol, (t_uscalar_t)sizeof (protocol), cr); + if (error != 0) { + (void) sotpi_close(so, flags, cr); + /* + * Setsockopt often fails with ENOPROTOOPT but + * socket() should fail with + * EPROTONOSUPPORT/EPROTOTYPE. + */ + return (EPROTONOSUPPORT); + } + } + + } else { + /* + * While the same socket can not be reopened (unlike specfs) + * the stream head sets STREOPENFAIL when the autopush fails. + */ + if ((stp != NULL) && + (stp->sd_flag & STREOPENFAIL)) { + /* + * Open failed part way through. + */ + mutex_enter(&stp->sd_lock); + stp->sd_flag &= ~STREOPENFAIL; + mutex_exit(&stp->sd_lock); + (void) sotpi_close(so, flags, cr); + return (error); + /*NOTREACHED*/ + } + ASSERT(stp == NULL); + } + TRACE_4(TR_FAC_SOCKFS, TR_SOCKFS_OPEN, + "sockfs open:maj %d vp %p so %p error %d", + maj, vp, so, error); + return (error); } /* @@ -318,15 +603,16 @@ sotpi_create(vnode_t *accessvp, int domain, int type, int protocol, int version, static void so_automatic_bind(struct sonode *so) { + sotpi_info_t *sti = SOTOTPI(so); ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6); ASSERT(MUTEX_HELD(&so->so_lock)); ASSERT(!(so->so_state & SS_ISBOUND)); - ASSERT(so->so_unbind_mp); + ASSERT(sti->sti_unbind_mp); - ASSERT(so->so_laddr_len <= so->so_laddr_maxlen); - bzero(so->so_laddr_sa, so->so_laddr_len); - so->so_laddr_sa->sa_family = so->so_family; + ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen); + bzero(sti->sti_laddr_sa, sti->sti_laddr_len); + sti->sti_laddr_sa->sa_family = so->so_family; so->so_state |= SS_ISBOUND; } @@ -353,9 +639,10 @@ so_automatic_bind(struct sonode *so) * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected * and no listen() has been done. 
*/ +/* ARGSUSED */ static int sotpi_bindlisten(struct sonode *so, struct sockaddr *name, - socklen_t namelen, int backlog, int flags) + socklen_t namelen, int backlog, int flags, struct cred *cr) { struct T_bind_req bind_req; struct T_bind_ack *bind_ack; @@ -370,6 +657,7 @@ sotpi_bindlisten(struct sonode *so, struct sockaddr *name, t_scalar_t PRIM_type = O_T_BIND_REQ; boolean_t tcp_udp_xport; void *nl7c = NULL; + sotpi_info_t *sti = SOTOTPI(so); dprintso(so, 1, ("sotpi_bindlisten(%p, %p, %d, %d, 0x%x) %s\n", (void *)so, (void *)name, namelen, backlog, flags, @@ -390,10 +678,10 @@ sotpi_bindlisten(struct sonode *so, struct sockaddr *name, * before binding. This message allocated when the socket is * created but it might be have been consumed. */ - if (so->so_unbind_mp == NULL) { + if (sti->sti_unbind_mp == NULL) { dprintso(so, 1, ("sobind: allocating unbind_req\n")); /* NOTE: holding so_lock while sleeping */ - so->so_unbind_mp = + sti->sti_unbind_mp = soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP); } @@ -405,17 +693,17 @@ sotpi_bindlisten(struct sonode *so, struct sockaddr *name, ASSERT(name == NULL && namelen == 0); if (so->so_family == AF_UNIX) { - ASSERT(so->so_ux_bound_vp); - addr = &so->so_ux_laddr; - addrlen = (t_uscalar_t)sizeof (so->so_ux_laddr); + ASSERT(sti->sti_ux_bound_vp); + addr = &sti->sti_ux_laddr; + addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr); dprintso(so, 1, ("sobind rebind UNIX: addrlen %d, " "addr 0x%p, vp %p\n", addrlen, (void *)((struct so_ux_addr *)addr)->soua_vp, - (void *)so->so_ux_bound_vp)); + (void *)sti->sti_ux_bound_vp)); } else { - addr = so->so_laddr_sa; - addrlen = (t_uscalar_t)so->so_laddr_len; + addr = sti->sti_laddr_sa; + addrlen = (t_uscalar_t)sti->sti_laddr_len; } } else if (flags & _SOBIND_UNSPEC) { ASSERT(name == NULL && namelen == 0); @@ -436,21 +724,21 @@ sotpi_bindlisten(struct sonode *so, struct sockaddr *name, * Use an address with same size as struct sockaddr * just like BSD. 
*/ - so->so_laddr_len = + sti->sti_laddr_len = (socklen_t)sizeof (struct sockaddr); - ASSERT(so->so_laddr_len <= so->so_laddr_maxlen); - bzero(so->so_laddr_sa, so->so_laddr_len); - so->so_laddr_sa->sa_family = so->so_family; + ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen); + bzero(sti->sti_laddr_sa, sti->sti_laddr_len); + sti->sti_laddr_sa->sa_family = so->so_family; /* * Pass down an address with the implicit bind * magic number and the rest all zeros. * The transport will return a unique address. */ - so->so_ux_laddr.soua_vp = NULL; - so->so_ux_laddr.soua_magic = SOU_MAGIC_IMPLICIT; - addr = &so->so_ux_laddr; - addrlen = (t_uscalar_t)sizeof (so->so_ux_laddr); + sti->sti_ux_laddr.soua_vp = NULL; + sti->sti_ux_laddr.soua_magic = SOU_MAGIC_IMPLICIT; + addr = &sti->sti_ux_laddr; + addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr); break; case AF_INET: @@ -459,12 +747,12 @@ sotpi_bindlisten(struct sonode *so, struct sockaddr *name, * An unspecified bind in TPI has a NULL address. * Set the address in sockfs to have the sa_family. */ - so->so_laddr_len = (so->so_family == AF_INET) ? + sti->sti_laddr_len = (so->so_family == AF_INET) ? (socklen_t)sizeof (sin_t) : (socklen_t)sizeof (sin6_t); - ASSERT(so->so_laddr_len <= so->so_laddr_maxlen); - bzero(so->so_laddr_sa, so->so_laddr_len); - so->so_laddr_sa->sa_family = so->so_family; + ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen); + bzero(sti->sti_laddr_sa, sti->sti_laddr_len); + sti->sti_laddr_sa->sa_family = so->so_family; addr = NULL; addrlen = 0; break; @@ -478,8 +766,8 @@ sotpi_bindlisten(struct sonode *so, struct sockaddr *name, * protocol families. For example, AF_X25 does not * have a family field. */ - bzero(so->so_laddr_sa, so->so_laddr_len); - so->so_laddr_len = 0; /* XXX correct? */ + bzero(sti->sti_laddr_sa, sti->sti_laddr_len); + sti->sti_laddr_len = 0; /* XXX correct? 
*/ addr = NULL; addrlen = 0; break; @@ -525,6 +813,7 @@ sotpi_bindlisten(struct sonode *so, struct sockaddr *name, goto done; } } + /* X/Open requires this check */ if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) { if (xnet_check_print) { @@ -656,7 +945,7 @@ sotpi_bindlisten(struct sonode *so, struct sockaddr *name, break; } - if (namelen > (t_uscalar_t)so->so_laddr_maxlen) { + if (namelen > (t_uscalar_t)sti->sti_laddr_maxlen) { error = ENAMETOOLONG; eprintsoline(so, error); goto done; @@ -664,26 +953,26 @@ sotpi_bindlisten(struct sonode *so, struct sockaddr *name, /* * Save local address. */ - so->so_laddr_len = (socklen_t)namelen; - ASSERT(so->so_laddr_len <= so->so_laddr_maxlen); - bcopy(name, so->so_laddr_sa, namelen); + sti->sti_laddr_len = (socklen_t)namelen; + ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen); + bcopy(name, sti->sti_laddr_sa, namelen); - addr = so->so_laddr_sa; - addrlen = (t_uscalar_t)so->so_laddr_len; + addr = sti->sti_laddr_sa; + addrlen = (t_uscalar_t)sti->sti_laddr_len; switch (so->so_family) { case AF_INET6: case AF_INET: break; case AF_UNIX: { struct sockaddr_un *soun = - (struct sockaddr_un *)so->so_laddr_sa; + (struct sockaddr_un *)sti->sti_laddr_sa; struct vnode *vp, *rvp; struct vattr vattr; - ASSERT(so->so_ux_bound_vp == NULL); + ASSERT(sti->sti_ux_bound_vp == NULL); /* * Create vnode for the specified path name. - * Keep vnode held with a reference in so_ux_bound_vp. + * Keep vnode held with a reference in sti_ux_bound_vp. * Use the vnode pointer as the address used in the * bind with the transport. * @@ -691,7 +980,7 @@ sotpi_bindlisten(struct sonode *so, struct sockaddr *name, * not observe the umask. 
*/ /* MAXPATHLEN + soun_family + nul termination */ - if (so->so_laddr_len > + if (sti->sti_laddr_len > (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) { error = ENAMETOOLONG; eprintsoline(so, error); @@ -712,7 +1001,7 @@ sotpi_bindlisten(struct sonode *so, struct sockaddr *name, /* * Establish pointer from the underlying filesystem * vnode to the socket node. - * so_ux_bound_vp and v_stream->sd_vnode form the + * sti_ux_bound_vp and v_stream->sd_vnode form the * cross-linkage between the underlying filesystem * node and the socket node. */ @@ -726,7 +1015,7 @@ sotpi_bindlisten(struct sonode *so, struct sockaddr *name, ASSERT(SOTOV(so)->v_stream); mutex_enter(&vp->v_lock); vp->v_stream = SOTOV(so)->v_stream; - so->so_ux_bound_vp = vp; + sti->sti_ux_bound_vp = vp; mutex_exit(&vp->v_lock); /* @@ -734,13 +1023,14 @@ sotpi_bindlisten(struct sonode *so, struct sockaddr *name, * (together with the magic number to avoid conflicts * with implicit binds) in the transport provider. */ - so->so_ux_laddr.soua_vp = (void *)so->so_ux_bound_vp; - so->so_ux_laddr.soua_magic = SOU_MAGIC_EXPLICIT; - addr = &so->so_ux_laddr; - addrlen = (t_uscalar_t)sizeof (so->so_ux_laddr); + sti->sti_ux_laddr.soua_vp = + (void *)sti->sti_ux_bound_vp; + sti->sti_ux_laddr.soua_magic = SOU_MAGIC_EXPLICIT; + addr = &sti->sti_ux_laddr; + addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr); dprintso(so, 1, ("sobind UNIX: addrlen %d, addr %p\n", addrlen, - ((struct so_ux_addr *)addr)->soua_vp)); + (void *)((struct so_ux_addr *)addr)->soua_vp)); break; } } /* end switch (so->so_family) */ @@ -771,14 +1061,14 @@ sotpi_bindlisten(struct sonode *so, struct sockaddr *name, if (nl7c_enabled && ((addr != NULL && (so->so_family == AF_INET || so->so_family == AF_INET6) && (nl7c = nl7c_lookup_addr(addr, addrlen))) || - so->so_nl7c_flags == NL7C_AF_NCA)) { + sti->sti_nl7c_flags == NL7C_AF_NCA)) { /* * NL7C is not supported in non-global zones, * we enforce this restriction here. 
*/ if (so->so_zoneid == GLOBAL_ZONEID) { /* An NL7C socket, mark it */ - so->so_nl7c_flags |= NL7C_ENABLED; + sti->sti_nl7c_flags |= NL7C_ENABLED; if (nl7c == NULL) { /* * Was an AF_NCA bind() so add it to the @@ -789,6 +1079,7 @@ sotpi_bindlisten(struct sonode *so, struct sockaddr *name, } else nl7c = NULL; } + /* * We send a T_BIND_REQ for TCP/UDP since we know it supports it, * for other transports we will send in a O_T_BIND_REQ. @@ -804,9 +1095,9 @@ sotpi_bindlisten(struct sonode *so, struct sockaddr *name, /* NOTE: holding so_lock while sleeping */ mp = soallocproto2(&bind_req, sizeof (bind_req), addr, addrlen, 0, _ALLOC_SLEEP); - so->so_state &= ~SS_LADDR_VALID; + sti->sti_laddr_valid = 0; - /* Done using so_laddr_sa - can drop the lock */ + /* Done using sti_laddr_sa - can drop the lock */ mutex_exit(&so->so_lock); /* @@ -820,13 +1111,15 @@ sotpi_bindlisten(struct sonode *so, struct sockaddr *name, (so->so_family == AF_INET || so->so_family == AF_INET6) && so->so_type == SOCK_STREAM) { - if (so->so_kssl_ent != NULL) { - kssl_release_ent(so->so_kssl_ent, so, so->so_kssl_type); - so->so_kssl_ent = NULL; + if (sti->sti_kssl_ent != NULL) { + kssl_release_ent(sti->sti_kssl_ent, so, + sti->sti_kssl_type); + sti->sti_kssl_ent = NULL; } - so->so_kssl_type = kssl_check_proxy(mp, so, &so->so_kssl_ent); - switch (so->so_kssl_type) { + sti->sti_kssl_type = kssl_check_proxy(mp, so, + &sti->sti_kssl_ent); + switch (sti->sti_kssl_type) { case KSSL_NO_PROXY: break; @@ -865,11 +1158,11 @@ skip_transport: /* Mark as bound. This will be undone if we detect errors below. */ if (flags & _SOBIND_NOXLATE) { ASSERT(so->so_family == AF_UNIX); - so->so_state |= SS_FADDR_NOXLATE; + sti->sti_faddr_noxlate = 1; } ASSERT(!(so->so_state & SS_ISBOUND) || (flags & _SOBIND_REBIND)); so->so_state |= SS_ISBOUND; - ASSERT(so->so_unbind_mp); + ASSERT(sti->sti_unbind_mp); /* note that we've already set SS_ACCEPTCONN above */ @@ -879,7 +1172,7 @@ skip_transport: * in return. 
*/ addrlen = (t_uscalar_t)(so->so_family == AF_UNIX ? - sizeof (so->so_ux_laddr) : so->so_laddr_len); + sizeof (sti->sti_ux_laddr) : sti->sti_laddr_len); bind_ack = (struct T_bind_ack *)mp->b_rptr; /* @@ -965,7 +1258,7 @@ skip_transport: sin_t *rname, *aname; rname = (sin_t *)addr; - aname = (sin_t *)so->so_laddr_sa; + aname = (sin_t *)sti->sti_laddr_sa; /* * Take advantage of the alignment @@ -990,7 +1283,7 @@ skip_transport: */ if (aname->sin_port == 0) aname->sin_port = rname->sin_port; - so->so_state |= SS_LADDR_VALID; + sti->sti_laddr_valid = 1; break; } if (aname->sin_port != 0 && @@ -1031,31 +1324,31 @@ skip_transport: break; } case AF_UNIX: - if (bcmp(addr, &so->so_ux_laddr, addrlen) != 0) { + if (bcmp(addr, &sti->sti_ux_laddr, addrlen) != 0) { freemsg(mp); error = EADDRINUSE; eprintsoline(so, error); eprintso(so, ("addrlen %d, addr 0x%x, vp %p\n", addrlen, *((int *)addr), - (void *)so->so_ux_bound_vp)); + (void *)sti->sti_ux_bound_vp)); goto done; } - so->so_state |= SS_LADDR_VALID; + sti->sti_laddr_valid = 1; break; default: /* * NOTE: This assumes that addresses can be * byte-compared for equivalence. */ - if (bcmp(addr, so->so_laddr_sa, addrlen) != 0) { + if (bcmp(addr, sti->sti_laddr_sa, addrlen) != 0) { freemsg(mp); error = EADDRINUSE; eprintsoline(so, error); goto done; } /* - * Don't mark SS_LADDR_VALID, as we cannot be + * Don't mark sti_laddr_valid, as we cannot be * sure that the returned address is the real * bound address when talking to an unknown * transport. @@ -1071,8 +1364,8 @@ skip_transport: * caching info here is much better performance than * a TPI/STREAMS trip to the transport for getsockname. 
* Any which can't for some reason _must_ _not_ set - * LADDR_VALID here for the caching version of getsockname - * to not break; + * sti_laddr_valid here for the caching version of + * getsockname to not break; */ switch (so->so_family) { case AF_UNIX: @@ -1080,18 +1373,18 @@ skip_transport: * Record the address bound with the transport * for use by socketpair. */ - bcopy(addr, &so->so_ux_laddr, addrlen); - so->so_state |= SS_LADDR_VALID; + bcopy(addr, &sti->sti_ux_laddr, addrlen); + sti->sti_laddr_valid = 1; break; case AF_INET: case AF_INET6: - ASSERT(so->so_laddr_len <= so->so_laddr_maxlen); - bcopy(addr, so->so_laddr_sa, so->so_laddr_len); - so->so_state |= SS_LADDR_VALID; + ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen); + bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len); + sti->sti_laddr_valid = 1; break; default: /* - * Don't mark SS_LADDR_VALID, as we cannot be + * Don't mark sti_laddr_valid, as we cannot be * sure that the returned address is the real * bound address when talking to an unknown * transport. 
@@ -1131,7 +1424,6 @@ done: so_unlock_single(so, SOLOCKED); mutex_exit(&so->so_lock); } else { - /* If the caller held the lock don't release it here */ ASSERT(MUTEX_HELD(&so->so_lock)); ASSERT(so->so_flag & SOLOCKED); } @@ -1141,13 +1433,13 @@ done: /* bind the socket */ static int sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen, - int flags) + int flags, struct cred *cr) { if ((flags & _SOBIND_SOCKETPAIR) == 0) - return (sotpi_bindlisten(so, name, namelen, 0, flags)); + return (sotpi_bindlisten(so, name, namelen, 0, flags, cr)); flags &= ~_SOBIND_SOCKETPAIR; - return (sotpi_bindlisten(so, name, namelen, 1, flags)); + return (sotpi_bindlisten(so, name, namelen, 1, flags, cr)); } /* @@ -1162,6 +1454,7 @@ sotpi_unbind(struct sonode *so, int flags) struct T_unbind_req unbind_req; int error = 0; mblk_t *mp; + sotpi_info_t *sti = SOTOTPI(so); dprintso(so, 1, ("sotpi_unbind(%p, 0x%x) %s\n", (void *)so, flags, pr_state(so->so_state, so->so_mode))); @@ -1211,26 +1504,26 @@ sotpi_unbind(struct sonode *so, int flags) */ vnode_t *vp; - if ((vp = so->so_ux_bound_vp) != NULL) { + if ((vp = sti->sti_ux_bound_vp) != NULL) { /* Undo any SSL proxy setup */ if ((so->so_family == AF_INET || so->so_family == AF_INET6) && (so->so_type == SOCK_STREAM) && - (so->so_kssl_ent != NULL)) { - kssl_release_ent(so->so_kssl_ent, so, - so->so_kssl_type); - so->so_kssl_ent = NULL; - so->so_kssl_type = KSSL_NO_PROXY; + (sti->sti_kssl_ent != NULL)) { + kssl_release_ent(sti->sti_kssl_ent, so, + sti->sti_kssl_type); + sti->sti_kssl_ent = NULL; + sti->sti_kssl_type = KSSL_NO_PROXY; } - - so->so_ux_bound_vp = NULL; + sti->sti_ux_bound_vp = NULL; vn_rele_stream(vp); } /* Clear out address */ - so->so_laddr_len = 0; + sti->sti_laddr_len = 0; } - so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN|SS_LADDR_VALID); + so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN); + sti->sti_laddr_valid = 0; done: @@ -1246,15 +1539,17 @@ done: * For TPI conforming transports this has to first unbind with the 
transport * and then bind again using the new backlog. */ +/* ARGSUSED */ int -sotpi_listen(struct sonode *so, int backlog) +sotpi_listen(struct sonode *so, int backlog, struct cred *cr) { int error = 0; + sotpi_info_t *sti = SOTOTPI(so); dprintso(so, 1, ("sotpi_listen(%p, %d) %s\n", (void *)so, backlog, pr_state(so->so_state, so->so_mode))); - if (so->so_serv_type == T_CLTS) + if (sti->sti_serv_type == T_CLTS) return (EOPNOTSUPP); /* @@ -1276,24 +1571,6 @@ sotpi_listen(struct sonode *so, int backlog) mutex_enter(&so->so_lock); so_lock_single(so); /* Set SOLOCKED */ - if (backlog < 0) - backlog = 0; - /* - * Use the same qlimit as in BSD. BSD checks the qlimit - * before queuing the next connection implying that a - * listen(sock, 0) allows one connection to be queued. - * BSD also uses 1.5 times the requested backlog. - * - * XNS Issue 4 required a strict interpretation of the backlog. - * This has been waived subsequently for Issue 4 and the change - * incorporated in XNS Issue 5. So we aren't required to do - * anything special for XPG apps. - */ - if (backlog >= (INT_MAX - 1) / 3) - backlog = INT_MAX; - else - backlog = backlog * 3 / 2 + 1; - /* * If the listen doesn't change the backlog we do nothing. * This avoids an EPROTO error from the transport. @@ -1311,7 +1588,7 @@ sotpi_listen(struct sonode *so, int backlog) goto done; } error = sotpi_bindlisten(so, NULL, 0, backlog, - _SOBIND_UNSPEC|_SOBIND_LOCK_HELD|_SOBIND_LISTEN); + _SOBIND_UNSPEC|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr); } else if (backlog > 0) { /* * AF_INET{,6} hack to avoid losing the port. 
@@ -1327,7 +1604,7 @@ sotpi_listen(struct sonode *so, int backlog) goto done; } error = sotpi_bindlisten(so, NULL, 0, backlog, - _SOBIND_REBIND|_SOBIND_LOCK_HELD|_SOBIND_LISTEN); + _SOBIND_REBIND|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr); } else { so->so_state |= SS_ACCEPTCONN; so->so_backlog = backlog; @@ -1349,7 +1626,7 @@ done: * the current use of sodisconnect(seqno == -1) is only for shutdown * so there is no point (and potentially incorrect) to unbind. */ -int +static int sodisconnect(struct sonode *so, t_scalar_t seqno, int flags) { struct T_discon_req discon_req; @@ -1406,8 +1683,9 @@ sodisconnect(struct sonode *so, t_scalar_t seqno, int flags) * is allowed to complete. However, it is not possible to * assert that SS_ISCONNECTED|SS_ISCONNECTING are set. */ - so->so_state &= - ~(SS_ISCONNECTED|SS_ISCONNECTING|SS_LADDR_VALID|SS_FADDR_VALID); + so->so_state &= ~(SS_ISCONNECTED|SS_ISCONNECTING); + SOTOTPI(so)->sti_laddr_valid = 0; + SOTOTPI(so)->sti_faddr_valid = 0; done: if (!(flags & _SODISCONNECT_LOCK_HELD)) { so_unlock_single(so, SOLOCKED); @@ -1420,8 +1698,10 @@ done: return (error); } +/* ARGSUSED */ int -sotpi_accept(struct sonode *so, int fflag, struct sonode **nsop) +sotpi_accept(struct sonode *so, int fflag, struct cred *cr, + struct sonode **nsop) { struct T_conn_ind *conn_ind; struct T_conn_res *conn_res; @@ -1436,6 +1716,8 @@ sotpi_accept(struct sonode *so, int fflag, struct sonode **nsop) t_scalar_t PRIM_type; t_scalar_t SEQ_number; size_t sinlen; + sotpi_info_t *sti = SOTOTPI(so); + sotpi_info_t *nsti; dprintso(so, 1, ("sotpi_accept(%p, 0x%x, %p) %s\n", (void *)so, fflag, (void *)nsop, @@ -1454,7 +1736,7 @@ again: if ((error = sowaitconnind(so, fflag, &mp)) != 0) goto e_bad; - ASSERT(mp); + ASSERT(mp != NULL); conn_ind = (struct T_conn_ind *)mp->b_rptr; ctxmp = mp->b_cont; @@ -1475,8 +1757,7 @@ again: switch (so->so_family) { case AF_INET: case AF_INET6: - if ((optlen == sizeof (intptr_t)) && - ((so->so_state & SS_DIRECT) != 0)) { + if ((optlen == 
sizeof (intptr_t)) && (sti->sti_direct != 0)) { bcopy(mp->b_rptr + conn_ind->OPT_offset, &opt, conn_ind->OPT_length); } else { @@ -1489,7 +1770,7 @@ again: * problems when sockfs sends a normal T_CONN_RES * message down the new stream. */ - if (so->so_state & SS_DIRECT) { + if (sti->sti_direct) { int rval; /* * For consistency we inform tcp to disable @@ -1498,7 +1779,7 @@ again: * because no data will ever travel upstream * on the listening socket. */ - so->so_state &= ~SS_DIRECT; + sti->sti_direct = 0; (void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK, 0, 0, K_TO_K, CRED(), &rval); } @@ -1519,7 +1800,7 @@ again: } } if (so->so_family == AF_UNIX) { - if (!(so->so_state & SS_FADDR_NOXLATE)) { + if (!sti->sti_faddr_noxlate) { src = NULL; srclen = 0; } @@ -1533,9 +1814,7 @@ again: /* * Create the new socket. */ - VN_HOLD(so->so_accessvp); - nso = sotpi_create(so->so_accessvp, so->so_family, so->so_type, - so->so_protocol, so->so_version, so, &error); + nso = socket_newconn(so, NULL, NULL, SOCKET_SLEEP, &error); if (nso == NULL) { ASSERT(error != 0); /* @@ -1549,6 +1828,7 @@ again: goto e_disc_unl; } nvp = SOTOV(nso); + nsti = SOTOTPI(nso); /* * If the transport sent up an SSL connection context, then attach @@ -1561,7 +1841,7 @@ again: * This kssl_ctx_t is already held for us by the transport. * So, we don't need to do a kssl_hold_ctx() here. */ - nso->so_kssl_ctx = *((kssl_ctx_t *)ctxmp->b_rptr); + nsti->sti_kssl_ctx = *((kssl_ctx_t *)ctxmp->b_rptr); freemsg(ctxmp); mp->b_cont = NULL; strsetrwputdatahooks(nvp, strsock_kssl_input, @@ -1572,7 +1852,6 @@ again: mutex_enter(nso->so_direct->sod_lockp); SOD_DISABLE(nso->so_direct); mutex_exit(nso->so_direct->sod_lockp); - nso->so_direct = NULL; } } #ifdef DEBUG @@ -1591,16 +1870,16 @@ again: * NOTE: AF_UNIX NUL termination is ensured by the sender's * copyin_name(). 
*/ - if (srclen > (t_uscalar_t)nso->so_faddr_maxlen) { + if (srclen > (t_uscalar_t)nsti->sti_faddr_maxlen) { error = EINVAL; freemsg(mp); eprintsoline(so, error); goto disconnect_vp_unlocked; } - nso->so_faddr_len = (socklen_t)srclen; - ASSERT(so->so_faddr_len <= so->so_faddr_maxlen); - bcopy(src, nso->so_faddr_sa, srclen); - nso->so_state |= SS_FADDR_VALID; + nsti->sti_faddr_len = (socklen_t)srclen; + ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen); + bcopy(src, nsti->sti_faddr_sa, srclen); + nsti->sti_faddr_valid = 1; if ((DB_REF(mp) > 1) || MBLKSIZE(mp) < (sizeof (struct T_conn_res) + sizeof (intptr_t))) { @@ -1654,7 +1933,8 @@ again: mutex_exit(&nso->so_lock); } else { /* Perform NULL bind with the transport provider. */ - if ((error = sotpi_bind(nso, NULL, 0, _SOBIND_UNSPEC)) != 0) { + if ((error = sotpi_bind(nso, NULL, 0, _SOBIND_UNSPEC, + cr)) != 0) { ASSERT(error != ENOBUFS); freemsg(mp); eprintsoline(nso, error); @@ -1671,7 +1951,8 @@ again: * can access the new socket thus we relax the locking. */ nso->so_pgrp = so->so_pgrp; - nso->so_state |= so->so_state & (SS_ASYNC|SS_FADDR_NOXLATE); + nso->so_state |= so->so_state & SS_ASYNC; + nsti->sti_faddr_noxlate = sti->sti_faddr_noxlate; if (nso->so_pgrp != 0) { if ((error = so_set_events(nso, nvp, CRED())) != 0) { @@ -1695,7 +1976,12 @@ again: if (nso->so_options & SO_LINGER) nso->so_linger = so->so_linger; - if ((so->so_state & SS_DIRECT) != 0) { + /* + * Note that the following sti_direct code path should be + * removed once we are confident that the direct sockets + * do not result in any degradation. 
+ */ + if (sti->sti_direct) { ASSERT(opt != NULL); @@ -1731,22 +2017,23 @@ again: sin = (sin_t *)(ack_mp->b_rptr + sizeof (struct T_ok_ack)); - bcopy(sin, nso->so_laddr_sa, sizeof (sin_t)); - nso->so_laddr_len = sizeof (sin_t); + bcopy(sin, nsti->sti_laddr_sa, sizeof (sin_t)); + nsti->sti_laddr_len = sizeof (sin_t); } else { sin6_t *sin6; sin6 = (sin6_t *)(ack_mp->b_rptr + sizeof (struct T_ok_ack)); - bcopy(sin6, nso->so_laddr_sa, sizeof (sin6_t)); - nso->so_laddr_len = sizeof (sin6_t); + bcopy(sin6, nsti->sti_laddr_sa, sizeof (sin6_t)); + nsti->sti_laddr_len = sizeof (sin6_t); } freemsg(ack_mp); - nso->so_state |= SS_ISCONNECTED | SS_LADDR_VALID; - nso->so_priv = opt; + nso->so_state |= SS_ISCONNECTED; + nso->so_proto_handle = (sock_lower_handle_t)opt; + nsti->sti_laddr_valid = 1; - if (so->so_nl7c_flags & NL7C_ENABLED) { + if (sti->sti_nl7c_flags & NL7C_ENABLED) { /* * A NL7C marked listen()er so the new socket * inherits the listen()er's NL7C state, except @@ -1755,14 +2042,15 @@ again: * Only call NL7C to process the new socket if * the listen socket allows blocking i/o. */ - nso->so_nl7c_flags = so->so_nl7c_flags & (~NL7C_POLLIN); + nsti->sti_nl7c_flags = + sti->sti_nl7c_flags & (~NL7C_POLLIN); if (so->so_state & (SS_NONBLOCK|SS_NDELAY)) { /* * Nonblocking accept() just make it * persist to defer processing to the * read-side syscall (e.g. read). */ - nso->so_nl7c_flags |= NL7C_SOPERSIST; + nsti->sti_nl7c_flags |= NL7C_SOPERSIST; } else if (nl7c_process(nso, B_FALSE)) { /* * NL7C has completed processing on the @@ -1782,12 +2070,12 @@ again: /* * It's possible, through the use of autopush for example, - * that the acceptor stream may not support SS_DIRECT - * semantics. If the new socket does not support SS_DIRECT + * that the acceptor stream may not support sti_direct + * semantics. If the new socket does not support sti_direct * we issue a _SIOCSOCKFALLBACK to inform the transport * as we would in the I_PUSH case. 
*/ - if (!(nso->so_state & SS_DIRECT)) { + if (nsti->sti_direct == 0) { int rval; if ((error = strioctl(SOTOV(nso), _SIOCSOCKFALLBACK, @@ -1842,7 +2130,7 @@ again: conn_res->PRIM_type = O_T_CONN_RES; PRIM_type = O_T_CONN_RES; } else { - conn_res->ACCEPTOR_id = nso->so_acceptor_id; + conn_res->ACCEPTOR_id = nsti->sti_acceptor_id; conn_res->PRIM_type = T_CONN_RES; PRIM_type = T_CONN_RES; } @@ -1871,27 +2159,28 @@ again: * If there is a sin/sin6 appended onto the T_OK_ACK use * that to set the local address. If this is not present * then we zero out the address and don't set the - * SS_LADDR_VALID bit. For AF_UNIX endpoints we copy over + * sti_laddr_valid bit. For AF_UNIX endpoints we copy over * the pathname from the listening socket. */ sinlen = (nso->so_family == AF_INET) ? sizeof (sin_t) : sizeof (sin6_t); if ((nso->so_family == AF_INET) || (nso->so_family == AF_INET6) && MBLKL(ack_mp) == (sizeof (struct T_ok_ack) + sinlen)) { ack_mp->b_rptr += sizeof (struct T_ok_ack); - bcopy(ack_mp->b_rptr, nso->so_laddr_sa, sinlen); - nso->so_laddr_len = sinlen; - nso->so_state |= SS_LADDR_VALID; + bcopy(ack_mp->b_rptr, nsti->sti_laddr_sa, sinlen); + nsti->sti_laddr_len = sinlen; + nsti->sti_laddr_valid = 1; } else if (nso->so_family == AF_UNIX) { ASSERT(so->so_family == AF_UNIX); - nso->so_laddr_len = so->so_laddr_len; - ASSERT(nso->so_laddr_len <= nso->so_laddr_maxlen); - bcopy(so->so_laddr_sa, nso->so_laddr_sa, nso->so_laddr_len); - nso->so_state |= SS_LADDR_VALID; + nsti->sti_laddr_len = sti->sti_laddr_len; + ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen); + bcopy(sti->sti_laddr_sa, nsti->sti_laddr_sa, + nsti->sti_laddr_len); + nsti->sti_laddr_valid = 1; } else { - nso->so_laddr_len = so->so_laddr_len; - ASSERT(nso->so_laddr_len <= nso->so_laddr_maxlen); - bzero(nso->so_laddr_sa, nso->so_addr_size); - nso->so_laddr_sa->sa_family = nso->so_family; + nsti->sti_laddr_len = sti->sti_laddr_len; + ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen); + 
bzero(nsti->sti_laddr_sa, nsti->sti_addr_size); + nsti->sti_laddr_sa->sa_family = nso->so_family; } freemsg(ack_mp); @@ -1953,7 +2242,8 @@ sotpi_connect(struct sonode *so, const struct sockaddr *name, socklen_t namelen, int fflag, - int flags) + int flags, + struct cred *cr) { struct T_conn_req conn_req; int error = 0; @@ -1963,6 +2253,7 @@ sotpi_connect(struct sonode *so, void *addr; socklen_t addrlen; boolean_t need_unlock; + sotpi_info_t *sti = SOTOTPI(so); dprintso(so, 1, ("sotpi_connect(%p, %p, %d, 0x%x, 0x%x) %s\n", (void *)so, (void *)name, namelen, fflag, flags, @@ -1971,13 +2262,13 @@ sotpi_connect(struct sonode *so, /* * Preallocate the T_CONN_REQ mblk before grabbing SOLOCKED to * avoid sleeping for memory with SOLOCKED held. - * We know that the T_CONN_REQ can't be larger than 2 * so_faddr_maxlen + * We know that the T_CONN_REQ can't be larger than 2 * sti_faddr_maxlen * + sizeof (struct T_opthdr). * (the AF_UNIX so_ux_addr_xlate() does not make the address - * exceed so_faddr_maxlen). + * exceed sti_faddr_maxlen). */ mp = soallocproto(sizeof (struct T_conn_req) + - 2 * so->so_faddr_maxlen + sizeof (struct T_opthdr), _ALLOC_INTR); + 2 * sti->sti_faddr_maxlen + sizeof (struct T_opthdr), _ALLOC_INTR); if (mp == NULL) { /* * Connect can not fail with ENOBUFS. 
A signal was @@ -2001,12 +2292,12 @@ sotpi_connect(struct sonode *so, so_lock_single(so); /* Set SOLOCKED */ need_unlock = B_TRUE; - if (so->so_unbind_mp == NULL) { + if (sti->sti_unbind_mp == NULL) { dprintso(so, 1, ("sotpi_connect: allocating unbind_req\n")); /* NOTE: holding so_lock while sleeping */ - so->so_unbind_mp = + sti->sti_unbind_mp = soallocproto(sizeof (struct T_unbind_req), _ALLOC_INTR); - if (so->so_unbind_mp == NULL) { + if (sti->sti_unbind_mp == NULL) { error = EINTR; goto done; } @@ -2034,7 +2325,7 @@ sotpi_connect(struct sonode *so, so_automatic_bind(so); } else { error = sotpi_bind(so, NULL, 0, - _SOBIND_UNSPEC|_SOBIND_LOCK_HELD); + _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr); if (error) goto done; } @@ -2088,17 +2379,19 @@ sotpi_connect(struct sonode *so, _SODISCONNECT_LOCK_HELD); } else { so->so_state &= - ~(SS_ISCONNECTED | SS_ISCONNECTING | - SS_FADDR_VALID); - so->so_faddr_len = 0; + ~(SS_ISCONNECTED | SS_ISCONNECTING); + sti->sti_faddr_valid = 0; + sti->sti_faddr_len = 0; } + /* Remove SOLOCKED since setsockopt will grab it */ so_unlock_single(so, SOLOCKED); mutex_exit(&so->so_lock); val = 0; - (void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND, - &val, (t_uscalar_t)sizeof (val)); + (void) sotpi_setsockopt(so, SOL_SOCKET, + SO_DGRAM_ERRIND, &val, (t_uscalar_t)sizeof (val), + cr); mutex_enter(&so->so_lock); so_lock_single(so); /* Set SOLOCKED */ @@ -2112,7 +2405,7 @@ sotpi_connect(struct sonode *so, goto done; } /* - * Mark the socket if so_faddr_sa represents the transport level + * Mark the socket if sti_faddr_sa represents the transport level * address. 
*/ if (flags & _SOCONNECT_NOXLATE) { @@ -2126,7 +2419,7 @@ sotpi_connect(struct sonode *so, soaddr_ux = (struct sockaddr_ux *)name; name = (struct sockaddr *)&soaddr_ux->sou_addr; namelen = sizeof (soaddr_ux->sou_addr); - so->so_state |= SS_FADDR_NOXLATE; + sti->sti_faddr_noxlate = 1; } /* @@ -2141,46 +2434,46 @@ sotpi_connect(struct sonode *so, * transport providers that do not support TI_GETPEERNAME. * Also used for cached foreign address for TCP and UDP. */ - if (namelen > (t_uscalar_t)so->so_faddr_maxlen) { + if (namelen > (t_uscalar_t)sti->sti_faddr_maxlen) { error = EINVAL; goto done; } - so->so_faddr_len = (socklen_t)namelen; - ASSERT(so->so_faddr_len <= so->so_faddr_maxlen); - bcopy(name, so->so_faddr_sa, namelen); - so->so_state |= SS_FADDR_VALID; + sti->sti_faddr_len = (socklen_t)namelen; + ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen); + bcopy(name, sti->sti_faddr_sa, namelen); + sti->sti_faddr_valid = 1; if (so->so_family == AF_UNIX) { - if (so->so_state & SS_FADDR_NOXLATE) { + if (sti->sti_faddr_noxlate) { /* * Already have a transport internal address. Do not * pass any (transport internal) source address. */ - addr = so->so_faddr_sa; - addrlen = (t_uscalar_t)so->so_faddr_len; + addr = sti->sti_faddr_sa; + addrlen = (t_uscalar_t)sti->sti_faddr_len; src = NULL; srclen = 0; } else { /* * Pass the sockaddr_un source address as an option * and translate the remote address. - * Holding so_lock thus so_laddr_sa can not change. + * Holding so_lock thus sti_laddr_sa can not change. 
*/ - src = so->so_laddr_sa; - srclen = (t_uscalar_t)so->so_laddr_len; + src = sti->sti_laddr_sa; + srclen = (t_uscalar_t)sti->sti_laddr_len; dprintso(so, 1, ("sotpi_connect UNIX: srclen %d, src %p\n", srclen, src)); error = so_ux_addr_xlate(so, - so->so_faddr_sa, (socklen_t)so->so_faddr_len, + sti->sti_faddr_sa, (socklen_t)sti->sti_faddr_len, (flags & _SOCONNECT_XPG4_2), &addr, &addrlen); if (error) goto bad; } } else { - addr = so->so_faddr_sa; - addrlen = (t_uscalar_t)so->so_faddr_len; + addr = sti->sti_faddr_sa; + addrlen = (t_uscalar_t)sti->sti_faddr_len; src = NULL; srclen = 0; } @@ -2209,7 +2502,7 @@ sotpi_connect(struct sonode *so, val = 1; (void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND, - &val, (t_uscalar_t)sizeof (val)); + &val, (t_uscalar_t)sizeof (val), cr); mutex_enter(&so->so_lock); so_lock_single(so); /* Set SOLOCKED */ @@ -2225,8 +2518,8 @@ sotpi_connect(struct sonode *so, */ fflag = 0; ASSERT(so->so_family != AF_UNIX); - so->so_state &= ~SS_LADDR_VALID; - } else if (so->so_laddr_len != 0) { + sti->sti_laddr_valid = 0; + } else if (sti->sti_laddr_len != 0) { /* * If the local address or port was "any" then it may be * changed by the transport as a result of the @@ -2234,21 +2527,22 @@ sotpi_connect(struct sonode *so, */ switch (so->so_family) { case AF_INET: - ASSERT(so->so_laddr_len == (socklen_t)sizeof (sin_t)); - if (((sin_t *)so->so_laddr_sa)->sin_addr.s_addr == + ASSERT(sti->sti_laddr_len == (socklen_t)sizeof (sin_t)); + if (((sin_t *)sti->sti_laddr_sa)->sin_addr.s_addr == INADDR_ANY || - ((sin_t *)so->so_laddr_sa)->sin_port == 0) - so->so_state &= ~SS_LADDR_VALID; + ((sin_t *)sti->sti_laddr_sa)->sin_port == 0) + sti->sti_laddr_valid = 0; break; case AF_INET6: - ASSERT(so->so_laddr_len == (socklen_t)sizeof (sin6_t)); + ASSERT(sti->sti_laddr_len == + (socklen_t)sizeof (sin6_t)); if (IN6_IS_ADDR_UNSPECIFIED( - &((sin6_t *)so->so_laddr_sa) ->sin6_addr) || + &((sin6_t *)sti->sti_laddr_sa) ->sin6_addr) || IN6_IS_ADDR_V4MAPPED_ANY( - 
&((sin6_t *)so->so_laddr_sa)->sin6_addr) || - ((sin6_t *)so->so_laddr_sa)->sin6_port == 0) - so->so_state &= ~SS_LADDR_VALID; + &((sin6_t *)sti->sti_laddr_sa)->sin6_addr) || + ((sin6_t *)sti->sti_laddr_sa)->sin6_port == 0) + sti->sti_laddr_valid = 0; break; default: @@ -2337,30 +2631,18 @@ done: case EISCONN: case EINTR: /* Non-fatal errors */ - so->so_state &= ~SS_LADDR_VALID; + sti->sti_laddr_valid = 0; /* FALLTHRU */ case 0: break; - - case EHOSTUNREACH: - if (flags & _SOCONNECT_XPG4_2) { - /* - * X/Open specification contains a requirement that - * ENETUNREACH be returned but does not require - * EHOSTUNREACH. In order to keep the test suite - * happy we mess with the errno here. - */ - error = ENETUNREACH; - } - /* FALLTHRU */ - default: ASSERT(need_unlock); /* * Fatal errors: clear SS_ISCONNECTING in case it was set, * and invalidate local-address cache */ - so->so_state &= ~(SS_ISCONNECTING | SS_LADDR_VALID); + so->so_state &= ~SS_ISCONNECTING; + sti->sti_laddr_valid = 0; /* A discon_ind might have already unbound us */ if ((flags & _SOCONNECT_DID_BIND) && (so->so_state & SS_ISBOUND)) { @@ -2379,18 +2661,20 @@ done: mutex_exit(&so->so_lock); return (error); -so_bad: error = sogeterr(so); +so_bad: error = sogeterr(so, B_TRUE); bad: eprintsoline(so, error); goto done; } +/* ARGSUSED */ int -sotpi_shutdown(struct sonode *so, int how) +sotpi_shutdown(struct sonode *so, int how, struct cred *cr) { struct T_ordrel_req ordrel_req; mblk_t *mp; uint_t old_state, state_change; int error = 0; + sotpi_info_t *sti = SOTOTPI(so); dprintso(so, 1, ("sotpi_shutdown(%p, %d) %s\n", (void *)so, how, pr_state(so->so_state, so->so_mode))); @@ -2523,14 +2807,14 @@ sotpi_shutdown(struct sonode *so, int how) * For SunOS 4.X compatibility we tell the other end * that we are unable to receive at this point. 
*/ - if (so->so_family == AF_UNIX && so->so_serv_type != T_CLTS) + if (so->so_family == AF_UNIX && sti->sti_serv_type != T_CLTS) so_unix_close(so); - if (so->so_serv_type == T_COTS) + if (sti->sti_serv_type == T_COTS) error = sodisconnect(so, -1, _SODISCONNECT_LOCK_HELD); } if ((state_change & SS_CANTSENDMORE) && - (so->so_serv_type == T_COTS_ORD)) { + (sti->sti_serv_type == T_COTS_ORD)) { /* Send an orderly release */ ordrel_req.PRIM_type = T_ORDREL_REQ; @@ -2582,6 +2866,7 @@ so_unix_close(struct sonode *so) int error; struct T_opthdr toh; mblk_t *mp; + sotpi_info_t *sti = SOTOTPI(so); ASSERT(MUTEX_HELD(&so->so_lock)); @@ -2632,35 +2917,35 @@ so_unix_close(struct sonode *so) /* * Length and family checks. */ - error = so_addr_verify(so, so->so_faddr_sa, - (t_uscalar_t)so->so_faddr_len); + error = so_addr_verify(so, sti->sti_faddr_sa, + (t_uscalar_t)sti->sti_faddr_len); if (error) { eprintsoline(so, error); return; } - if (so->so_state & SS_FADDR_NOXLATE) { + if (sti->sti_faddr_noxlate) { /* * Already have a transport internal address. Do not * pass any (transport internal) source address. */ - addr = so->so_faddr_sa; - addrlen = (t_uscalar_t)so->so_faddr_len; + addr = sti->sti_faddr_sa; + addrlen = (t_uscalar_t)sti->sti_faddr_len; src = NULL; srclen = 0; } else { /* * Pass the sockaddr_un source address as an option * and translate the remote address. - * Holding so_lock thus so_laddr_sa can not change. + * Holding so_lock thus sti_laddr_sa can not change. 
*/ - src = so->so_laddr_sa; - srclen = (socklen_t)so->so_laddr_len; + src = sti->sti_laddr_sa; + srclen = (socklen_t)sti->sti_laddr_len; dprintso(so, 1, ("so_ux_close: srclen %d, src %p\n", srclen, src)); error = so_ux_addr_xlate(so, - so->so_faddr_sa, - (socklen_t)so->so_faddr_len, 0, + sti->sti_faddr_sa, + (socklen_t)sti->sti_faddr_len, 0, &addr, &addrlen); if (error) { eprintsoline(so, error); @@ -2717,93 +3002,6 @@ so_unix_close(struct sonode *so) } /* - * Handle recv* calls that set MSG_OOB or MSG_OOB together with MSG_PEEK. - */ -int -sorecvoob(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, int flags) -{ - mblk_t *mp, *nmp; - int error; - - dprintso(so, 1, ("sorecvoob(%p, %p, 0x%x)\n", - (void *)so, (void *)msg, flags)); - - /* - * There is never any oob data with addresses or control since - * the T_EXDATA_IND does not carry any options. - */ - msg->msg_controllen = 0; - msg->msg_namelen = 0; - - mutex_enter(&so->so_lock); - ASSERT(so_verify_oobstate(so)); - if ((so->so_options & SO_OOBINLINE) || - (so->so_state & (SS_OOBPEND|SS_HADOOBDATA)) != SS_OOBPEND) { - dprintso(so, 1, ("sorecvoob: inline or data consumed\n")); - mutex_exit(&so->so_lock); - return (EINVAL); - } - if (!(so->so_state & SS_HAVEOOBDATA)) { - dprintso(so, 1, ("sorecvoob: no data yet\n")); - mutex_exit(&so->so_lock); - return (EWOULDBLOCK); - } - ASSERT(so->so_oobmsg != NULL); - mp = so->so_oobmsg; - if (flags & MSG_PEEK) { - /* - * Since recv* can not return ENOBUFS we can not use dupmsg. - * Instead we revert to the consolidation private - * allocb_wait plus bcopy. - */ - mblk_t *mp1; - - mp1 = allocb_wait(msgdsize(mp), BPRI_MED, STR_NOSIG, NULL); - ASSERT(mp1); - - while (mp != NULL) { - ssize_t size; - - size = MBLKL(mp); - bcopy(mp->b_rptr, mp1->b_wptr, size); - mp1->b_wptr += size; - ASSERT(mp1->b_wptr <= mp1->b_datap->db_lim); - mp = mp->b_cont; - } - mp = mp1; - } else { - /* - * Update the state indicating that the data has been consumed. 
- * Keep SS_OOBPEND set until data is consumed past the mark. - */ - so->so_oobmsg = NULL; - so->so_state ^= SS_HAVEOOBDATA|SS_HADOOBDATA; - } - dprintso(so, 1, - ("after recvoob(%p): counts %d/%d state %s\n", - (void *)so, so->so_oobsigcnt, - so->so_oobcnt, pr_state(so->so_state, so->so_mode))); - ASSERT(so_verify_oobstate(so)); - mutex_exit(&so->so_lock); - - error = 0; - nmp = mp; - while (nmp != NULL && uiop->uio_resid > 0) { - ssize_t n = MBLKL(nmp); - - n = MIN(n, uiop->uio_resid); - if (n > 0) - error = uiomove(nmp->b_rptr, n, - UIO_READ, uiop); - if (error) - break; - nmp = nmp->b_cont; - } - freemsg(mp); - return (error); -} - -/* * Called by sotpi_recvmsg when reading a non-zero amount of data. * In addition, the caller typically verifies that there is some * potential state to clear by checking @@ -2811,7 +3009,7 @@ sorecvoob(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, int flags) * before calling this routine. * Note that such a check can be made without holding so_lock since * sotpi_recvmsg is single-threaded (using SOREADLOCKED) and only sotpi_recvmsg - * decrements so_oobsigcnt. + * decrements sti_oobsigcnt. * * When data is read *after* the point that all pending * oob data has been consumed the oob indication is cleared. 
@@ -2823,13 +3021,15 @@ sorecvoob(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, int flags) static void sorecv_update_oobstate(struct sonode *so) { + sotpi_info_t *sti = SOTOTPI(so); + mutex_enter(&so->so_lock); ASSERT(so_verify_oobstate(so)); dprintso(so, 1, ("sorecv_update_oobstate: counts %d/%d state %s\n", - so->so_oobsigcnt, - so->so_oobcnt, pr_state(so->so_state, so->so_mode))); - if (so->so_oobsigcnt == 0) { + sti->sti_oobsigcnt, + sti->sti_oobcnt, pr_state(so->so_state, so->so_mode))); + if (sti->sti_oobsigcnt == 0) { /* No more pending oob indications */ so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK); freemsg(so->so_oobmsg); @@ -2845,10 +3045,11 @@ sorecv_update_oobstate(struct sonode *so) static int nl7c_sorecv(struct sonode *so, mblk_t **rmp, uio_t *uiop, rval_t *rp) { + sotpi_info_t *sti = SOTOTPI(so); int error = 0; mblk_t *tmp = NULL; mblk_t *pmp = NULL; - mblk_t *nmp = so->so_nl7c_rcv_mp; + mblk_t *nmp = sti->sti_nl7c_rcv_mp; ASSERT(nmp != NULL); @@ -2889,25 +3090,24 @@ nl7c_sorecv(struct sonode *so, mblk_t **rmp, uio_t *uiop, rval_t *rp) if (pmp != NULL) { /* Free any mblk_t(s) which we have consumed */ pmp->b_cont = NULL; - freemsg(so->so_nl7c_rcv_mp); + freemsg(sti->sti_nl7c_rcv_mp); } - if ((so->so_nl7c_rcv_mp = nmp) == NULL) { + if ((sti->sti_nl7c_rcv_mp = nmp) == NULL) { /* Last mblk_t so return the saved kstrgetmsg() rval/error */ if (error == 0) { - rval_t *p = (rval_t *)&so->so_nl7c_rcv_rval; + rval_t *p = (rval_t *)&sti->sti_nl7c_rcv_rval; error = p->r_v.r_v2; p->r_v.r_v2 = 0; } - rp->r_vals = so->so_nl7c_rcv_rval; - so->so_nl7c_rcv_rval = 0; + rp->r_vals = sti->sti_nl7c_rcv_rval; + sti->sti_nl7c_rcv_rval = 0; } else { /* More mblk_t(s) to process so no rval to return */ rp->r_vals = 0; } return (error); } - /* * Receive the next message on the queue. 
* If msg_controllen is non-zero when called the caller is interested in @@ -2917,8 +3117,10 @@ nl7c_sorecv(struct sonode *so, mblk_t **rmp, uio_t *uiop, rval_t *rp) * The routine returns with msg_control and msg_name pointing to * kmem_alloc'ed memory which the caller has to free. */ +/* ARGSUSED */ int -sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) +sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, + struct cred *cr) { union T_primitives *tpr; mblk_t *mp; @@ -2932,10 +3134,10 @@ sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) rval_t rval; int flags; clock_t timout; - int first; int error = 0; + int reterr = 0; struct uio *suiop = NULL; - sodirect_t *sodp = so->so_direct; + sotpi_info_t *sti = SOTOTPI(so); flags = msg->msg_flags; msg->msg_flags = 0; @@ -2944,6 +3146,12 @@ sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) (void *)so, (void *)msg, flags, pr_state(so->so_state, so->so_mode), so->so_error)); + if (so->so_version == SOV_STREAM) { + so_update_attrs(so, SOACC); + /* The imaginary "sockmod" has been popped - act as a stream */ + return (strread(SOTOV(so), uiop, cr)); + } + /* * If we are not connected because we have never been connected * we return ENOTCONN. If we have been connected (but are no longer @@ -2970,9 +3178,13 @@ sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) /* Check that the transport supports OOB */ if (!(so->so_mode & SM_EXDATA)) return (EOPNOTSUPP); - return (sorecvoob(so, msg, uiop, flags)); + so_update_attrs(so, SOACC); + return (sorecvoob(so, msg, uiop, flags, + (so->so_options & SO_OOBINLINE))); } + so_update_attrs(so, SOACC); + /* * Set msg_controllen and msg_namelen to zero here to make it * simpler in the cases that no control or name is returned. @@ -2989,31 +3201,32 @@ sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) /* * If an NL7C enabled socket and not waiting for write data. 
*/ - if ((so->so_nl7c_flags & (NL7C_ENABLED | NL7C_WAITWRITE)) == + if ((sti->sti_nl7c_flags & (NL7C_ENABLED | NL7C_WAITWRITE)) == NL7C_ENABLED) { - if (so->so_nl7c_uri) { + if (sti->sti_nl7c_uri) { /* Close uri processing for a previous request */ nl7c_close(so); } - if ((so_state & SS_CANTRCVMORE) && so->so_nl7c_rcv_mp == NULL) { + if ((so_state & SS_CANTRCVMORE) && + sti->sti_nl7c_rcv_mp == NULL) { /* Nothing to process, EOF */ mutex_exit(&so->so_lock); return (0); - } else if (so->so_nl7c_flags & NL7C_SOPERSIST) { + } else if (sti->sti_nl7c_flags & NL7C_SOPERSIST) { /* Persistent NL7C socket, try to process request */ boolean_t ret; ret = nl7c_process(so, (so->so_state & (SS_NONBLOCK|SS_NDELAY))); - rval.r_vals = so->so_nl7c_rcv_rval; + rval.r_vals = sti->sti_nl7c_rcv_rval; error = rval.r_v.r_v2; if (error) { /* Error of some sort, return it */ mutex_exit(&so->so_lock); return (error); } - if (so->so_nl7c_flags && - ! (so->so_nl7c_flags & NL7C_WAITWRITE)) { + if (sti->sti_nl7c_flags && + ! (sti->sti_nl7c_flags & NL7C_WAITWRITE)) { /* * Still an NL7C socket and no data * to pass up to the caller. @@ -3031,7 +3244,7 @@ sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) /* * Not persistent so no further NL7C processing. */ - so->so_nl7c_flags = 0; + sti->sti_nl7c_flags = 0; } } /* @@ -3081,84 +3294,23 @@ sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) else timout = -1; opflag = pflag; - first = 1; - if (uiop->uio_resid >= uioasync.mincnt && - sodp != NULL && (sodp->sod_state & SOD_ENABLED) && - uioasync.enabled && !(flags & MSG_PEEK) && - !(so_state & SS_CANTRCVMORE)) { - /* - * Big enough I/O for uioa min setup and an sodirect socket - * and sodirect enabled and uioa enabled and I/O will be done - * and not EOF so initialize the sodirect_t uioa_t with "uiop". 
- */ - mutex_enter(sodp->sod_lockp); - if (!uioainit(uiop, &sodp->sod_uioa)) { - /* - * Successful uioainit() so the uio_t part of the - * uioa_t will be used for all uio_t work to follow, - * we save the original "uiop" in "suiop". - */ - suiop = uiop; - uiop = (uio_t *)&sodp->sod_uioa; - /* - * Before returning to the caller the passed in uio_t - * "uiop" will be updated via a call to uioafini() - * below. - * - * Note, the uioa.uioa_state isn't set to UIOA_ENABLED - * here as first we have to uioamove() any currently - * queued M_DATA mblk_t(s) so it will be done in - * kstrgetmsg(). - */ - } - /* - * In either uioainit() success or not case note the number - * of uio bytes the caller wants for sod framework and/or - * transport (e.g. TCP) strategy. - */ - sodp->sod_want = uiop->uio_resid; - mutex_exit(sodp->sod_lockp); - } else if (sodp != NULL && (sodp->sod_state & SOD_ENABLED)) { - /* - * No uioa but still using sodirect so note the number of - * uio bytes the caller wants for sodirect framework and/or - * transport (e.g. TCP) strategy. - * - * Note, sod_lockp not held, only writer is in this function - * and only one thread at a time so not needed just to init. 
- */ - sodp->sod_want = uiop->uio_resid; - } + suiop = sod_rcv_init(so, flags, &uiop); retry: saved_resid = uiop->uio_resid; pri = 0; mp = NULL; - if (so->so_nl7c_rcv_mp != NULL) { + if (sti->sti_nl7c_rcv_mp != NULL) { /* Already kstrgetmsg()ed saved mblk(s) from NL7C */ error = nl7c_sorecv(so, &mp, uiop, &rval); } else { error = kstrgetmsg(SOTOV(so), &mp, uiop, &pri, &pflag, timout, &rval); } - if (error) { - switch (error) { - case EINTR: - case EWOULDBLOCK: - if (!first) - error = 0; - break; - case ETIME: - /* Returned from kstrgetmsg when timeout expires */ - if (!first) - error = 0; - else - error = EWOULDBLOCK; - break; - default: - eprintsoline(so, error); - break; - } + if (error != 0) { + /* kstrgetmsg returns ETIME when timeout expires */ + if (error == ETIME) + error = EWOULDBLOCK; goto out; } /* @@ -3198,7 +3350,6 @@ retry: if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) && uiop->uio_resid != saved_resid && uiop->uio_resid > 0) { mutex_exit(&so->so_lock); - first = 0; pflag = opflag | MSG_NOMARK; goto retry; } @@ -3238,7 +3389,6 @@ retry: if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) && uiop->uio_resid != saved_resid && uiop->uio_resid > 0) { mutex_exit(&so->so_lock); - first = 0; pflag = opflag | MSG_NOMARK; goto retry; } @@ -3436,7 +3586,6 @@ retry: controllen == 0 && uiop->uio_resid != saved_resid && uiop->uio_resid > 0) { mutex_exit(&so->so_lock); - first = 0; pflag = opflag | MSG_NOMARK; goto retry; } @@ -3446,7 +3595,7 @@ retry: dprintso(so, 1, ("sotpi_recvmsg: EXDATA_IND counts %d/%d consumed %ld " "state %s\n", - so->so_oobsigcnt, so->so_oobcnt, + sti->sti_oobsigcnt, sti->sti_oobcnt, saved_resid - uiop->uio_resid, pr_state(so->so_state, so->so_mode))); /* @@ -3476,8 +3625,8 @@ retry: dprintso(so, 1, ("sotpi_recvmsg: consume EXDATA_IND " "counts %d/%d state %s\n", - so->so_oobsigcnt, - so->so_oobcnt, + sti->sti_oobsigcnt, + sti->sti_oobcnt, pr_state(so->so_state, so->so_mode))); pflag = MSG_ANY | MSG_DELAYERROR; @@ -3516,11 
+3665,11 @@ retry: */ mutex_enter(&so->so_lock); ASSERT(so_verify_oobstate(so)); - ASSERT(so->so_oobsigcnt >= so->so_oobcnt); - ASSERT(so->so_oobsigcnt > 0); - so->so_oobsigcnt--; - ASSERT(so->so_oobcnt > 0); - so->so_oobcnt--; + ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt); + ASSERT(sti->sti_oobsigcnt > 0); + sti->sti_oobsigcnt--; + ASSERT(sti->sti_oobcnt > 0); + sti->sti_oobcnt--; /* * Since the T_EXDATA_IND has been removed from the stream * head, but we have not read data past the mark, @@ -3533,12 +3682,14 @@ retry: mutex_exit(&so->so_lock); dprintso(so, 1, ("sotpi_recvmsg: retry EXDATA_IND counts %d/%d state %s\n", - so->so_oobsigcnt, so->so_oobcnt, + sti->sti_oobsigcnt, sti->sti_oobcnt, pr_state(so->so_state, so->so_mode))); pflag = opflag; goto retry; } default: + cmn_err(CE_CONT, "sotpi_recvmsg: so %p prim %d mp %p\n", + (void *)so, tpr->type, (void *)mp); ASSERT(0); freemsg(mp); error = EPROTO; @@ -3549,35 +3700,13 @@ retry: out: mutex_enter(&so->so_lock); out_locked: - if (sodp != NULL) { - /* Finish any sodirect and uioa processing */ - mutex_enter(sodp->sod_lockp); - if (suiop != NULL) { - /* Finish any uioa_t processing */ - int ret; - - ASSERT(uiop == (uio_t *)&sodp->sod_uioa); - ret = uioafini(suiop, (uioa_t *)uiop); - if (error == 0 && ret != 0) { - /* If no error yet, set it */ - error = ret; - } - if ((mp = sodp->sod_uioafh) != NULL) { - sodp->sod_uioafh = NULL; - sodp->sod_uioaft = NULL; - freemsg(mp); - } - } - ASSERT(sodp->sod_uioafh == NULL); - if (!(sodp->sod_state & SOD_WAKE_NOT)) { - /* Awoke */ - sodp->sod_state &= SOD_WAKE_CLR; - sodp->sod_state |= SOD_WAKE_NOT; - } - /* Last, clear sod_want value */ - sodp->sod_want = 0; - mutex_exit(sodp->sod_lockp); + if (so->so_direct != NULL) { + mutex_enter(so->so_direct->sod_lockp); + reterr = sod_rcv_done(so, suiop, uiop); + mutex_exit(so->so_direct->sod_lockp); } + if (reterr != 0 && error == 0) + error = reterr; so_unlock_read(so); /* Clear SOREADLOCKED */ mutex_exit(&so->so_lock); return 
(error); @@ -3605,12 +3734,13 @@ sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen, t_uscalar_t optlen; void *fds; int fdlen; + sotpi_info_t *sti = SOTOTPI(so); ASSERT(name && namelen); ASSERT(control && controllen); len = uiop->uio_resid; - if (len > (ssize_t)so->so_tidu_size) { + if (len > (ssize_t)sti->sti_tidu_size) { return (EMSGSIZE); } @@ -3630,7 +3760,7 @@ sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen, return (error); } if (so->so_family == AF_UNIX) { - if (so->so_state & SS_FADDR_NOXLATE) { + if (sti->sti_faddr_noxlate) { /* * Already have a transport internal address. Do not * pass any (transport internal) source address. @@ -3644,14 +3774,14 @@ sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen, * Pass the sockaddr_un source address as an option * and translate the remote address. * - * Note that this code does not prevent so_laddr_sa + * Note that this code does not prevent sti_laddr_sa * from changing while it is being used. Thus * if an unbind+bind occurs concurrently with this * send the peer might see a partially new and a * partially old "from" address. */ - src = so->so_laddr_sa; - srclen = (t_uscalar_t)so->so_laddr_len; + src = sti->sti_laddr_sa; + srclen = (t_uscalar_t)sti->sti_laddr_len; dprintso(so, 1, ("sosend_dgramcmsg UNIX: srclen %d, src %p\n", srclen, src)); @@ -3762,24 +3892,20 @@ sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen, * Assumes caller has verified that SS_ISCONNECTED is set. 
*/ static int -sosend_svccmsg(struct sonode *so, - struct uio *uiop, - int more, - void *control, - t_uscalar_t controllen, - int flags) +sosend_svccmsg(struct sonode *so, struct uio *uiop, int more, void *control, + t_uscalar_t controllen, int flags) { struct T_optdata_req tdr; mblk_t *mp; int error; ssize_t iosize; - int first = 1; int size; struct fdbuf *fdbuf; t_uscalar_t optlen; void *fds; int fdlen; struct T_opthdr toh; + sotpi_info_t *sti = SOTOTPI(so); dprintso(so, 1, ("sosend_svccmsg: resid %ld bytes\n", uiop->uio_resid)); @@ -3801,7 +3927,7 @@ sosend_svccmsg(struct sonode *so, * Error for transports with zero tidu_size. */ tdr.PRIM_type = T_OPTDATA_REQ; - iosize = so->so_tidu_size; + iosize = sti->sti_tidu_size; if (iosize <= 0) return (EMSGSIZE); if (uiop->uio_resid > iosize) { @@ -3843,7 +3969,7 @@ sosend_svccmsg(struct sonode *so, * Caught a signal waiting for memory. * Let send* return EINTR. */ - return (first ? EINTR : 0); + return (EINTR); } } soappendmsg(mp, &tdr, sizeof (tdr)); @@ -3869,13 +3995,10 @@ sosend_svccmsg(struct sonode *so, error = kstrputmsg(SOTOV(so), mp, uiop, iosize, 0, MSG_BAND, 0); if (error) { - if (!first && error == EWOULDBLOCK) - return (0); eprintsoline(so, error); return (error); } control = NULL; - first = 0; if (uiop->uio_resid > 0) { /* * Recheck for fatal errors. Fail write even though @@ -3883,13 +4006,12 @@ sosend_svccmsg(struct sonode *so, * with strwrite semantics and BSD sockets semantics. 
*/ if (so->so_state & SS_CANTSENDMORE) { - tsignal(curthread, SIGPIPE); eprintsoline(so, error); return (EPIPE); } if (so->so_error != 0) { mutex_enter(&so->so_lock); - error = sogeterr(so); + error = sogeterr(so, B_TRUE); mutex_exit(&so->so_lock); if (error != 0) { eprintsoline(so, error); @@ -3920,11 +4042,12 @@ sosend_dgram(struct sonode *so, struct sockaddr *name, socklen_t namelen, void *src; socklen_t srclen; ssize_t len; + sotpi_info_t *sti = SOTOTPI(so); ASSERT(name != NULL && namelen != 0); len = uiop->uio_resid; - if (len > so->so_tidu_size) { + if (len > sti->sti_tidu_size) { error = EMSGSIZE; goto done; } @@ -3934,11 +4057,11 @@ sosend_dgram(struct sonode *so, struct sockaddr *name, socklen_t namelen, if (error != 0) goto done; - if (so->so_state & SS_DIRECT) + if (sti->sti_direct) return (sodgram_direct(so, name, namelen, uiop, flags)); if (so->so_family == AF_UNIX) { - if (so->so_state & SS_FADDR_NOXLATE) { + if (sti->sti_faddr_noxlate) { /* * Already have a transport internal address. Do not * pass any (transport internal) source address. @@ -3952,14 +4075,14 @@ sosend_dgram(struct sonode *so, struct sockaddr *name, socklen_t namelen, * Pass the sockaddr_un source address as an option * and translate the remote address. * - * Note that this code does not prevent so_laddr_sa + * Note that this code does not prevent sti_laddr_sa * from changing while it is being used. Thus * if an unbind+bind occurs concurrently with this * send the peer might see a partially new and a * partially old "from" address. */ - src = so->so_laddr_sa; - srclen = (socklen_t)so->so_laddr_len; + src = sti->sti_laddr_sa; + srclen = (socklen_t)sti->sti_laddr_len; dprintso(so, 1, ("sosend_dgram UNIX: srclen %d, src %p\n", srclen, src)); @@ -4048,17 +4171,14 @@ done: * Assumes caller has verified that SS_ISCONNECTED is set. 
*/ int -sosend_svc(struct sonode *so, - struct uio *uiop, - t_scalar_t prim, - int more, - int sflag) +sosend_svc(struct sonode *so, struct uio *uiop, t_scalar_t prim, int more, + int sflag) { struct T_data_req tdr; mblk_t *mp; int error; ssize_t iosize; - int first = 1; + sotpi_info_t *sti = SOTOTPI(so); dprintso(so, 1, ("sosend_svc: %p, resid %ld bytes, prim %d, sflag 0x%x\n", @@ -4077,7 +4197,7 @@ sosend_svc(struct sonode *so, * Error for transports with zero tidu_size. */ tdr.PRIM_type = prim; - iosize = so->so_tidu_size; + iosize = sti->sti_tidu_size; if (iosize <= 0) return (EMSGSIZE); if (uiop->uio_resid > iosize) { @@ -4097,21 +4217,15 @@ sosend_svc(struct sonode *so, * Caught a signal waiting for memory. * Let send* return EINTR. */ - if (first) - return (EINTR); - else - return (0); + return (EINTR); } error = kstrputmsg(SOTOV(so), mp, uiop, iosize, 0, sflag | MSG_BAND, 0); if (error) { - if (!first && error == EWOULDBLOCK) - return (0); eprintsoline(so, error); return (error); } - first = 0; if (uiop->uio_resid > 0) { /* * Recheck for fatal errors. Fail write even though @@ -4119,13 +4233,12 @@ sosend_svc(struct sonode *so, * with strwrite semantics and BSD sockets semantics. */ if (so->so_state & SS_CANTSENDMORE) { - tsignal(curthread, SIGPIPE); eprintsoline(so, error); return (EPIPE); } if (so->so_error != 0) { mutex_enter(&so->so_lock); - error = sogeterr(so); + error = sogeterr(so, B_TRUE); mutex_exit(&so->so_lock); if (error != 0) { eprintsoline(so, error); @@ -4145,7 +4258,8 @@ sosend_svc(struct sonode *so, * after sending the message. 
*/ static int -sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) +sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, + struct cred *cr) { int so_state; int so_mode; @@ -4154,22 +4268,28 @@ sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) t_uscalar_t namelen; int dontroute; int flags; + sotpi_info_t *sti = SOTOTPI(so); dprintso(so, 1, ("sotpi_sendmsg(%p, %p, 0x%x) state %s, error %d\n", (void *)so, (void *)msg, msg->msg_flags, pr_state(so->so_state, so->so_mode), so->so_error)); + if (so->so_version == SOV_STREAM) { + /* The imaginary "sockmod" has been popped - act as a stream */ + so_update_attrs(so, SOMOD); + return (strwrite(SOTOV(so), uiop, cr)); + } + mutex_enter(&so->so_lock); so_state = so->so_state; if (so_state & SS_CANTSENDMORE) { mutex_exit(&so->so_lock); - tsignal(curthread, SIGPIPE); return (EPIPE); } if (so->so_error != 0) { - error = sogeterr(so); + error = sogeterr(so, B_TRUE); if (error != 0) { mutex_exit(&so->so_lock); return (error); @@ -4194,15 +4314,15 @@ sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) namelen = 0; } else { /* - * Note that this code does not prevent so_faddr_sa + * Note that this code does not prevent sti_faddr_sa * from changing while it is being used. Thus * if an "unconnect"+connect occurs concurrently with * this send the datagram might be delivered to a * garbaled address. 
*/ - ASSERT(so->so_faddr_sa); - name = so->so_faddr_sa; - namelen = (t_uscalar_t)so->so_faddr_len; + ASSERT(sti->sti_faddr_sa); + name = sti->sti_faddr_sa; + namelen = (t_uscalar_t)sti->sti_faddr_len; } } else { if (!(so_state & SS_ISCONNECTED) && @@ -4227,7 +4347,7 @@ sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) if (!(so_state & SS_ISBOUND)) { so_lock_single(so); /* Set SOLOCKED */ error = sotpi_bind(so, NULL, 0, - _SOBIND_UNSPEC|_SOBIND_LOCK_HELD); + _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr); so_unlock_single(so, SOLOCKED); if (error) { mutex_exit(&so->so_lock); @@ -4243,20 +4363,20 @@ sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) * If sending to some other address discard the delayed * error indication. */ - if (so->so_delayed_error) { + if (sti->sti_delayed_error) { struct T_uderror_ind *tudi; void *addr; t_uscalar_t addrlen; boolean_t match = B_FALSE; - ASSERT(so->so_eaddr_mp); - error = so->so_delayed_error; - so->so_delayed_error = 0; - tudi = (struct T_uderror_ind *)so->so_eaddr_mp->b_rptr; + ASSERT(sti->sti_eaddr_mp); + error = sti->sti_delayed_error; + sti->sti_delayed_error = 0; + tudi = + (struct T_uderror_ind *)sti->sti_eaddr_mp->b_rptr; addrlen = tudi->DEST_length; - addr = sogetoff(so->so_eaddr_mp, - tudi->DEST_offset, - addrlen, 1); + addr = sogetoff(sti->sti_eaddr_mp, + tudi->DEST_offset, addrlen, 1); ASSERT(addr); /* Checked by strsock_proto */ switch (so->so_family) { case AF_INET: { @@ -4292,8 +4412,8 @@ sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) match = B_TRUE; } if (match) { - freemsg(so->so_eaddr_mp); - so->so_eaddr_mp = NULL; + freemsg(sti->sti_eaddr_mp); + sti->sti_eaddr_mp = NULL; mutex_exit(&so->so_lock); #ifdef DEBUG dprintso(so, 0, @@ -4303,8 +4423,8 @@ sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) #endif /* DEBUG */ return (error); } - freemsg(so->so_eaddr_mp); - so->so_eaddr_mp = NULL; + freemsg(sti->sti_eaddr_mp); + 
sti->sti_eaddr_mp = NULL; } } mutex_exit(&so->so_lock); @@ -4316,7 +4436,7 @@ sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) val = 1; error = sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE, - &val, (t_uscalar_t)sizeof (val)); + &val, (t_uscalar_t)sizeof (val), cr); if (error) return (error); dontroute = 1; @@ -4328,6 +4448,7 @@ sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) } if (msg->msg_controllen != 0) { if (!(so_mode & SM_CONNREQUIRED)) { + so_update_attrs(so, SOMOD); error = sosend_dgramcmsg(so, name, namelen, uiop, msg->msg_control, msg->msg_controllen, flags); } else { @@ -4336,6 +4457,7 @@ sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) error = EOPNOTSUPP; goto done; } + so_update_attrs(so, SOMOD); error = sosend_svccmsg(so, uiop, !(flags & MSG_EOR), msg->msg_control, msg->msg_controllen, @@ -4344,6 +4466,7 @@ sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) goto done; } + so_update_attrs(so, SOMOD); if (!(so_mode & SM_CONNREQUIRED)) { /* * If there is no SO_DONTROUTE to turn off return immediately @@ -4368,20 +4491,25 @@ sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) } else { if (so_mode & SM_BYTESTREAM) { /* Byte stream transport - use write */ - dprintso(so, 1, ("sotpi_sendmsg: write\n")); + + /* Send M_DATA messages */ + if ((sti->sti_nl7c_flags & NL7C_ENABLED) && + (error = nl7c_data(so, uiop)) >= 0) { + /* NL7C consumed the data */ + return (error); + } /* * If there is no SO_DONTROUTE to turn off, - * SS_DIRECT is on, and there is no flow + * sti_direct is on, and there is no flow * control, we can take the fast path. 
*/ - if (!dontroute && - (so_state & SS_DIRECT) && + if (!dontroute && sti->sti_direct != 0 && canputnext(SOTOV(so)->v_stream->sd_wrq)) { return (sostream_direct(so, uiop, - NULL, CRED())); + NULL, cr)); } - error = strwrite(SOTOV(so), uiop, CRED()); + error = strwrite(SOTOV(so), uiop, cr); goto done; } prim = T_DATA_REQ; @@ -4404,12 +4532,129 @@ done: val = 0; (void) sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE, - &val, (t_uscalar_t)sizeof (val)); + &val, (t_uscalar_t)sizeof (val), cr); } return (error); } /* + * kstrwritemp() has very similar semantics as that of strwrite(). + * The main difference is it obtains mblks from the caller and also + * does not do any copy as done in strwrite() from user buffers to + * kernel buffers. + * + * Currently, this routine is used by sendfile to send data allocated + * within the kernel without any copying. This interface does not use the + * synchronous stream interface as synch. stream interface implies + * copying. + */ +int +kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode) +{ + struct stdata *stp; + struct queue *wqp; + mblk_t *newmp; + char waitflag; + int tempmode; + int error = 0; + int done = 0; + struct sonode *so; + boolean_t direct; + + ASSERT(vp->v_stream); + stp = vp->v_stream; + + so = VTOSO(vp); + direct = _SOTOTPI(so)->sti_direct; + + /* + * This is the sockfs direct fast path. canputnext() need + * not be accurate so we don't grab the sd_lock here. If + * we get flow-controlled, we grab sd_lock just before the + * do..while loop below to emulate what strwrite() does. 
+ */ + wqp = stp->sd_wrq; + if (canputnext(wqp) && direct && + !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) { + return (sostream_direct(so, NULL, mp, CRED())); + } else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) { + /* Fast check of flags before acquiring the lock */ + mutex_enter(&stp->sd_lock); + error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0); + mutex_exit(&stp->sd_lock); + if (error != 0) { + if (!(stp->sd_flag & STPLEX) && + (stp->sd_wput_opt & SW_SIGPIPE)) { + error = EPIPE; + } + return (error); + } + } + + waitflag = WRITEWAIT; + if (stp->sd_flag & OLDNDELAY) + tempmode = fmode & ~FNDELAY; + else + tempmode = fmode; + + mutex_enter(&stp->sd_lock); + do { + if (canputnext(wqp)) { + mutex_exit(&stp->sd_lock); + if (stp->sd_wputdatafunc != NULL) { + newmp = (stp->sd_wputdatafunc)(vp, mp, NULL, + NULL, NULL, NULL); + if (newmp == NULL) { + /* The caller will free mp */ + return (ECOMM); + } + mp = newmp; + } + putnext(wqp, mp); + return (0); + } + error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1, + &done); + } while (error == 0 && !done); + + mutex_exit(&stp->sd_lock); + /* + * EAGAIN tells the application to try again. ENOMEM + * is returned only if the memory allocation size + * exceeds the physical limits of the system. ENOMEM + * can't be true here. + */ + if (error == ENOMEM) + error = EAGAIN; + return (error); +} + +/* ARGSUSED */ +static int +sotpi_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag, + struct cred *cr, mblk_t **mpp) +{ + int error; + + if (so->so_family != AF_INET && so->so_family != AF_INET6) + return (EAFNOSUPPORT); + + if (so->so_state & SS_CANTSENDMORE) + return (EPIPE); + + if (so->so_type != SOCK_STREAM) + return (EOPNOTSUPP); + + if ((so->so_state & SS_ISCONNECTED) == 0) + return (ENOTCONN); + + error = kstrwritemp(so->so_vnode, *mpp, fflag); + if (error == 0) + *mpp = NULL; + return (error); +} + +/* * Sending data on a datagram socket. * Assumes caller has verified that SS_ISBOUND etc. are set. 
*/ @@ -4429,6 +4674,7 @@ sodgram_direct(struct sonode *so, struct sockaddr *name, queue_t *udp_wq; boolean_t connected; mblk_t *mpdata = NULL; + sotpi_info_t *sti = SOTOTPI(so); ASSERT(name != NULL && namelen != 0); ASSERT(!(so->so_mode & SM_CONNREQUIRED)); @@ -4438,7 +4684,7 @@ sodgram_direct(struct sonode *so, struct sockaddr *name, /* Caller checked for proper length */ len = uiop->uio_resid; - ASSERT(len <= so->so_tidu_size); + ASSERT(len <= sti->sti_tidu_size); /* Length and family checks have been done by caller */ ASSERT(name->sa_family == so->so_family); @@ -4640,22 +4886,34 @@ slow_send: } /* - * Update so_faddr by asking the transport (unless AF_UNIX). + * Update sti_faddr by asking the transport (unless AF_UNIX). */ +/* ARGSUSED */ int -sotpi_getpeername(struct sonode *so) +sotpi_getpeername(struct sonode *so, struct sockaddr *name, socklen_t *namelen, + boolean_t accept, struct cred *cr) { struct strbuf strbuf; int error = 0, res; void *addr; t_uscalar_t addrlen; k_sigset_t smask; + sotpi_info_t *sti = SOTOTPI(so); dprintso(so, 1, ("sotpi_getpeername(%p) %s\n", (void *)so, pr_state(so->so_state, so->so_mode))); + ASSERT(*namelen > 0); mutex_enter(&so->so_lock); so_lock_single(so); /* Set SOLOCKED */ + + if (accept) { + bcopy(sti->sti_faddr_sa, name, + MIN(*namelen, sti->sti_faddr_len)); + *namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len; + goto done; + } + if (!(so->so_state & SS_ISCONNECTED)) { error = ENOTCONN; goto done; @@ -4668,27 +4926,39 @@ sotpi_getpeername(struct sonode *so) } goto done; } + + if (sti->sti_faddr_valid) { + bcopy(sti->sti_faddr_sa, name, + MIN(*namelen, sti->sti_faddr_len)); + *namelen = sti->sti_faddr_noxlate ? 
0: sti->sti_faddr_len; + goto done; + } + #ifdef DEBUG dprintso(so, 1, ("sotpi_getpeername (local): %s\n", - pr_addr(so->so_family, so->so_faddr_sa, - (t_uscalar_t)so->so_faddr_len))); + pr_addr(so->so_family, sti->sti_faddr_sa, + (t_uscalar_t)sti->sti_faddr_len))); #endif /* DEBUG */ if (so->so_family == AF_UNIX) { /* Transport has different name space - return local info */ + if (sti->sti_faddr_noxlate) + *namelen = 0; error = 0; goto done; } - ASSERT(so->so_faddr_sa); + ASSERT(so->so_family != AF_UNIX && sti->sti_faddr_noxlate == 0); + + ASSERT(sti->sti_faddr_sa); /* Allocate local buffer to use with ioctl */ - addrlen = (t_uscalar_t)so->so_faddr_maxlen; + addrlen = (t_uscalar_t)sti->sti_faddr_maxlen; mutex_exit(&so->so_lock); addr = kmem_alloc(addrlen, KM_SLEEP); /* * Issue TI_GETPEERNAME with signals masked. - * Put the result in so_faddr_sa so that getpeername works after + * Put the result in sti_faddr_sa so that getpeername works after * a shutdown(output). * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted * back to the socket. @@ -4699,16 +4969,16 @@ sotpi_getpeername(struct sonode *so) sigintr(&smask, 0); res = 0; - ASSERT(CRED()); + ASSERT(cr); error = strioctl(SOTOV(so), TI_GETPEERNAME, (intptr_t)&strbuf, - 0, K_TO_K, CRED(), &res); + 0, K_TO_K, cr, &res); sigunintr(&smask); mutex_enter(&so->so_lock); /* * If there is an error record the error in so_error put don't fail * the getpeername. Instead fallback on the recorded - * so->so_faddr_sa. + * sti->sti_faddr_sa. 
*/ if (error) { /* @@ -4732,16 +5002,19 @@ sotpi_getpeername(struct sonode *so) error = 0; } else if (res == 0 && strbuf.len > 0 && (so->so_state & SS_ISCONNECTED)) { - ASSERT(strbuf.len <= (int)so->so_faddr_maxlen); - so->so_faddr_len = (socklen_t)strbuf.len; - bcopy(addr, so->so_faddr_sa, so->so_faddr_len); - so->so_state |= SS_FADDR_VALID; + ASSERT(strbuf.len <= (int)sti->sti_faddr_maxlen); + sti->sti_faddr_len = (socklen_t)strbuf.len; + bcopy(addr, sti->sti_faddr_sa, sti->sti_faddr_len); + sti->sti_faddr_valid = 1; + + bcopy(addr, name, MIN(*namelen, sti->sti_faddr_len)); + *namelen = sti->sti_faddr_len; } kmem_free(addr, addrlen); #ifdef DEBUG dprintso(so, 1, ("sotpi_getpeername (tp): %s\n", - pr_addr(so->so_family, so->so_faddr_sa, - (t_uscalar_t)so->so_faddr_len))); + pr_addr(so->so_family, sti->sti_faddr_sa, + (t_uscalar_t)sti->sti_faddr_len))); #endif /* DEBUG */ done: so_unlock_single(so, SOLOCKED); @@ -4750,42 +5023,39 @@ done: } /* - * Update so_laddr by asking the transport (unless AF_UNIX). + * Update sti_laddr by asking the transport (unless AF_UNIX). 
*/ int -sotpi_getsockname(struct sonode *so) +sotpi_getsockname(struct sonode *so, struct sockaddr *name, socklen_t *namelen, + struct cred *cr) { struct strbuf strbuf; int error = 0, res; void *addr; t_uscalar_t addrlen; k_sigset_t smask; + sotpi_info_t *sti = SOTOTPI(so); dprintso(so, 1, ("sotpi_getsockname(%p) %s\n", (void *)so, pr_state(so->so_state, so->so_mode))); + ASSERT(*namelen > 0); mutex_enter(&so->so_lock); so_lock_single(so); /* Set SOLOCKED */ - if (!(so->so_state & SS_ISBOUND) && so->so_family != AF_UNIX) { - /* Return an all zero address except for the family */ - if (so->so_family == AF_INET) - so->so_laddr_len = (socklen_t)sizeof (sin_t); - else if (so->so_family == AF_INET6) - so->so_laddr_len = (socklen_t)sizeof (sin6_t); - ASSERT(so->so_laddr_len <= so->so_laddr_maxlen); - bzero(so->so_laddr_sa, so->so_laddr_len); - /* - * Can not assume there is a sa_family for all - * protocol families. - */ - if (so->so_family == AF_INET || so->so_family == AF_INET6) - so->so_laddr_sa->sa_family = so->so_family; - } + #ifdef DEBUG + dprintso(so, 1, ("sotpi_getsockname (local): %s\n", - pr_addr(so->so_family, so->so_laddr_sa, - (t_uscalar_t)so->so_laddr_len))); + pr_addr(so->so_family, sti->sti_laddr_sa, + (t_uscalar_t)sti->sti_laddr_len))); #endif /* DEBUG */ + if (sti->sti_laddr_valid) { + bcopy(sti->sti_laddr_sa, name, + MIN(*namelen, sti->sti_laddr_len)); + *namelen = sti->sti_laddr_len; + goto done; + } + if (so->so_family == AF_UNIX) { /* Transport has different name space - return local info */ error = 0; @@ -4796,14 +5066,15 @@ sotpi_getsockname(struct sonode *so) error = 0; goto done; } + /* Allocate local buffer to use with ioctl */ - addrlen = (t_uscalar_t)so->so_laddr_maxlen; + addrlen = (t_uscalar_t)sti->sti_laddr_maxlen; mutex_exit(&so->so_lock); addr = kmem_alloc(addrlen, KM_SLEEP); /* * Issue TI_GETMYNAME with signals masked. 
- * Put the result in so_laddr_sa so that getsockname works after + * Put the result in sti_laddr_sa so that getsockname works after * a shutdown(output). * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted * back to the socket. @@ -4814,16 +5085,16 @@ sotpi_getsockname(struct sonode *so) sigintr(&smask, 0); res = 0; - ASSERT(CRED()); + ASSERT(cr); error = strioctl(SOTOV(so), TI_GETMYNAME, (intptr_t)&strbuf, - 0, K_TO_K, CRED(), &res); + 0, K_TO_K, cr, &res); sigunintr(&smask); mutex_enter(&so->so_lock); /* * If there is an error record the error in so_error put don't fail * the getsockname. Instead fallback on the recorded - * so->so_laddr_sa. + * sti->sti_laddr_sa. */ if (error) { /* @@ -4844,16 +5115,19 @@ sotpi_getsockname(struct sonode *so) error = 0; } else if (res == 0 && strbuf.len > 0 && (so->so_state & SS_ISBOUND)) { - ASSERT(strbuf.len <= (int)so->so_laddr_maxlen); - so->so_laddr_len = (socklen_t)strbuf.len; - bcopy(addr, so->so_laddr_sa, so->so_laddr_len); - so->so_state |= SS_LADDR_VALID; + ASSERT(strbuf.len <= (int)sti->sti_laddr_maxlen); + sti->sti_laddr_len = (socklen_t)strbuf.len; + bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len); + sti->sti_laddr_valid = 1; + + bcopy(addr, name, MIN(sti->sti_laddr_len, *namelen)); + *namelen = sti->sti_laddr_len; } kmem_free(addr, addrlen); #ifdef DEBUG dprintso(so, 1, ("sotpi_getsockname (tp): %s\n", - pr_addr(so->so_family, so->so_laddr_sa, - (t_uscalar_t)so->so_laddr_len))); + pr_addr(so->so_family, sti->sti_laddr_sa, + (t_uscalar_t)sti->sti_laddr_len))); #endif /* DEBUG */ done: so_unlock_single(so, SOLOCKED); @@ -4868,9 +5142,10 @@ done: * * On the return most *optlenp bytes are copied to optval. 
*/ +/* ARGSUSED */ int sotpi_getsockopt(struct sonode *so, int level, int option_name, - void *optval, socklen_t *optlenp, int flags) + void *optval, socklen_t *optlenp, int flags, struct cred *cr) { struct T_optmgmt_req optmgmt_req; struct T_optmgmt_ack *optmgmt_ack; @@ -4882,6 +5157,8 @@ sotpi_getsockopt(struct sonode *so, int level, int option_name, t_uscalar_t maxlen = *optlenp; t_uscalar_t len; uint32_t value; + struct timeval tmo_val; /* used for SO_RCVTIMEO, SO_SNDTIMEO */ + struct so_snd_bufinfo snd_bufinfo; /* used for zero copy */ dprintso(so, 1, ("sotpi_getsockopt(%p, 0x%x, 0x%x, %p, %p) %s\n", (void *)so, level, option_name, optval, (void *)optlenp, @@ -4914,8 +5191,6 @@ sotpi_getsockopt(struct sonode *so, int level, int option_name, #ifdef notyet case SO_SNDLOWAT: case SO_RCVLOWAT: - case SO_SNDTIMEO: - case SO_RCVTIMEO: #endif /* notyet */ case SO_DOMAIN: case SO_DGRAM_ERRIND: @@ -4925,6 +5200,14 @@ sotpi_getsockopt(struct sonode *so, int level, int option_name, goto done2; } break; + case SO_RCVTIMEO: + case SO_SNDTIMEO: + if (maxlen < (t_uscalar_t)sizeof (struct timeval)) { + error = EINVAL; + eprintsoline(so, error); + goto done2; + } + break; case SO_LINGER: if (maxlen < (t_uscalar_t)sizeof (struct linger)) { error = EINVAL; @@ -4932,6 +5215,14 @@ sotpi_getsockopt(struct sonode *so, int level, int option_name, goto done2; } break; + case SO_SND_BUFINFO: + if (maxlen < (t_uscalar_t) + sizeof (struct so_snd_bufinfo)) { + error = EINVAL; + eprintsoline(so, error); + goto done2; + } + break; } len = (t_uscalar_t)sizeof (uint32_t); /* Default */ @@ -4943,7 +5234,7 @@ sotpi_getsockopt(struct sonode *so, int level, int option_name, goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */ case SO_ERROR: - value = sogeterr(so); + value = sogeterr(so, B_TRUE); option = &value; goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */ @@ -5072,15 +5363,33 @@ sotpi_getsockopt(struct sonode *so, int level, int option_name, value = so->so_rcvlowat; option = 
&value; break; +#endif /* notyet */ case SO_SNDTIMEO: - value = so->so_sndtimeo; - option = &value; + case SO_RCVTIMEO: { + clock_t val; + if (option_name == SO_RCVTIMEO) + val = drv_hztousec(so->so_rcvtimeo); + else + val = drv_hztousec(so->so_sndtimeo); + tmo_val.tv_sec = val / (1000 * 1000); + tmo_val.tv_usec = val % (1000 * 1000); + option = &tmo_val; + len = (t_uscalar_t)sizeof (struct timeval); break; - case SO_RCVTIMEO: - value = so->so_rcvtimeo; - option = &value; + } + case SO_SND_BUFINFO: { + snd_bufinfo.sbi_wroff = + (so->so_proto_props).sopp_wroff; + snd_bufinfo.sbi_maxblk = + (so->so_proto_props).sopp_maxblk; + snd_bufinfo.sbi_maxpsz = + (so->so_proto_props).sopp_maxpsz; + snd_bufinfo.sbi_tail = + (so->so_proto_props).sopp_tail; + option = &snd_bufinfo; + len = (t_uscalar_t)sizeof (struct so_snd_bufinfo); break; -#endif /* notyet */ + } } } @@ -5159,6 +5468,7 @@ done: done2: so_unlock_single(so, SOLOCKED); mutex_exit(&so->so_lock); + return (error); } @@ -5168,9 +5478,10 @@ done2: * SOL_SOCKET options will not fail just because the T_SVR4_OPTMGMT_REQ fails - * setsockopt has to work even if the transport does not support the option. 
*/ +/* ARGSUSED */ int sotpi_setsockopt(struct sonode *so, int level, int option_name, - const void *optval, t_uscalar_t optlen) + const void *optval, t_uscalar_t optlen, struct cred *cr) { struct T_optmgmt_req optmgmt_req; struct opthdr oh; @@ -5182,7 +5493,6 @@ sotpi_setsockopt(struct sonode *so, int level, int option_name, (void *)so, level, option_name, optval, optlen, pr_state(so->so_state, so->so_mode))); - /* X/Open requires this check */ if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) { if (xnet_check_print) @@ -5190,12 +5500,6 @@ sotpi_setsockopt(struct sonode *so, int level, int option_name, return (EINVAL); } - /* Caller allocates aligned optval, or passes null */ - ASSERT(((uintptr_t)optval & (sizeof (t_scalar_t) - 1)) == 0); - /* If optval is null optlen is 0, and vice-versa */ - ASSERT(optval != NULL || optlen == 0); - ASSERT(optlen != 0 || optval == NULL); - mutex_enter(&so->so_lock); so_lock_single(so); /* Set SOLOCKED */ mutex_exit(&so->so_lock); @@ -5207,8 +5511,9 @@ sotpi_setsockopt(struct sonode *so, int level, int option_name, */ if ((level == SOL_SOCKET || level == IPPROTO_TCP) && (so->so_family == AF_INET || so->so_family == AF_INET6) && - (so->so_version == SOV_SOCKSTREAM) && (so->so_priv != NULL)) { - tcp_t *tcp = so->so_priv; + (so->so_version == SOV_SOCKSTREAM) && + (so->so_proto_handle != NULL)) { + tcp_t *tcp = (tcp_t *)so->so_proto_handle; boolean_t onoff; #define intvalue (*(int32_t *)optval) @@ -5233,6 +5538,18 @@ sotpi_setsockopt(struct sonode *so, int level, int option_name, onoff = intvalue != 0; handled = B_TRUE; break; + case SO_SNDTIMEO: + case SO_RCVTIMEO: + if (optlen != + (t_uscalar_t)sizeof (struct timeval)) { + error = EINVAL; + eprintsoline(so, error); + mutex_enter(&so->so_lock); + goto done2; + } + ASSERT(optval); + handled = B_TRUE; + break; case SO_LINGER: if (optlen != (t_uscalar_t)sizeof (struct linger)) { @@ -5373,7 +5690,7 @@ sotpi_setsockopt(struct sonode *so, int level, int option_name, 
mutex_enter(&so->so_lock); if (error) { eprintsoline(so, error); - goto done; + goto done2; } error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK, (t_uscalar_t)sizeof (struct T_optmgmt_ack), &mp, 0); @@ -5406,8 +5723,6 @@ done: #ifdef notyet case SO_SNDLOWAT: case SO_RCVLOWAT: - case SO_SNDTIMEO: - case SO_RCVTIMEO: #endif /* notyet */ case SO_DGRAM_ERRIND: if (optlen != (t_uscalar_t)sizeof (int32_t)) { @@ -5418,6 +5733,16 @@ done: ASSERT(optval); handled = B_TRUE; break; + case SO_SNDTIMEO: + case SO_RCVTIMEO: + if (optlen != (t_uscalar_t)sizeof (struct timeval)) { + error = EINVAL; + eprintsoline(so, error); + goto done2; + } + ASSERT(optval); + handled = B_TRUE; + break; case SO_LINGER: if (optlen != (t_uscalar_t)sizeof (struct linger)) { error = EINVAL; @@ -5474,19 +5799,19 @@ done: case SO_DGRAM_ERRIND: if (intvalue != 0) { dprintso(so, 1, - ("sotpi_setsockopt: setting 0x%x\n", + ("socket_setsockopt: setting 0x%x\n", option_name)); so->so_options |= option_name; } else { dprintso(so, 1, - ("sotpi_setsockopt: clearing 0x%x\n", + ("socket_setsockopt: clearing 0x%x\n", option_name)); so->so_options &= ~option_name; } break; /* * The following options are only returned by us when the - * T_SVR4_OPTMGMT_REQ fails. + * transport layer fails. * XXX XPG 4.2 applications retrieve SO_RCVBUF from sockfs * since the transport might adjust the value and not * return exactly what was set by the application. 
@@ -5497,6 +5822,9 @@ done: case SO_RCVBUF: so->so_rcvbuf = intvalue; break; + case SO_RCVPSH: + so->so_rcv_timer_interval = intvalue; + break; #ifdef notyet /* * We do not implement the semantics of these options @@ -5508,13 +5836,17 @@ done: case SO_RCVLOWAT: so->so_rcvlowat = intvalue; break; +#endif /* notyet */ case SO_SNDTIMEO: - so->so_sndtimeo = intvalue; - break; - case SO_RCVTIMEO: - so->so_rcvtimeo = intvalue; + case SO_RCVTIMEO: { + struct timeval *tl = (struct timeval *)optval; + clock_t val = tl->tv_sec * 1000 * 1000 + tl->tv_usec; + if (option_name == SO_RCVTIMEO) + so->so_rcvtimeo = drv_usectohz(val); + else + so->so_sndtimeo = drv_usectohz(val); break; -#endif /* notyet */ + } } #undef intvalue @@ -5529,8 +5861,1121 @@ done: } } done2: -ret: so_unlock_single(so, SOLOCKED); mutex_exit(&so->so_lock); return (error); } + +/* ARGSUSED */ +int +sotpi_close(struct sonode *so, int flag, struct cred *cr) +{ + struct vnode *vp = SOTOV(so); + dev_t dev; + int error = 0; + sotpi_info_t *sti = SOTOTPI(so); + + dprintso(so, 1, ("sotpi_close(%p, %x) %s\n", + (void *)vp, flag, pr_state(so->so_state, so->so_mode))); + + dev = sti->sti_dev; + + ASSERT(STREAMSTAB(getmajor(dev))); + + mutex_enter(&so->so_lock); + so_lock_single(so); /* Set SOLOCKED */ + + /* + * Only call NL7C's close on last open reference. + */ + if (sti->sti_nl7c_flags & NL7C_ENABLED) { + sti->sti_nl7c_flags = 0; + nl7c_close(so); + } + + /* + * Only call the close routine when the last open reference through + * any [s, v]node goes away. + */ + if (vp->v_stream != NULL) { + vnode_t *ux_vp; + + if (so->so_family == AF_UNIX) { + /* Could avoid this when CANTSENDMORE for !dgram */ + so_unix_close(so); + } + + mutex_exit(&so->so_lock); + /* + * Disassemble the linkage from the AF_UNIX underlying file + * system vnode to this socket (by atomically clearing + * v_stream in vn_rele_stream) before strclose clears sd_vnode + * and frees the stream head. 
+ */ + if ((ux_vp = sti->sti_ux_bound_vp) != NULL) { + ASSERT(ux_vp->v_stream); + sti->sti_ux_bound_vp = NULL; + vn_rele_stream(ux_vp); + } + if (so->so_family == AF_INET || so->so_family == AF_INET6) { + strsetrwputdatahooks(SOTOV(so), NULL, NULL); + if (sti->sti_kssl_ent != NULL) { + kssl_release_ent(sti->sti_kssl_ent, so, + sti->sti_kssl_type); + sti->sti_kssl_ent = NULL; + } + if (sti->sti_kssl_ctx != NULL) { + kssl_release_ctx(sti->sti_kssl_ctx); + sti->sti_kssl_ctx = NULL; + } + sti->sti_kssl_type = KSSL_NO_PROXY; + } + error = strclose(vp, flag, cr); + vp->v_stream = NULL; + mutex_enter(&so->so_lock); + } + + /* + * Flush the T_DISCON_IND on sti_discon_ind_mp. + */ + so_flush_discon_ind(so); + + so_unlock_single(so, SOLOCKED); + mutex_exit(&so->so_lock); + + /* + * Needed for STREAMs. + * Decrement the device driver's reference count for streams + * opened via the clone dip. The driver was held in clone_open(). + * The absence of clone_close() forces this asymmetry. + */ + if (so->so_flag & SOCLONE) + ddi_rele_driver(getmajor(dev)); + + return (error); +} + +static int +sotpi_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode, + struct cred *cr, int32_t *rvalp) +{ + struct vnode *vp = SOTOV(so); + sotpi_info_t *sti = SOTOTPI(so); + int error = 0; + + dprintso(so, 0, ("sotpi_ioctl: cmd 0x%x, arg 0x%lx, state %s\n", + cmd, arg, pr_state(so->so_state, so->so_mode))); + + switch (cmd) { + case _I_INSERT: + case _I_REMOVE: + /* + * Since there's no compelling reason to support these ioctls + * on sockets, and doing so would increase the complexity + * markedly, prevent it. + */ + return (EOPNOTSUPP); + + case I_FIND: + case I_LIST: + case I_LOOK: + case I_POP: + case I_PUSH: + /* + * To prevent races and inconsistencies between the actual + * state of the stream and the state according to the sonode, + * we serialize all operations which modify or operate on the + * list of modules on the socket's stream. 
+ */ + mutex_enter(&sti->sti_plumb_lock); + error = socktpi_plumbioctl(vp, cmd, arg, mode, cr, rvalp); + mutex_exit(&sti->sti_plumb_lock); + return (error); + + default: + if (so->so_version != SOV_STREAM) + break; + + /* + * The imaginary "sockmod" has been popped; act as a stream. + */ + return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp)); + } + + ASSERT(so->so_version != SOV_STREAM); + + /* + * Process socket-specific ioctls. + */ + switch (cmd) { + case FIONBIO: { + int32_t value; + + if (so_copyin((void *)arg, &value, sizeof (int32_t), + (mode & (int)FKIOCTL))) + return (EFAULT); + + mutex_enter(&so->so_lock); + if (value) { + so->so_state |= SS_NDELAY; + } else { + so->so_state &= ~SS_NDELAY; + } + mutex_exit(&so->so_lock); + return (0); + } + + case FIOASYNC: { + int32_t value; + + if (so_copyin((void *)arg, &value, sizeof (int32_t), + (mode & (int)FKIOCTL))) + return (EFAULT); + + mutex_enter(&so->so_lock); + /* + * SS_ASYNC flag not already set correctly? + * (!value != !(so->so_state & SS_ASYNC)) + * but some engineers find that too hard to read. + */ + if (value == 0 && (so->so_state & SS_ASYNC) != 0 || + value != 0 && (so->so_state & SS_ASYNC) == 0) + error = so_flip_async(so, vp, mode, cr); + mutex_exit(&so->so_lock); + return (error); + } + + case SIOCSPGRP: + case FIOSETOWN: { + pid_t pgrp; + + if (so_copyin((void *)arg, &pgrp, sizeof (pid_t), + (mode & (int)FKIOCTL))) + return (EFAULT); + + mutex_enter(&so->so_lock); + dprintso(so, 1, ("setown: new %d old %d\n", pgrp, so->so_pgrp)); + /* Any change? */ + if (pgrp != so->so_pgrp) + error = so_set_siggrp(so, vp, pgrp, mode, cr); + mutex_exit(&so->so_lock); + return (error); + } + case SIOCGPGRP: + case FIOGETOWN: + if (so_copyout(&so->so_pgrp, (void *)arg, + sizeof (pid_t), (mode & (int)FKIOCTL))) + return (EFAULT); + return (0); + + case SIOCATMARK: { + int retval; + uint_t so_state; + + /* + * strwaitmark has a finite timeout after which it + * returns -1 if the mark state is undetermined. 
+ * In order to avoid any race between the mark state + * in sockfs and the mark state in the stream head this + * routine loops until the mark state can be determined + * (or the urgent data indication has been removed by some + * other thread). + */ + do { + mutex_enter(&so->so_lock); + so_state = so->so_state; + mutex_exit(&so->so_lock); + if (so_state & SS_RCVATMARK) { + retval = 1; + } else if (!(so_state & SS_OOBPEND)) { + /* + * No SIGURG has been generated -- there is no + * pending or present urgent data. Thus can't + * possibly be at the mark. + */ + retval = 0; + } else { + /* + * Have the stream head wait until there is + * either some messages on the read queue, or + * STRATMARK or STRNOTATMARK gets set. The + * STRNOTATMARK flag is used so that the + * transport can send up a MSGNOTMARKNEXT + * M_DATA to indicate that it is not + * at the mark and additional data is not about + * to be send upstream. + * + * If the mark state is undetermined this will + * return -1 and we will loop rechecking the + * socket state. + */ + retval = strwaitmark(vp); + } + } while (retval == -1); + + if (so_copyout(&retval, (void *)arg, sizeof (int), + (mode & (int)FKIOCTL))) + return (EFAULT); + return (0); + } + + case I_FDINSERT: + case I_SENDFD: + case I_RECVFD: + case I_ATMARK: + case _SIOCSOCKFALLBACK: + /* + * These ioctls do not apply to sockets. I_FDINSERT can be + * used to send M_PROTO messages without modifying the socket + * state. I_SENDFD/RECVFD should not be used for socket file + * descriptor passing since they assume a twisted stream. + * SIOCATMARK must be used instead of I_ATMARK. + * + * _SIOCSOCKFALLBACK from an application should never be + * processed. It is only generated by socktpi_open() or + * in response to I_POP or I_PUSH. + */ +#ifdef DEBUG + zcmn_err(getzoneid(), CE_WARN, + "Unsupported STREAMS ioctl 0x%x on socket. 
" + "Pid = %d\n", cmd, curproc->p_pid); +#endif /* DEBUG */ + return (EOPNOTSUPP); + + case _I_GETPEERCRED: + if ((mode & FKIOCTL) == 0) + return (EINVAL); + + mutex_enter(&so->so_lock); + if ((so->so_mode & SM_CONNREQUIRED) == 0) { + error = ENOTSUP; + } else if ((so->so_state & SS_ISCONNECTED) == 0) { + error = ENOTCONN; + } else if (so->so_peercred != NULL) { + k_peercred_t *kp = (k_peercred_t *)arg; + kp->pc_cr = so->so_peercred; + kp->pc_cpid = so->so_cpid; + crhold(so->so_peercred); + } else { + error = EINVAL; + } + mutex_exit(&so->so_lock); + return (error); + + default: + /* + * Do the higher-order bits of the ioctl cmd indicate + * that it is an I_* streams ioctl? + */ + if ((cmd & 0xffffff00U) == STR && + so->so_version == SOV_SOCKBSD) { +#ifdef DEBUG + zcmn_err(getzoneid(), CE_WARN, + "Unsupported STREAMS ioctl 0x%x on socket. " + "Pid = %d\n", cmd, curproc->p_pid); +#endif /* DEBUG */ + return (EOPNOTSUPP); + } + return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp)); + } +} + +/* + * Handle plumbing-related ioctls. + */ +static int +socktpi_plumbioctl(struct vnode *vp, int cmd, intptr_t arg, int mode, + struct cred *cr, int32_t *rvalp) +{ + static const char sockmod_name[] = "sockmod"; + struct sonode *so = VTOSO(vp); + char mname[FMNAMESZ + 1]; + int error; + sotpi_info_t *sti = SOTOTPI(so); + + ASSERT(MUTEX_HELD(&sti->sti_plumb_lock)); + + if (so->so_version == SOV_SOCKBSD) + return (EOPNOTSUPP); + + if (so->so_version == SOV_STREAM) { + /* + * The imaginary "sockmod" has been popped - act as a stream. + * If this is a push of sockmod then change back to a socket. + */ + if (cmd == I_PUSH) { + error = ((mode & FKIOCTL) ? 
copystr : copyinstr)( + (void *)arg, mname, sizeof (mname), NULL); + + if (error == 0 && strcmp(mname, sockmod_name) == 0) { + dprintso(so, 0, ("socktpi_ioctl: going to " + "socket version\n")); + so_stream2sock(so); + return (0); + } + } + return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp)); + } + + switch (cmd) { + case I_PUSH: + if (sti->sti_direct) { + mutex_enter(&so->so_lock); + so_lock_single(so); + mutex_exit(&so->so_lock); + + error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K, + CRED(), rvalp); + + mutex_enter(&so->so_lock); + if (error == 0) + sti->sti_direct = 0; + so_unlock_single(so, SOLOCKED); + mutex_exit(&so->so_lock); + + if (error != 0) + return (error); + } + + error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp); + if (error == 0) + sti->sti_pushcnt++; + return (error); + + case I_POP: + if (sti->sti_pushcnt == 0) { + /* Emulate sockmod being popped */ + dprintso(so, 0, + ("socktpi_ioctl: going to STREAMS version\n")); + return (so_sock2stream(so)); + } + + error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp); + if (error == 0) + sti->sti_pushcnt--; + return (error); + + case I_LIST: { + struct str_mlist *kmlistp, *umlistp; + struct str_list kstrlist; + ssize_t kstrlistsize; + int i, nmods; + + STRUCT_DECL(str_list, ustrlist); + STRUCT_INIT(ustrlist, mode); + + if (arg == NULL) { + error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp); + if (error == 0) + (*rvalp)++; /* Add one for sockmod */ + return (error); + } + + error = so_copyin((void *)arg, STRUCT_BUF(ustrlist), + STRUCT_SIZE(ustrlist), mode & FKIOCTL); + if (error != 0) + return (error); + + nmods = STRUCT_FGET(ustrlist, sl_nmods); + if (nmods <= 0) + return (EINVAL); + /* + * Ceiling nmods at nstrpush to prevent someone from + * maliciously consuming lots of kernel memory. 
+ */ + nmods = MIN(nmods, nstrpush); + + kstrlistsize = (nmods + 1) * sizeof (struct str_mlist); + kstrlist.sl_nmods = nmods; + kstrlist.sl_modlist = kmem_zalloc(kstrlistsize, KM_SLEEP); + + error = strioctl(vp, cmd, (intptr_t)&kstrlist, mode, K_TO_K, + cr, rvalp); + if (error != 0) + goto done; + + /* + * Considering the module list as a 0-based array of sl_nmods + * modules, sockmod should conceptually exist at slot + * sti_pushcnt. Insert sockmod at this location by sliding all + * of the module names after so_pushcnt over by one. We know + * that there will be room to do this since we allocated + * sl_modlist with an additional slot. + */ + for (i = kstrlist.sl_nmods; i > sti->sti_pushcnt; i--) + kstrlist.sl_modlist[i] = kstrlist.sl_modlist[i - 1]; + + (void) strcpy(kstrlist.sl_modlist[i].l_name, sockmod_name); + kstrlist.sl_nmods++; + + /* + * Copy all of the entries out to ustrlist. + */ + kmlistp = kstrlist.sl_modlist; + umlistp = STRUCT_FGETP(ustrlist, sl_modlist); + for (i = 0; i < nmods && i < kstrlist.sl_nmods; i++) { + error = so_copyout(kmlistp++, umlistp++, + sizeof (struct str_mlist), mode & FKIOCTL); + if (error != 0) + goto done; + } + + error = so_copyout(&i, (void *)arg, sizeof (int32_t), + mode & FKIOCTL); + if (error == 0) + *rvalp = 0; + done: + kmem_free(kstrlist.sl_modlist, kstrlistsize); + return (error); + } + case I_LOOK: + if (sti->sti_pushcnt == 0) { + return (so_copyout(sockmod_name, (void *)arg, + sizeof (sockmod_name), mode & FKIOCTL)); + } + return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp)); + + case I_FIND: + error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp); + if (error && error != EINVAL) + return (error); + + /* if not found and string was sockmod return 1 */ + if (*rvalp == 0 || error == EINVAL) { + error = ((mode & FKIOCTL) ? 
copystr : copyinstr)( + (void *)arg, mname, sizeof (mname), NULL); + if (error == ENAMETOOLONG) + error = EINVAL; + + if (error == 0 && strcmp(mname, sockmod_name) == 0) + *rvalp = 1; + } + return (error); + + default: + panic("socktpi_plumbioctl: unknown ioctl %d", cmd); + break; + } + + return (0); +} + +/* + * Wrapper around the streams poll routine that implements socket poll + * semantics. + * The sockfs never calls pollwakeup itself - the stream head take care + * of all pollwakeups. Since sockfs never holds so_lock when calling the + * stream head there can never be a deadlock due to holding so_lock across + * pollwakeup and acquiring so_lock in this routine. + * + * However, since the performance of VOP_POLL is critical we avoid + * acquiring so_lock here. This is based on two assumptions: + * - The poll implementation holds locks to serialize the VOP_POLL call + * and a pollwakeup for the same pollhead. This ensures that should + * e.g. so_state change during a socktpi_poll call the pollwakeup + * (which strsock_* and strrput conspire to issue) is issued after + * the state change. Thus the pollwakeup will block until VOP_POLL has + * returned and then wake up poll and have it call VOP_POLL again. + * - The reading of so_state without holding so_lock does not result in + * stale data that is older than the latest state change that has dropped + * so_lock. This is ensured by the mutex_exit issuing the appropriate + * memory barrier to force the data into the coherency domain. 
+ */ +static int +sotpi_poll( + struct sonode *so, + short events, + int anyyet, + short *reventsp, + struct pollhead **phpp) +{ + short origevents = events; + struct vnode *vp = SOTOV(so); + int error; + int so_state = so->so_state; /* snapshot */ + sotpi_info_t *sti = SOTOTPI(so); + + dprintso(so, 0, ("socktpi_poll(%p): state %s err %d\n", + (void *)vp, pr_state(so_state, so->so_mode), so->so_error)); + + ASSERT(vp->v_type == VSOCK); + ASSERT(vp->v_stream != NULL); + + if (so->so_version == SOV_STREAM) { + /* The imaginary "sockmod" has been popped - act as a stream */ + return (strpoll(vp->v_stream, events, anyyet, + reventsp, phpp)); + } + + if (!(so_state & SS_ISCONNECTED) && + (so->so_mode & SM_CONNREQUIRED)) { + /* Not connected yet - turn off write side events */ + events &= ~(POLLOUT|POLLWRBAND); + } + /* + * Check for errors without calling strpoll if the caller wants them. + * In sockets the errors are represented as input/output events + * and there is no need to ask the stream head for this information. + */ + if (so->so_error != 0 && + ((POLLIN|POLLRDNORM|POLLOUT) & origevents) != 0) { + *reventsp = (POLLIN|POLLRDNORM|POLLOUT) & origevents; + return (0); + } + /* + * Ignore M_PROTO only messages such as the T_EXDATA_IND messages. + * These message with only an M_PROTO/M_PCPROTO part and no M_DATA + * will not trigger a POLLIN event with POLLRDDATA set. + * The handling of urgent data (causing POLLRDBAND) is done by + * inspecting SS_OOBPEND below. + */ + events |= POLLRDDATA; + + /* + * After shutdown(output) a stream head write error is set. + * However, we should not return output events. + */ + events |= POLLNOERR; + error = strpoll(vp->v_stream, events, anyyet, + reventsp, phpp); + if (error) + return (error); + + ASSERT(!(*reventsp & POLLERR)); + + /* + * Notes on T_CONN_IND handling for sockets. + * + * If strpoll() returned without events, SR_POLLIN is guaranteed + * to be set, ensuring any subsequent strrput() runs pollwakeup(). 
+ * + * Since the so_lock is not held, soqueueconnind() may have run + * and a T_CONN_IND may be waiting. We now check for any queued + * T_CONN_IND msgs on sti_conn_ind_head and set appropriate events + * to ensure poll returns. + * + * However: + * If the T_CONN_IND hasn't arrived by the time strpoll() returns, + * when strrput() does run for an arriving M_PROTO with T_CONN_IND + * the following actions will occur; taken together they ensure the + * syscall will return. + * + * 1. If a socket, soqueueconnind() will queue the T_CONN_IND but if + * the accept() was run on a non-blocking socket sowaitconnind() + * may have already returned EWOULDBLOCK, so not be waiting to + * process the message. Additionally socktpi_poll() has probably + * proceeded past the sti_conn_ind_head check below. + * 2. strrput() runs pollwakeup()->pollnotify()->cv_signal() to wake + * this thread, however that could occur before poll_common() + * has entered cv_wait. + * 3. pollnotify() sets T_POLLWAKE, while holding the pc_lock. + * + * Before proceeding to cv_wait() in poll_common() for an event, + * poll_common() atomically checks for T_POLLWAKE under the pc_lock, + * and if set, re-calls strpoll() to ensure the late arriving + * T_CONN_IND is recognized, and pollsys() returns. 
+ */ + + if (sti->sti_conn_ind_head != NULL) + *reventsp |= (POLLIN|POLLRDNORM) & events; + + if (so->so_state & SS_OOBPEND) + *reventsp |= POLLRDBAND & events; + + if (sti->sti_nl7c_rcv_mp != NULL) { + *reventsp |= (POLLIN|POLLRDNORM) & events; + } + if ((sti->sti_nl7c_flags & NL7C_ENABLED) && + ((POLLIN|POLLRDNORM) & *reventsp)) { + sti->sti_nl7c_flags |= NL7C_POLLIN; + } + + return (0); +} + +/*ARGSUSED*/ +static int +socktpi_constructor(void *buf, void *cdrarg, int kmflags) +{ + sotpi_sonode_t *st = (sotpi_sonode_t *)buf; + int error = 0; + + error = sonode_constructor(buf, cdrarg, kmflags); + if (error != 0) + return (error); + + error = i_sotpi_info_constructor(&st->st_info); + if (error != 0) + sonode_destructor(buf, cdrarg); + + st->st_sonode.so_priv = &st->st_info; + + return (error); +} + +/*ARGSUSED1*/ +static void +socktpi_destructor(void *buf, void *cdrarg) +{ + sotpi_sonode_t *st = (sotpi_sonode_t *)buf; + + ASSERT(st->st_sonode.so_priv == &st->st_info); + st->st_sonode.so_priv = NULL; + + i_sotpi_info_destructor(&st->st_info); + sonode_destructor(buf, cdrarg); +} + +static int +socktpi_unix_constructor(void *buf, void *cdrarg, int kmflags) +{ + int retval; + + if ((retval = socktpi_constructor(buf, cdrarg, kmflags)) == 0) { + struct sonode *so = (struct sonode *)buf; + sotpi_info_t *sti = SOTOTPI(so); + + mutex_enter(&socklist.sl_lock); + + sti->sti_next_so = socklist.sl_list; + sti->sti_prev_so = NULL; + if (sti->sti_next_so != NULL) + SOTOTPI(sti->sti_next_so)->sti_prev_so = so; + socklist.sl_list = so; + + mutex_exit(&socklist.sl_lock); + + } + return (retval); +} + +static void +socktpi_unix_destructor(void *buf, void *cdrarg) +{ + struct sonode *so = (struct sonode *)buf; + sotpi_info_t *sti = SOTOTPI(so); + + mutex_enter(&socklist.sl_lock); + + if (sti->sti_next_so != NULL) + SOTOTPI(sti->sti_next_so)->sti_prev_so = sti->sti_prev_so; + if (sti->sti_prev_so != NULL) + SOTOTPI(sti->sti_prev_so)->sti_next_so = sti->sti_next_so; + else + 
socklist.sl_list = sti->sti_next_so; + + mutex_exit(&socklist.sl_lock); + + socktpi_destructor(buf, cdrarg); +} + +int +socktpi_init(void) +{ + /* + * Create sonode caches. We create a special one for AF_UNIX so + * that we can track them for netstat(1m). + */ + socktpi_cache = kmem_cache_create("socktpi_cache", + sizeof (struct sotpi_sonode), 0, socktpi_constructor, + socktpi_destructor, NULL, NULL, NULL, 0); + + socktpi_unix_cache = kmem_cache_create("socktpi_unix_cache", + sizeof (struct sotpi_sonode), 0, socktpi_unix_constructor, + socktpi_unix_destructor, NULL, NULL, NULL, 0); + + return (0); +} + +/* + * Given a non-TPI sonode, allocate and prep it to be ready for TPI. + * + * Caller must still update state and mode using sotpi_update_state(). + * + * Returns the STREAM queue that the protocol should use. + */ +queue_t * +sotpi_convert_sonode(struct sonode *so, struct sockparams *newsp, + boolean_t *direct, struct cred *cr) +{ + sotpi_info_t *sti; + struct sockparams *origsp = so->so_sockparams; + sock_lower_handle_t handle = so->so_proto_handle; + uint_t old_state = so->so_state; + struct stdata *stp; + struct vnode *vp; + queue_t *q; + + *direct = B_FALSE; + so->so_sockparams = newsp; + /* + * Allocate and initalize fields required by TPI. + */ + (void) sotpi_info_create(so, KM_SLEEP); + sotpi_info_init(so); + + if (sotpi_init(so, NULL, cr, SO_FALLBACK) != 0) { + sotpi_info_fini(so); + sotpi_info_destroy(so); + so->so_state = old_state; + return (NULL); + } + ASSERT(handle == so->so_proto_handle); + sti = SOTOTPI(so); + if (sti->sti_direct != 0) + *direct = B_TRUE; + + /* + * Keep the original sp around so we can properly dispose of the + * sonode when the socket is being closed. + */ + sti->sti_orig_sp = origsp; + + so_basic_strinit(so); /* skips the T_CAPABILITY_REQ */ + so_alloc_addr(so, so->so_max_addr_len); + + /* + * If the application has done a SIOCSPGRP, make sure the + * STREAM head is aware. 
This needs to take place before + * the protocol start sending up messages. Otherwise we + * might miss to generate SIGPOLL. + * + * It is possible that the application will receive duplicate + * signals if some were already generated for either data or + * connection indications. + */ + if (so->so_pgrp != 0) { + mutex_enter(&so->so_lock); + if (so_set_events(so, so->so_vnode, cr) != 0) + so->so_pgrp = 0; + mutex_exit(&so->so_lock); + } + + /* + * Determine which queue to use. + */ + vp = SOTOV(so); + stp = vp->v_stream; + ASSERT(stp != NULL); + q = stp->sd_wrq->q_next; + + /* + * Skip any modules that may have been auto pushed when the device + * was opened + */ + while (q->q_next != NULL) + q = q->q_next; + q = _RD(q); + + return (q); +} + +void +sotpi_update_state(struct sonode *so, struct T_capability_ack *tcap, + struct sockaddr *laddr, socklen_t laddrlen, struct sockaddr *faddr, + socklen_t faddrlen, short opts) +{ + sotpi_info_t *sti = SOTOTPI(so); + + so_proc_tcapability_ack(so, tcap); + + so->so_options |= opts; + + /* + * Determine whether the foreign and local address are valid + */ + if (laddrlen != 0) { + ASSERT(laddrlen <= sti->sti_laddr_maxlen); + sti->sti_laddr_len = laddrlen; + bcopy(laddr, sti->sti_laddr_sa, laddrlen); + sti->sti_laddr_valid = (so->so_state & SS_ISBOUND); + } + + if (faddrlen != 0) { + ASSERT(faddrlen <= sti->sti_faddr_maxlen); + sti->sti_faddr_len = faddrlen; + bcopy(faddr, sti->sti_faddr_sa, faddrlen); + sti->sti_faddr_valid = (so->so_state & SS_ISCONNECTED); + } + +} + +/* + * Allocate enough space to cache the local and foreign addresses. 
+ *
+ * maxlen is rounded up to KMEM_ALIGN and a single buffer of twice that
+ * size is allocated: the local address occupies the first half and the
+ * foreign address the second. so_max_addr_len is set to the rounded
+ * length. No address buffers may be allocated yet (asserted).
+ */
+void
+so_alloc_addr(struct sonode *so, t_uscalar_t maxlen)
+{
+	sotpi_info_t *sti = SOTOTPI(so);
+
+	ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
+	ASSERT(sti->sti_laddr_len == 0 && sti->sti_faddr_len == 0);
+	sti->sti_laddr_maxlen = sti->sti_faddr_maxlen =
+	    P2ROUNDUP(maxlen, KMEM_ALIGN);
+	so->so_max_addr_len = sti->sti_laddr_maxlen;
+	sti->sti_laddr_sa = kmem_alloc(sti->sti_laddr_maxlen * 2, KM_SLEEP);
+	sti->sti_faddr_sa = (struct sockaddr *)((caddr_t)sti->sti_laddr_sa
+	    + sti->sti_laddr_maxlen);
+
+	if (so->so_family == AF_UNIX) {
+		/*
+		 * Initialize AF_UNIX related fields.
+		 */
+		bzero(&sti->sti_ux_laddr, sizeof (sti->sti_ux_laddr));
+		bzero(&sti->sti_ux_faddr, sizeof (sti->sti_ux_faddr));
+	}
+}
+
+
+/*
+ * Return the sotpi_info_t hanging off so->so_priv, or NULL when so is
+ * NULL. Asserts that the pointer is set and carries SOTPI_INFO_MAGIC.
+ */
+sotpi_info_t *
+sotpi_sototpi(struct sonode *so)
+{
+	sotpi_info_t *sti;
+
+	if (so == NULL)
+		return (NULL);
+
+	sti = (sotpi_info_t *)so->so_priv;
+
+	ASSERT(sti != NULL);
+	ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
+
+	return (sti);
+}
+
+/*
+ * Put a sotpi_info_t into its constructed state: magic set, message and
+ * address pointers cleared, plumb lock and ack condvar initialized.
+ * Always returns 0.
+ */
+static int
+i_sotpi_info_constructor(sotpi_info_t *sti)
+{
+	sti->sti_magic = SOTPI_INFO_MAGIC;
+	sti->sti_ack_mp = NULL;
+	sti->sti_discon_ind_mp = NULL;
+	sti->sti_ux_bound_vp = NULL;
+	sti->sti_unbind_mp = NULL;
+
+	sti->sti_conn_ind_head = NULL;
+	sti->sti_conn_ind_tail = NULL;
+
+	sti->sti_laddr_sa = NULL;
+	sti->sti_faddr_sa = NULL;
+
+	sti->sti_nl7c_flags = 0;
+	sti->sti_nl7c_uri = NULL;
+	sti->sti_nl7c_rcv_mp = NULL;
+
+	mutex_init(&sti->sti_plumb_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&sti->sti_ack_cv, NULL, CV_DEFAULT, NULL);
+
+	return (0);
+}
+
+/*
+ * Undo i_sotpi_info_constructor(). The structure must already be back
+ * in its constructed state (everything released and cleared); that is
+ * asserted before the lock and condvar are destroyed.
+ */
+static void
+i_sotpi_info_destructor(sotpi_info_t *sti)
+{
+	ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
+	ASSERT(sti->sti_ack_mp == NULL);
+	ASSERT(sti->sti_discon_ind_mp == NULL);
+	ASSERT(sti->sti_ux_bound_vp == NULL);
+	ASSERT(sti->sti_unbind_mp == NULL);
+
+	ASSERT(sti->sti_conn_ind_head == NULL);
+	ASSERT(sti->sti_conn_ind_tail == NULL);
+
+	ASSERT(sti->sti_laddr_sa == NULL);
+	ASSERT(sti->sti_faddr_sa == NULL);
+
+	ASSERT(sti->sti_nl7c_flags == 0);
+	ASSERT(sti->sti_nl7c_uri == NULL);
+	ASSERT(sti->sti_nl7c_rcv_mp == NULL);
+
+	mutex_destroy(&sti->sti_plumb_lock);
+	cv_destroy(&sti->sti_ack_cv);
+}
+
+/*
+ * Creates and attaches TPI information to the given sonode
+ *
+ * The structure is zero-allocated with the caller's kmflags and then
+ * constructed. Returns B_TRUE with so->so_priv set on success, or
+ * B_FALSE (so_priv untouched) when the allocation or the constructor
+ * fails. so->so_priv must be NULL on entry (asserted).
+ */
+static boolean_t
+sotpi_info_create(struct sonode *so, int kmflags)
+{
+	sotpi_info_t *sti;
+
+	ASSERT(so->so_priv == NULL);
+
+	if ((sti = kmem_zalloc(sizeof (*sti), kmflags)) == NULL)
+		return (B_FALSE);
+
+	if (i_sotpi_info_constructor(sti) != 0) {
+		kmem_free(sti, sizeof (*sti));
+		return (B_FALSE);
+	}
+
+	so->so_priv = (void *)sti;
+	return (B_TRUE);
+}
+
+/*
+ * Initializes the TPI information.
+ *
+ * Resets the per-use fields of an already constructed sotpi_info_t:
+ * the device number is taken from the socket's sockparams vnode (and
+ * copied to the socket vnode's v_rdev), all three timestamps are set to
+ * the current time, and counters, address bookkeeping and the kernel
+ * SSL proxy fields are cleared. Fields owned by the constructor are
+ * only asserted to still be clear.
+ */
+static void
+sotpi_info_init(struct sonode *so)
+{
+	struct vnode *vp = SOTOV(so);
+	sotpi_info_t *sti = SOTOTPI(so);
+	time_t now;
+
+	sti->sti_dev = so->so_sockparams->sp_sdev_info.sd_vnode->v_rdev;
+	vp->v_rdev = sti->sti_dev;
+
+	sti->sti_orig_sp = NULL;
+
+	sti->sti_pushcnt = 0;
+
+	now = gethrestime_sec();
+	sti->sti_atime = now;
+	sti->sti_mtime = now;
+	sti->sti_ctime = now;
+
+	sti->sti_eaddr_mp = NULL;
+	sti->sti_delayed_error = 0;
+
+	sti->sti_provinfo = NULL;
+
+	sti->sti_oobcnt = 0;
+	sti->sti_oobsigcnt = 0;
+
+	ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
+
+	/* These are pointers: clear them with NULL, not integer 0. */
+	sti->sti_laddr_sa = NULL;
+	sti->sti_faddr_sa = NULL;
+	sti->sti_laddr_maxlen = sti->sti_faddr_maxlen = 0;
+	sti->sti_laddr_len = sti->sti_faddr_len = 0;
+
+	sti->sti_laddr_valid = 0;
+	sti->sti_faddr_valid = 0;
+	sti->sti_faddr_noxlate = 0;
+
+	sti->sti_direct = 0;
+
+	ASSERT(sti->sti_ack_mp == NULL);
+	ASSERT(sti->sti_ux_bound_vp == NULL);
+	ASSERT(sti->sti_unbind_mp == NULL);
+
+	ASSERT(sti->sti_conn_ind_head == NULL);
+	ASSERT(sti->sti_conn_ind_tail == NULL);
+
+	/* Initialize the kernel SSL proxy fields */
+	sti->sti_kssl_type = KSSL_NO_PROXY;
+	sti->sti_kssl_ent = NULL;
+	sti->sti_kssl_ctx = NULL;
+}
+
+/*
+ * Given a sonode, grab the TPI info and free any data.
+ *
+ * Releases everything the sotpi_info_t still owns (queued connection
+ * indications, the cached address buffer, pending error/ack/NL7C/unbind
+ * messages) and clears the matching fields. The structure itself is not
+ * freed here. No T_DISCON_IND may be pending and no AF_UNIX vnode may
+ * still be bound (both asserted).
+ */
+static void
+sotpi_info_fini(struct sonode *so)
+{
+	sotpi_info_t *sti = SOTOTPI(so);
+	mblk_t *msg;
+	mblk_t *next;
+
+	ASSERT(sti->sti_discon_ind_mp == NULL);
+
+	/* Unlink and free each queued connection indication. */
+	msg = sti->sti_conn_ind_head;
+	if (msg != NULL) {
+		do {
+			next = msg->b_next;
+			msg->b_next = NULL;
+			freemsg(msg);
+			msg = next;
+		} while (msg != NULL);
+		sti->sti_conn_ind_head = NULL;
+		sti->sti_conn_ind_tail = NULL;
+	}
+
+	/*
+	 * The cached addresses are torn down under so_lock so that
+	 * sockfs_snapshot() can safely indirect them. It also uses
+	 * so_count as a validity test.
+	 */
+	mutex_enter(&so->so_lock);
+	if (sti->sti_laddr_sa != NULL) {
+		/* Both addresses live in one allocation; see the asserts. */
+		ASSERT((caddr_t)sti->sti_faddr_sa ==
+		    (caddr_t)sti->sti_laddr_sa + sti->sti_laddr_maxlen);
+		ASSERT(sti->sti_faddr_maxlen == sti->sti_laddr_maxlen);
+
+		sti->sti_laddr_valid = 0;
+		sti->sti_faddr_valid = 0;
+		kmem_free(sti->sti_laddr_sa, sti->sti_laddr_maxlen * 2);
+
+		sti->sti_laddr_sa = NULL;
+		sti->sti_laddr_maxlen = 0;
+		sti->sti_laddr_len = 0;
+
+		sti->sti_faddr_sa = NULL;
+		sti->sti_faddr_maxlen = 0;
+		sti->sti_faddr_len = 0;
+	}
+	mutex_exit(&so->so_lock);
+
+	/* Pending delayed error and its address message. */
+	msg = sti->sti_eaddr_mp;
+	if (msg != NULL) {
+		freemsg(msg);
+		sti->sti_eaddr_mp = NULL;
+		sti->sti_delayed_error = 0;
+	}
+
+	/* TPI ack that was never consumed. */
+	msg = sti->sti_ack_mp;
+	if (msg != NULL) {
+		freemsg(msg);
+		sti->sti_ack_mp = NULL;
+	}
+
+	/* NL7C leftovers. */
+	msg = sti->sti_nl7c_rcv_mp;
+	if (msg != NULL) {
+		sti->sti_nl7c_rcv_mp = NULL;
+		freemsg(msg);
+	}
+	sti->sti_nl7c_rcv_rval = 0;
+	if (sti->sti_nl7c_uri != NULL) {
+		/* urifree() cleared nl7c_uri */
+		nl7c_urifree(so);
+	}
+	if (sti->sti_nl7c_flags != 0)
+		sti->sti_nl7c_flags = 0;
+
+	ASSERT(sti->sti_ux_bound_vp == NULL);
+
+	/* Preallocated unbind request. */
+	msg = sti->sti_unbind_mp;
+	if (msg != NULL) {
+		freemsg(msg);
+		sti->sti_unbind_mp = NULL;
+	}
+}
+
+/*
+ * Destroys the TPI information attached to a sonode.
+ *
+ * Runs the destructor, frees the sotpi_info_t and clears so->so_priv.
+ */
+static void
+sotpi_info_destroy(struct sonode *so)
+{
+	sotpi_info_t *info = SOTOTPI(so);
+
+	i_sotpi_info_destructor(info);
+	kmem_free(info, sizeof (*info));
+	so->so_priv = NULL;
+}
+
+/*
+ * Create the global sotpi socket module entry. It will never be free.
+ *
+ * Returns a zero-filled smod_info_t whose name is a private copy of
+ * SOTPI_SMOD_NAME, with the version fields and the create/destroy entry
+ * points filled in. Both allocations use KM_SLEEP, so the function
+ * relies on them not failing and has no error return.
+ */
+smod_info_t *
+sotpi_smod_create(void)
+{
+	smod_info_t *smodp;
+
+	smodp = kmem_zalloc(sizeof (*smodp), KM_SLEEP);
+	/*
+	 * strcpy() also writes the terminating NUL, so the buffer needs
+	 * strlen() + 1 bytes. Use the named KM_SLEEP flag, as for the
+	 * allocation above, because the result is used unchecked.
+	 */
+	smodp->smod_name = kmem_zalloc(strlen(SOTPI_SMOD_NAME) + 1,
+	    KM_SLEEP);
+	(void) strcpy(smodp->smod_name, SOTPI_SMOD_NAME);
+	/*
+	 * Initialize the refcnt to 1 so it will never be freed.
+	 */
+	smodp->smod_refcnt = 1;
+	smodp->smod_uc_version = SOCK_UC_VERSION;
+	smodp->smod_dc_version = SOCK_DC_VERSION;
+	smodp->smod_sock_create_func = &sotpi_create;
+	smodp->smod_sock_destroy_func = &sotpi_destroy;
+	return (smodp);
+}
diff --git a/usr/src/uts/common/fs/sockfs/socktpi.h b/usr/src/uts/common/fs/sockfs/socktpi.h
new file mode 100644
index 0000000000..4c1a5de268
--- /dev/null
+++ b/usr/src/uts/common/fs/sockfs/socktpi.h
@@ -0,0 +1,282 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SOCKFS_SOCKTPI_H
+#define	_SOCKFS_SOCKTPI_H
+
+#include <inet/kssl/ksslapi.h>
+#include <sys/sodirect.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * Internal representation used for addresses.
+ */
+struct soaddr {
+	struct sockaddr	*soa_sa;	/* Actual address */
+	t_uscalar_t	soa_len;	/* Length in bytes of stored address */
+	t_uscalar_t	soa_maxlen;	/* Allocated length (for kmem_free) */
+};
+/* Maximum size address for transports that have ADDR_size == 1 */
+#define	SOA_DEFSIZE	128
+
+struct sonode;
+
+/*
+ * TPI Sockets
+ * ======================
+ *
+ * A TPI socket can be created by the TPI socket module, or as a
+ * result of fallback. In either case, the TPI related information is
+ * stored in a sotpi_info_t. Sockets that are TPI based from the
+ * beginning will use a sotpi_sonode_t, but in the fallback case the
+ * sotpi_info_t will be allocated when needed. However, the so_priv
+ * field in the sonode will always point to the sotpi_info_t, and the
+ * structure should only be accessed via so_priv. Use SOTOTPI().
+ *
+ * A TPI socket always corresponds to a VCHR stream representing the
+ * transport provider (e.g. /dev/tcp). This information is retrieved
+ * from the kernel socket configuration table and accessible via
+ * so_sockparams->sp_sdev_info. sockfs uses this to perform
+ * VOP_ACCESS checks before allowing an open of the transport
+ * provider.
+ *
+ * AF_UNIX Sockets
+ * -------------------------
+ *
+ * When an AF_UNIX socket is bound to a pathname the sockfs creates a
+ * VSOCK vnode in the underlying file system. However, the vnodeops
+ * etc in this VNODE remain those of the underlying file system.
+ * Sockfs uses the v_stream pointer in the underlying file system
+ * VSOCK node to find the sonode bound to the pathname. The bound
+ * pathname vnode is accessed through sti_ux_bound_vp.
+ *
+ * Out of Band Data Handling
+ * -------------------------
+ *
+ * The counts (sti_oobcnt and sti_oobsigcnt) track the number of
+ * urgent indications that are (logically) queued on the stream head
+ * read queue. The urgent data is queued on the stream head
+ * as follows.
+ * + * In the normal case the SIGURG is not generated until + * the T_EXDATA_IND arrives at the stream head. However, transports + * that have an early indication that urgent data is pending + * (e.g. TCP receiving a "new" urgent pointer value) can send up + * an M_PCPROTO/SIGURG message to generate the signal early. + * + * The mark is indicated by either: + * - a T_EXDATA_IND (with no M_DATA b_cont) with MSGMARK set. + * When this message is consumed by sorecvmsg the socket layer + * sets SS_RCVATMARK until data has been consumed past the mark. + * - a message with MSGMARKNEXT set (indicating that the + * first byte of the next message constitutes the mark). When + * the last byte of the MSGMARKNEXT message is consumed in + * the stream head the stream head sets STRATMARK. This flag + * is cleared when at least one byte is read. (Note that + * the MSGMARKNEXT messages can be of zero length when there + * is no previous data to which the marknext can be attached.) + * + * While the T_EXDATA_IND method is the common case which is used + * with all TPI transports, the MSGMARKNEXT method is needed to + * indicate the mark when e.g. the TCP urgent byte has not been + * received yet but the TCP urgent pointer has made TCP generate + * the M_PCSIG/SIGURG. + * + * The signal (the M_PCSIG carrying the SIGURG) and the mark + * indication can not be delivered as a single message, since + * the signal should be delivered as high priority and any mark + * indication must flow with the data. This implies that immediately + * when the SIGURG has been delivered if the stream head queue is + * empty it is impossible to determine if this will be the position + * of the mark. This race condition is resolved by using MSGNOTMARKNEXT + * messages and the STRNOTATMARK flag in the stream head. The + * SIOCATMARK code calls the stream head to wait for either a + * non-empty queue or one of the STR*ATMARK flags being set. 
+ * This implies that any transport that is sending M_PCSIG(SIGURG)
+ * should send the appropriate MSGNOTMARKNEXT message (which can be
+ * zero length) after sending an M_PCSIG to prevent SIOCATMARK
+ * from sleeping unnecessarily.
+ */
+
+/* Stored in sti_magic; sotpi_sototpi() asserts it on every lookup. */
+#define	SOTPI_INFO_MAGIC	0x12345678
+
+/*
+ * Information used by TPI/STREAMS sockets
+ *
+ * Reached from a sonode through so_priv; use SOTOTPI() inside sockfs.
+ */
+typedef struct sotpi_info {
+	/*
+	 * These fields are initialized once.
+	 */
+	uint32_t	sti_magic;	/* always set to SOTPI_INFO_MAGIC */
+	dev_t		sti_dev;	/* device the sonode represents */
+
+	struct sockparams *sti_orig_sp;	/* in case of fallback; the orig sp */
+
+	kmutex_t	sti_plumb_lock;	/* serializes plumbs, and the related */
+					/* sti_pushcnt */
+	short		sti_pushcnt;	/* Number of modules above "sockmod" */
+
+	kcondvar_t	sti_ack_cv;	/* wait for TPI acks */
+
+	/*
+	 * One-bit flags: only 0 or 1 can be stored, so a masked state word
+	 * must be reduced to a boolean before it is assigned here.
+	 */
+	uint8_t
+		sti_laddr_valid : 1,	/* sti_laddr valid for user */
+		sti_faddr_valid : 1,	/* sti_faddr valid for user */
+		sti_faddr_noxlate : 1,	/* No xlation of faddr for AF_UNIX */
+
+		sti_direct : 1,		/* transport is directly below */
+
+		sti_pad_to_bit7 : 4;
+
+	mblk_t	*sti_ack_mp;		/* TPI ack received from below */
+	mblk_t	*sti_unbind_mp;		/* Preallocated T_UNBIND_REQ message */
+
+	time_t	sti_atime;		/* time of last access */
+	time_t	sti_mtime;		/* time of last modification */
+	time_t	sti_ctime;		/* time of last attributes change */
+
+	ushort_t sti_delayed_error;	/* From T_uderror_ind */
+	mblk_t	*sti_eaddr_mp;		/* for sti_delayed_error */
+					/* put here for delayed processing */
+
+	mblk_t	*sti_conn_ind_head;	/* b_next list of T_CONN_IND */
+	mblk_t	*sti_conn_ind_tail;	/* last entry of that list */
+
+	uint_t	sti_oobsigcnt;		/* Number of SIGURG generated */
+	uint_t	sti_oobcnt;		/* Number of T_EXDATA_IND queued */
+
+	/* From T_info_ack */
+	t_uscalar_t	sti_tsdu_size;
+	t_uscalar_t	sti_etsdu_size;
+	t_scalar_t	sti_addr_size;
+	t_uscalar_t	sti_opt_size;
+	t_uscalar_t	sti_tidu_size;
+	t_scalar_t	sti_serv_type;
+
+	/* From T_capability_ack */
+	t_uscalar_t	sti_acceptor_id;
+
+	/* Internal provider information */
+	struct tpi_provinfo	*sti_provinfo;
+
+	/*
+	 * The local and remote addresses have multiple purposes
+	 * but one of the key reasons for their existence and careful
+	 * tracking in sockfs is to support getsockname and getpeername
+	 * when the transport does not handle the TI_GET*NAME ioctls
+	 * and caching when it does (signalled by valid bits in so_state).
+	 * When all transports support the new TPI (with T_ADDR_REQ)
+	 * we can revisit this code.
+	 *
+	 * NOTE(review): the validity bits visible in this structure are
+	 * sti_laddr_valid/sti_faddr_valid; confirm whether "valid bits in
+	 * so_state" above is still accurate.
+	 *
+	 * The other usage of sti_faddr is to keep the "connected to"
+	 * address for datagram sockets.
+	 *
+	 * Finally, for AF_UNIX both local and remote addresses are used
+	 * to record the sockaddr_un since we use a separate namespace
+	 * in the loopback transport.
+	 *
+	 * Both soa_sa pointers refer to one allocation (local half first,
+	 * foreign half second) made by so_alloc_addr(), so the two maxlen
+	 * values are always equal.
+	 */
+	struct soaddr	sti_laddr;	/* Local address */
+	struct soaddr	sti_faddr;	/* Peer address */
+#define	sti_laddr_sa		sti_laddr.soa_sa
+#define	sti_faddr_sa		sti_faddr.soa_sa
+#define	sti_laddr_len		sti_laddr.soa_len
+#define	sti_faddr_len		sti_faddr.soa_len
+#define	sti_laddr_maxlen	sti_laddr.soa_maxlen
+#define	sti_faddr_maxlen	sti_faddr.soa_maxlen
+
+	/*
+	 * For AF_UNIX sockets:
+	 *
+	 * sti_ux_laddr/faddr records the internal addresses used with the
+	 * transport. sti_ux_bound_vp and v_stream->sd_vnode form the
+	 * cross-linkage between the underlying fs vnode corresponding
+	 * to the bound sockaddr_un and the socket node.
+	 */
+	struct so_ux_addr sti_ux_laddr;	/* laddr bound with the transport */
+	struct so_ux_addr sti_ux_faddr;	/* temporary peer address */
+	struct vnode	*sti_ux_bound_vp; /* bound AF_UNIX file system vnode */
+	struct sonode	*sti_next_so;	/* next sonode on socklist */
+	struct sonode	*sti_prev_so;	/* previous sonode on socklist */
+	mblk_t	*sti_discon_ind_mp;	/* T_DISCON_IND received from below */
+
+	/*
+	 * For NL7C sockets:
+	 *
+	 * sti_nl7c_flags	the NL7C state of URL processing.
+	 *
+	 * sti_nl7c_rcv_mp	mblk_t chain of already received data to be
+	 *			passed up to the app after NL7C gives up on
+	 *			a socket.
+	 *
+	 * sti_nl7c_rcv_rval	returned rval for last mblk_t from above.
+	 *
+	 * sti_nl7c_uri		the URI currently being processed.
+	 *
+	 * sti_nl7c_rtime	URI request gethrestime_sec().
+	 *
+	 * sti_nl7c_addr	pointer returned by nl7c_addr_lookup().
+	 */
+	uint64_t	sti_nl7c_flags;
+	mblk_t		*sti_nl7c_rcv_mp;
+	int64_t		sti_nl7c_rcv_rval;
+	void		*sti_nl7c_uri;
+	time_t		sti_nl7c_rtime;
+	void		*sti_nl7c_addr;
+
+	/* For sockets acting as an in-kernel SSL proxy */
+	kssl_endpt_type_t sti_kssl_type;	/* is proxy/is proxied/none */
+	kssl_ent_t	sti_kssl_ent;		/* SSL config entry */
+	kssl_ctx_t	sti_kssl_ctx;		/* SSL session context */
+} sotpi_info_t;
+
+struct T_capability_ack;
+
+extern sonodeops_t sotpi_sonodeops;
+
+extern int	socktpi_init(void);
+extern queue_t	*sotpi_convert_sonode(struct sonode *, struct sockparams *,
+		    boolean_t *, struct cred *);
+extern void	sotpi_update_state(struct sonode *, struct T_capability_ack *,
+		    struct sockaddr *, socklen_t, struct sockaddr *, socklen_t,
+		    short);
+
+/*
+ * SOTOTPI() maps a sonode to its sotpi_info_t. DEBUG kernels go through
+ * sotpi_sototpi(), which also asserts the pointer and its magic number;
+ * otherwise it is a plain cast of so_priv.
+ */
+extern sotpi_info_t	*sotpi_sototpi(struct sonode *);
+#ifdef DEBUG
+#define	SOTOTPI(so)	(sotpi_sototpi(so))
+#else
+#define	SOTOTPI(so)	((sotpi_info_t *)(so)->so_priv)
+#endif
+
+/* for consumers outside sockfs */
+#define	_SOTOTPI(so)	((sotpi_info_t *)(so)->so_priv)
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SOCKFS_SOCKTPI_H */
diff --git a/usr/src/uts/common/fs/sockfs/socktpi_impl.h b/usr/src/uts/common/fs/sockfs/socktpi_impl.h
new file mode 100644
index 0000000000..aa0b04bf1c
--- /dev/null
+++ b/usr/src/uts/common/fs/sockfs/socktpi_impl.h
@@ -0,0 +1,99 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SOCKFS_SOCKTPI_IMPL_H
+#define	_SOCKFS_SOCKTPI_IMPL_H
+
+#include <sys/socketvar.h>
+#include <fs/sockfs/socktpi.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * A sonode with its TPI information embedded directly after it.
+ *
+ * so_priv will always be set to &st_info
+ */
+typedef struct sotpi_sonode {
+	struct sonode	st_sonode;	/* the generic socket node */
+	struct sotpi_info st_info;	/* TPI state; target of so_priv */
+} sotpi_sonode_t;
+
+extern void	so_proc_tcapability_ack(struct sonode *,
+		    struct T_capability_ack *);
+extern void	so_basic_strinit(struct sonode *);
+extern void	so_alloc_addr(struct sonode *, t_uscalar_t);
+extern int	so_set_events(struct sonode *, vnode_t *, cred_t *);
+extern int	so_sock2stream(struct sonode *);
+extern void	so_stream2sock(struct sonode *);
+
+extern int	so_strinit(struct sonode *, struct sonode *);
+extern void	so_update_attrs(struct sonode *, int);
+extern int	sogetrderr(vnode_t *, int, int *);
+extern int	sogetwrerr(vnode_t *, int, int *);
+extern int	so_addr_verify(struct sonode *, const struct sockaddr *,
+		    socklen_t);
+extern int	so_ux_addr_xlate(struct sonode *, struct sockaddr *,
+		    socklen_t, int, void **, socklen_t *);
+extern void	so_unix_close(struct sonode *);
+
+extern int	sowaitprim(struct sonode *, t_scalar_t, t_scalar_t,
+		    t_uscalar_t, mblk_t **, clock_t);
+extern int	sowaitokack(struct sonode *, t_scalar_t);
+extern int	sowaitack(struct sonode *, mblk_t **, clock_t);
+extern void	soqueueack(struct sonode *, mblk_t *);
+extern
int sowaitconnind(struct sonode *, int, mblk_t **); +extern void soqueueconnind(struct sonode *, mblk_t *); +extern int soflushconnind(struct sonode *, t_scalar_t); +extern void so_drain_discon_ind(struct sonode *); +extern void so_flush_discon_ind(struct sonode *); + +extern mblk_t *soallocproto(size_t, int); +extern mblk_t *soallocproto1(const void *, ssize_t, ssize_t, int); +extern void soappendmsg(mblk_t *, const void *, ssize_t); +extern mblk_t *soallocproto2(const void *, ssize_t, const void *, ssize_t, + ssize_t, int); +extern mblk_t *soallocproto3(const void *, ssize_t, const void *, ssize_t, + const void *, ssize_t, ssize_t, int); + +extern int so_set_asyncsigs(vnode_t *, pid_t, int, int, cred_t *); +extern int so_flip_async(struct sonode *, vnode_t *, int, cred_t *); +extern int so_set_siggrp(struct sonode *, vnode_t *, pid_t, int, cred_t *); + +extern void so_installhooks(struct sonode *); + +extern int kstrwritemp(struct vnode *, mblk_t *, ushort_t); +extern int sostream_direct(struct sonode *, struct uio *, + mblk_t *, cred_t *); +extern int sosend_dgram(struct sonode *, struct sockaddr *, + socklen_t, struct uio *, int); +extern int sosend_svc(struct sonode *, struct uio *, t_scalar_t, int, int); + +#ifdef __cplusplus +} +#endif + +#endif /* _SOCKFS_SOCKTPI_IMPL_H */ diff --git a/usr/src/uts/common/fs/sockfs/sockvnops.c b/usr/src/uts/common/fs/sockfs/sockvnops.c deleted file mode 100644 index e9195c5e11..0000000000 --- a/usr/src/uts/common/fs/sockfs/sockvnops.c +++ /dev/null @@ -1,1438 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include <sys/types.h> -#include <sys/thread.h> -#include <sys/t_lock.h> -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/bitmap.h> -#include <sys/buf.h> -#include <sys/cmn_err.h> -#include <sys/conf.h> -#include <sys/debug.h> -#include <sys/errno.h> -#include <sys/time.h> -#include <sys/fcntl.h> -#include <sys/flock.h> -#include <sys/file.h> -#include <sys/kmem.h> -#include <sys/mman.h> -#include <sys/open.h> -#include <sys/swap.h> -#include <sys/sysmacros.h> -#include <sys/uio.h> -#include <sys/vfs.h> -#include <sys/vfs_opreg.h> -#include <sys/vnode.h> -#include <sys/poll.h> -#include <sys/stropts.h> -#include <sys/stream.h> -#include <sys/strsubr.h> -#include <sys/strsun.h> -#include <sys/suntpi.h> -#include <sys/ioctl.h> -#include <sys/sockio.h> -#include <sys/filio.h> -#include <sys/stat.h> -#include <sys/proc.h> -#include <sys/user.h> -#include <sys/session.h> -#include <sys/vmsystm.h> -#include <sys/vtrace.h> -#include <sys/policy.h> - -#include <sys/socket.h> -#include <sys/socketvar.h> -#include <netinet/in.h> -#include <sys/un.h> - -#define _SUN_TPI_VERSION 2 -#include <sys/tihdr.h> - -#include <vm/seg.h> -#include <vm/seg_map.h> -#include <vm/page.h> -#include <vm/pvn.h> -#include <vm/seg_dev.h> -#include <vm/seg_vn.h> - -#include <fs/fs_subr.h> - -#include <sys/esunddi.h> -#include <sys/autoconf.h> - -#include <fs/sockfs/nl7c.h> -#include <fs/sockfs/nl7curi.h> - -#include <inet/udp_impl.h> -#include 
<inet/tcp_impl.h> - -#include <inet/kssl/ksslapi.h> - -static int socktpi_close(struct vnode *, int, int, offset_t, struct cred *, - caller_context_t *); -static int socktpi_read(struct vnode *, struct uio *, int, struct cred *, - caller_context_t *); -static int socktpi_write(struct vnode *, struct uio *, int, struct cred *, - caller_context_t *); -static int socktpi_plumbioctl(struct vnode *, int, intptr_t, int, struct cred *, - int32_t *); -static void socktpi_inactive(struct vnode *, struct cred *, caller_context_t *); -static int socktpi_poll(struct vnode *, short, int, short *, - struct pollhead **, caller_context_t *); - -struct vnodeops *socktpi_vnodeops; - -const fs_operation_def_t socktpi_vnodeops_template[] = { - VOPNAME_OPEN, { .vop_open = socktpi_open }, - VOPNAME_CLOSE, { .vop_close = socktpi_close }, - VOPNAME_READ, { .vop_read = socktpi_read }, - VOPNAME_WRITE, { .vop_write = socktpi_write }, - VOPNAME_IOCTL, { .vop_ioctl = socktpi_ioctl }, - VOPNAME_SETFL, { .vop_setfl = socktpi_setfl }, - VOPNAME_GETATTR, { .vop_getattr = socktpi_getattr }, - VOPNAME_SETATTR, { .vop_setattr = socktpi_setattr }, - VOPNAME_ACCESS, { .vop_access = socktpi_access }, - VOPNAME_FSYNC, { .vop_fsync = socktpi_fsync }, - VOPNAME_INACTIVE, { .vop_inactive = socktpi_inactive }, - VOPNAME_FID, { .vop_fid = socktpi_fid }, - VOPNAME_SEEK, { .vop_seek = socktpi_seek }, - VOPNAME_POLL, { .vop_poll = socktpi_poll }, - VOPNAME_DISPOSE, { .error = fs_error }, - NULL, NULL -}; - -/* - * Do direct function call to the transport layer below; this would - * also allow the transport to utilize read-side synchronous stream - * interface if necessary. This is a /etc/system tunable that must - * not be modified on a running system. By default this is enabled - * for performance reasons and may be disabled for debugging purposes. - */ -boolean_t socktpi_direct = B_TRUE; - -/* - * Open routine used by socket() call. 
Note that vn_open checks for - * VSOCK and fails the open (and VOP_OPEN is fs_nosys). The VSOCK check is - * needed since VSOCK type vnodes exist in various underlying filesystems as - * a result of an AF_UNIX bind to a pathname. - * - * Sockets assume that the driver will clone (either itself - * or by using the clone driver) i.e. a socket() call will always - * result in a new vnode being created. This routine single-threads - * open/closes for a given vnode which is probably not needed. - */ -int -socktpi_open(struct vnode **vpp, int flag, struct cred *cr, - caller_context_t *ct) -{ - major_t maj; - dev_t newdev; - struct vnode *vp = *vpp; - struct sonode *so; - int error = 0; - struct stdata *stp; - - dprint(1, ("socktpi_open()\n")); - flag &= ~FCREAT; /* paranoia */ - - so = VTOSO(vp); - - mutex_enter(&so->so_lock); - so->so_count++; /* one more open reference */ - ASSERT(so->so_count != 0); /* wraparound */ - if (so->so_count == 1) - so->so_zoneid = getzoneid(); - mutex_exit(&so->so_lock); - - ASSERT(vp->v_type == VSOCK); - - newdev = vp->v_rdev; - maj = getmajor(newdev); - ASSERT(STREAMSTAB(maj)); - - mutex_enter(&so->so_lock); - so_lock_single(so); /* Set SOLOCKED */ - mutex_exit(&so->so_lock); - - error = stropen(vp, &newdev, flag, cr); - - stp = vp->v_stream; - if (error == 0) { - if (so->so_flag & SOCLONE) - ASSERT(newdev != vp->v_rdev); - mutex_enter(&so->so_lock); - so->so_dev = newdev; - vp->v_rdev = newdev; - so_unlock_single(so, SOLOCKED); - mutex_exit(&so->so_lock); - - if (stp->sd_flag & STRISTTY) { - /* - * this is a post SVR4 tty driver - a socket can not - * be a controlling terminal. Fail the open. 
- */ - (void) socktpi_close(vp, flag, 1, (offset_t)0, cr, ct); - return (ENOTTY); /* XXX */ - } - - ASSERT(stp->sd_wrq != NULL); - so->so_provinfo = tpi_findprov(stp->sd_wrq); - - /* - * If caller is interested in doing direct function call - * interface to/from transport module, probe the module - * directly beneath the streamhead to see if it qualifies. - * - * We turn off the direct interface when qualifications fail. - * In the acceptor case, we simply turn off the SS_DIRECT - * flag on the socket. We do the fallback after the accept - * has completed, before the new socket is returned to the - * application. - */ - if (so->so_state & SS_DIRECT) { - queue_t *tq = stp->sd_wrq->q_next; - - /* - * SS_DIRECT is currently supported and tested - * only for tcp/udp; this is the main reason to - * have the following assertions. - */ - ASSERT(so->so_family == AF_INET || - so->so_family == AF_INET6); - ASSERT(so->so_protocol == IPPROTO_UDP || - so->so_protocol == IPPROTO_TCP || - so->so_protocol == IPPROTO_IP); - ASSERT(so->so_type == SOCK_DGRAM || - so->so_type == SOCK_STREAM); - - /* - * Abort direct call interface if the module directly - * underneath the stream head is not defined with the - * _D_DIRECT flag. This could happen in the tcp or - * udp case, when some other module is autopushed - * above it, or for some reasons the expected module - * isn't purely D_MP (which is the main requirement). - * - * Else, SS_DIRECT is valid. If the read-side Q has - * _QSODIRECT set then and uioasync is enabled then - * set SS_SODIRECT to enable sodirect. 
- */ - if (!socktpi_direct || !(tq->q_flag & _QDIRECT) || - !(_OTHERQ(tq)->q_flag & _QDIRECT)) { - int rval; - - /* Continue on without direct calls */ - so->so_state &= ~SS_DIRECT; - if (!(flag & SO_ACCEPTOR)) { - if ((error = strioctl(vp, - _SIOCSOCKFALLBACK, 0, 0, K_TO_K, - CRED(), &rval)) != 0) { - (void) socktpi_close(vp, flag, - 1, (offset_t)0, cr, ct); - return (error); - } - } - } else if ((_OTHERQ(tq)->q_flag & _QSODIRECT) && - uioasync.enabled) { - /* Enable sodirect */ - so->so_state |= SS_SODIRECT; - } - } - } else { - /* - * While the same socket can not be reopened (unlike specfs) - * the stream head sets STREOPENFAIL when the autopush fails. - */ - if ((stp != NULL) && - (stp->sd_flag & STREOPENFAIL)) { - /* - * Open failed part way through. - */ - mutex_enter(&stp->sd_lock); - stp->sd_flag &= ~STREOPENFAIL; - mutex_exit(&stp->sd_lock); - - mutex_enter(&so->so_lock); - so_unlock_single(so, SOLOCKED); - mutex_exit(&so->so_lock); - (void) socktpi_close(vp, flag, 1, - (offset_t)0, cr, ct); - return (error); - /*NOTREACHED*/ - } - ASSERT(stp == NULL); - mutex_enter(&so->so_lock); - so_unlock_single(so, SOLOCKED); - ASSERT(so->so_count > 0); - so->so_count--; /* one less open reference */ - mutex_exit(&so->so_lock); - } - TRACE_4(TR_FAC_SOCKFS, TR_SOCKFS_OPEN, - "sockfs open:maj %d vp %p so %p error %d", maj, - vp, so, error); - return (error); -} - -/*ARGSUSED2*/ -static int -socktpi_close( - struct vnode *vp, - int flag, - int count, - offset_t offset, - struct cred *cr, - caller_context_t *ct) -{ - struct sonode *so; - dev_t dev; - int error = 0; - - so = VTOSO(vp); - - dprintso(so, 1, ("socktpi_close(%p, %x, %d) %s\n", - (void *)vp, flag, count, pr_state(so->so_state, so->so_mode))); - - cleanlocks(vp, ttoproc(curthread)->p_pid, 0); - cleanshares(vp, ttoproc(curthread)->p_pid); - if (vp->v_stream) - strclean(vp); - if (count > 1) - return (0); - - dev = so->so_dev; - - ASSERT(vp->v_type == VSOCK); - ASSERT(STREAMSTAB(getmajor(dev))); - - 
mutex_enter(&so->so_lock); - so_lock_single(so); /* Set SOLOCKED */ - ASSERT(so->so_count > 0); - so->so_count--; /* one fewer open reference */ - - /* - * Only call NL7C's close on last open reference. - */ - if (so->so_count == 0 && (so->so_nl7c_flags & NL7C_ENABLED)) { - so->so_nl7c_flags = 0; - nl7c_close(so); - } - - /* - * Only call the close routine when the last open reference through - * any [s, v]node goes away. - */ - if (so->so_count == 0 && vp->v_stream != NULL) { - vnode_t *ux_vp; - - if (so->so_family == AF_UNIX) { - /* Could avoid this when CANTSENDMORE for !dgram */ - so_unix_close(so); - } - - mutex_exit(&so->so_lock); - /* - * Disassemble the linkage from the AF_UNIX underlying file - * system vnode to this socket (by atomically clearing - * v_stream in vn_rele_stream) before strclose clears sd_vnode - * and frees the stream head. - */ - if ((ux_vp = so->so_ux_bound_vp) != NULL) { - ASSERT(ux_vp->v_stream); - so->so_ux_bound_vp = NULL; - vn_rele_stream(ux_vp); - } - if (so->so_family == AF_INET || so->so_family == AF_INET6) { - strsetrwputdatahooks(SOTOV(so), NULL, NULL); - if (so->so_kssl_ent != NULL) { - kssl_release_ent(so->so_kssl_ent, so, - so->so_kssl_type); - so->so_kssl_ent = NULL; - } - if (so->so_kssl_ctx != NULL) { - kssl_release_ctx(so->so_kssl_ctx); - so->so_kssl_ctx = NULL; - } - so->so_kssl_type = KSSL_NO_PROXY; - } - error = strclose(vp, flag, cr); - vp->v_stream = NULL; - mutex_enter(&so->so_lock); - } - - /* - * Flush the T_DISCON_IND on so_discon_ind_mp. - */ - if (so->so_count == 0) - so_flush_discon_ind(so); - - so_unlock_single(so, SOLOCKED); - mutex_exit(&so->so_lock); - - /* - * Decrement the device driver's reference count for streams - * opened via the clone dip. The driver was held in clone_open(). - * The absence of clone_close() forces this asymmetry. 
- */ - if (so->so_flag & SOCLONE) - ddi_rele_driver(getmajor(dev)); - - return (error); -} - -/*ARGSUSED2*/ -static int -socktpi_read( - struct vnode *vp, - struct uio *uiop, - int ioflag, - struct cred *cr, - caller_context_t *ct) -{ - struct sonode *so = VTOSO(vp); - struct nmsghdr lmsg; - - dprintso(so, 1, ("socktpi_read(%p) %s\n", - (void *)so, pr_state(so->so_state, so->so_mode))); - - ASSERT(vp->v_type == VSOCK); - so_update_attrs(so, SOACC); - - uiop->uio_extflg |= UIO_COPY_CACHED; - - if (so->so_version == SOV_STREAM) { - /* The imaginary "sockmod" has been popped - act as a stream */ - return (strread(vp, uiop, cr)); - } - lmsg.msg_namelen = 0; - lmsg.msg_controllen = 0; - lmsg.msg_flags = 0; - return (sotpi_recvmsg(so, &lmsg, uiop)); -} - -/* ARGSUSED2 */ -static int -socktpi_write( - struct vnode *vp, - struct uio *uiop, - int ioflag, - struct cred *cr, - caller_context_t *ct) -{ - struct sonode *so = VTOSO(vp); - int so_state; - int so_mode; - int error; - - dprintso(so, 1, ("socktpi_write(%p) %s\n", - (void *)so, pr_state(so->so_state, so->so_mode))); - - ASSERT(vp->v_type == VSOCK); - - if (so->so_family == AF_UNIX) - uiop->uio_extflg |= UIO_COPY_CACHED; - else - uiop->uio_extflg &= ~UIO_COPY_CACHED; - if (so->so_version == SOV_STREAM) { - /* The imaginary "sockmod" has been popped - act as a stream */ - so_update_attrs(so, SOMOD); - return (strwrite(vp, uiop, cr)); - } - /* State checks */ - so_state = so->so_state; - so_mode = so->so_mode; - if (so_state & SS_CANTSENDMORE) { - tsignal(curthread, SIGPIPE); - return (EPIPE); - } - - if (so->so_error != 0) { - mutex_enter(&so->so_lock); - error = sogeterr(so); - if (error != 0) { - mutex_exit(&so->so_lock); - return (error); - } - mutex_exit(&so->so_lock); - } - - if ((so_state & (SS_ISCONNECTED|SS_ISBOUND)) != - (SS_ISCONNECTED|SS_ISBOUND)) { - if (so_mode & SM_CONNREQUIRED) - return (ENOTCONN); - else - return (EDESTADDRREQ); - } - - if (!(so_mode & SM_CONNREQUIRED)) { - /* - * Note that this code 
does not prevent so_faddr_sa - * from changing while it is being used. Thus - * if an "unconnect"+connect occurs concurrently with - * this write the datagram might be delivered to a - * garbled address. - */ - so_update_attrs(so, SOMOD); - return (sosend_dgram(so, so->so_faddr_sa, - (t_uscalar_t)so->so_faddr_len, uiop, 0)); - } - so_update_attrs(so, SOMOD); - - if (so_mode & SM_BYTESTREAM) { - /* Send M_DATA messages */ - if ((so->so_nl7c_flags & NL7C_ENABLED) && - (error = nl7c_data(so, uiop)) >= 0) { - /* NL7C consumed the data */ - return (error); - } - if ((so_state & SS_DIRECT) && - canputnext(vp->v_stream->sd_wrq)) { - return (sostream_direct(so, uiop, NULL, cr)); - } - return (strwrite(vp, uiop, cr)); - } else { - /* Send T_DATA_REQ messages without MORE_flag set */ - return (sosend_svc(so, uiop, T_DATA_REQ, 0, 0)); - } -} - -int -so_copyin(const void *from, void *to, size_t size, int fromkernel) -{ - if (fromkernel) { - bcopy(from, to, size); - return (0); - } - return (xcopyin(from, to, size)); -} - -int -so_copyout(const void *from, void *to, size_t size, int tokernel) -{ - if (tokernel) { - bcopy(from, to, size); - return (0); - } - return (xcopyout(from, to, size)); -} - -/*ARGSUSED6*/ -int -socktpi_ioctl(struct vnode *vp, int cmd, intptr_t arg, int mode, - struct cred *cr, int32_t *rvalp, caller_context_t *ct) -{ - struct sonode *so = VTOSO(vp); - int error = 0; - - ASSERT(vp->v_type == VSOCK); - dprintso(so, 0, ("socktpi_ioctl: cmd 0x%x, arg 0x%lx, state %s\n", - cmd, arg, pr_state(so->so_state, so->so_mode))); - - switch (cmd) { - case _I_INSERT: - case _I_REMOVE: - /* - * Since there's no compelling reason to support these ioctls - * on sockets, and doing so would increase the complexity - * markedly, prevent it. 
- */ - return (EOPNOTSUPP); - - case I_FIND: - case I_LIST: - case I_LOOK: - case I_POP: - case I_PUSH: - /* - * To prevent races and inconsistencies between the actual - * state of the stream and the state according to the sonode, - * we serialize all operations which modify or operate on the - * list of modules on the socket's stream. - */ - mutex_enter(&so->so_plumb_lock); - error = socktpi_plumbioctl(vp, cmd, arg, mode, cr, rvalp); - mutex_exit(&so->so_plumb_lock); - return (error); - - default: - if (so->so_version != SOV_STREAM) - break; - - /* - * The imaginary "sockmod" has been popped; act as a stream. - */ - return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp)); - } - - ASSERT(so->so_version != SOV_STREAM); - - /* - * Process socket-specific ioctls. - */ - switch (cmd) { - case FIONBIO: { - int32_t value; - - if (so_copyin((void *)arg, &value, sizeof (int32_t), - (mode & (int)FKIOCTL))) - return (EFAULT); - - mutex_enter(&so->so_lock); - if (value) { - so->so_state |= SS_NDELAY; - } else { - so->so_state &= ~SS_NDELAY; - } - mutex_exit(&so->so_lock); - return (0); - } - - case FIOASYNC: { - int32_t value; - - if (so_copyin((void *)arg, &value, sizeof (int32_t), - (mode & (int)FKIOCTL))) - return (EFAULT); - - mutex_enter(&so->so_lock); - /* - * SS_ASYNC flag not already set correctly? - * (!value != !(so->so_state & SS_ASYNC)) - * but some engineers find that too hard to read. - */ - if (value == 0 && (so->so_state & SS_ASYNC) != 0 || - value != 0 && (so->so_state & SS_ASYNC) == 0) - error = so_flip_async(so, vp, mode, cr); - mutex_exit(&so->so_lock); - return (error); - } - - case SIOCSPGRP: - case FIOSETOWN: { - pid_t pgrp; - - if (so_copyin((void *)arg, &pgrp, sizeof (pid_t), - (mode & (int)FKIOCTL))) - return (EFAULT); - - mutex_enter(&so->so_lock); - dprintso(so, 1, ("setown: new %d old %d\n", pgrp, so->so_pgrp)); - /* Any change? 
*/ - if (pgrp != so->so_pgrp) - error = so_set_siggrp(so, vp, pgrp, mode, cr); - mutex_exit(&so->so_lock); - return (error); - } - case SIOCGPGRP: - case FIOGETOWN: - if (so_copyout(&so->so_pgrp, (void *)arg, - sizeof (pid_t), (mode & (int)FKIOCTL))) - return (EFAULT); - return (0); - - case SIOCATMARK: { - int retval; - uint_t so_state; - - /* - * strwaitmark has a finite timeout after which it - * returns -1 if the mark state is undetermined. - * In order to avoid any race between the mark state - * in sockfs and the mark state in the stream head this - * routine loops until the mark state can be determined - * (or the urgent data indication has been removed by some - * other thread). - */ - do { - mutex_enter(&so->so_lock); - so_state = so->so_state; - mutex_exit(&so->so_lock); - if (so_state & SS_RCVATMARK) { - retval = 1; - } else if (!(so_state & SS_OOBPEND)) { - /* - * No SIGURG has been generated -- there is no - * pending or present urgent data. Thus can't - * possibly be at the mark. - */ - retval = 0; - } else { - /* - * Have the stream head wait until there is - * either some messages on the read queue, or - * STRATMARK or STRNOTATMARK gets set. The - * STRNOTATMARK flag is used so that the - * transport can send up a MSGNOTMARKNEXT - * M_DATA to indicate that it is not - * at the mark and additional data is not about - * to be send upstream. - * - * If the mark state is undetermined this will - * return -1 and we will loop rechecking the - * socket state. - */ - retval = strwaitmark(vp); - } - } while (retval == -1); - - if (so_copyout(&retval, (void *)arg, sizeof (int), - (mode & (int)FKIOCTL))) - return (EFAULT); - return (0); - } - - case I_FDINSERT: - case I_SENDFD: - case I_RECVFD: - case I_ATMARK: - case _SIOCSOCKFALLBACK: - /* - * These ioctls do not apply to sockets. I_FDINSERT can be - * used to send M_PROTO messages without modifying the socket - * state. 
I_SENDFD/RECVFD should not be used for socket file - * descriptor passing since they assume a twisted stream. - * SIOCATMARK must be used instead of I_ATMARK. - * - * _SIOCSOCKFALLBACK from an application should never be - * processed. It is only generated by socktpi_open() or - * in response to I_POP or I_PUSH. - */ -#ifdef DEBUG - zcmn_err(getzoneid(), CE_WARN, - "Unsupported STREAMS ioctl 0x%x on socket. " - "Pid = %d\n", cmd, curproc->p_pid); -#endif /* DEBUG */ - return (EOPNOTSUPP); - - case _I_GETPEERCRED: - if ((mode & FKIOCTL) == 0) - return (EINVAL); - - mutex_enter(&so->so_lock); - if ((so->so_mode & SM_CONNREQUIRED) == 0) { - error = ENOTSUP; - } else if ((so->so_state & SS_ISCONNECTED) == 0) { - error = ENOTCONN; - } else if (so->so_peercred != NULL) { - k_peercred_t *kp = (k_peercred_t *)arg; - kp->pc_cr = so->so_peercred; - kp->pc_cpid = so->so_cpid; - crhold(so->so_peercred); - } else { - error = EINVAL; - } - mutex_exit(&so->so_lock); - return (error); - - default: - /* - * Do the higher-order bits of the ioctl cmd indicate - * that it is an I_* streams ioctl? - */ - if ((cmd & 0xffffff00U) == STR && - so->so_version == SOV_SOCKBSD) { -#ifdef DEBUG - zcmn_err(getzoneid(), CE_WARN, - "Unsupported STREAMS ioctl 0x%x on socket. " - "Pid = %d\n", cmd, curproc->p_pid); -#endif /* DEBUG */ - return (EOPNOTSUPP); - } - return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp)); - } -} - -/* - * Handle plumbing-related ioctls. - */ -static int -socktpi_plumbioctl(struct vnode *vp, int cmd, intptr_t arg, int mode, - struct cred *cr, int32_t *rvalp) -{ - static const char sockmod_name[] = "sockmod"; - struct sonode *so = VTOSO(vp); - char mname[FMNAMESZ + 1]; - int error; - - ASSERT(MUTEX_HELD(&so->so_plumb_lock)); - - if (so->so_version == SOV_SOCKBSD) - return (EOPNOTSUPP); - - if (so->so_version == SOV_STREAM) { - /* - * The imaginary "sockmod" has been popped - act as a stream. - * If this is a push of sockmod then change back to a socket. 
- */ - if (cmd == I_PUSH) { - error = ((mode & FKIOCTL) ? copystr : copyinstr)( - (void *)arg, mname, sizeof (mname), NULL); - - if (error == 0 && strcmp(mname, sockmod_name) == 0) { - dprintso(so, 0, ("socktpi_ioctl: going to " - "socket version\n")); - so_stream2sock(so); - return (0); - } - } - return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp)); - } - - switch (cmd) { - case I_PUSH: - if (so->so_state & SS_DIRECT) { - mutex_enter(&so->so_lock); - so_lock_single(so); - mutex_exit(&so->so_lock); - - error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K, - CRED(), rvalp); - - mutex_enter(&so->so_lock); - if (error == 0) - so->so_state &= ~SS_DIRECT; - so_unlock_single(so, SOLOCKED); - mutex_exit(&so->so_lock); - - if (error != 0) - return (error); - } - - error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp); - if (error == 0) - so->so_pushcnt++; - return (error); - - case I_POP: - if (so->so_pushcnt == 0) { - /* Emulate sockmod being popped */ - dprintso(so, 0, - ("socktpi_ioctl: going to STREAMS version\n")); - return (so_sock2stream(so)); - } - - error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp); - if (error == 0) - so->so_pushcnt--; - return (error); - - case I_LIST: { - struct str_mlist *kmlistp, *umlistp; - struct str_list kstrlist; - ssize_t kstrlistsize; - int i, nmods; - - STRUCT_DECL(str_list, ustrlist); - STRUCT_INIT(ustrlist, mode); - - if (arg == NULL) { - error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp); - if (error == 0) - (*rvalp)++; /* Add one for sockmod */ - return (error); - } - - error = so_copyin((void *)arg, STRUCT_BUF(ustrlist), - STRUCT_SIZE(ustrlist), mode & FKIOCTL); - if (error != 0) - return (error); - - nmods = STRUCT_FGET(ustrlist, sl_nmods); - if (nmods <= 0) - return (EINVAL); - /* - * Ceiling nmods at nstrpush to prevent someone from - * maliciously consuming lots of kernel memory. 
- */ - nmods = MIN(nmods, nstrpush); - - kstrlistsize = (nmods + 1) * sizeof (struct str_mlist); - kstrlist.sl_nmods = nmods; - kstrlist.sl_modlist = kmem_zalloc(kstrlistsize, KM_SLEEP); - - error = strioctl(vp, cmd, (intptr_t)&kstrlist, mode, K_TO_K, - cr, rvalp); - if (error != 0) - goto done; - - /* - * Considering the module list as a 0-based array of sl_nmods - * modules, sockmod should conceptually exist at slot - * so_pushcnt. Insert sockmod at this location by sliding all - * of the module names after so_pushcnt over by one. We know - * that there will be room to do this since we allocated - * sl_modlist with an additional slot. - */ - for (i = kstrlist.sl_nmods; i > so->so_pushcnt; i--) - kstrlist.sl_modlist[i] = kstrlist.sl_modlist[i - 1]; - - (void) strcpy(kstrlist.sl_modlist[i].l_name, sockmod_name); - kstrlist.sl_nmods++; - - /* - * Copy all of the entries out to ustrlist. - */ - kmlistp = kstrlist.sl_modlist; - umlistp = STRUCT_FGETP(ustrlist, sl_modlist); - for (i = 0; i < nmods && i < kstrlist.sl_nmods; i++) { - error = so_copyout(kmlistp++, umlistp++, - sizeof (struct str_mlist), mode & FKIOCTL); - if (error != 0) - goto done; - } - - error = so_copyout(&i, (void *)arg, sizeof (int32_t), - mode & FKIOCTL); - if (error == 0) - *rvalp = 0; - done: - kmem_free(kstrlist.sl_modlist, kstrlistsize); - return (error); - } - case I_LOOK: - if (so->so_pushcnt == 0) { - return (so_copyout(sockmod_name, (void *)arg, - sizeof (sockmod_name), mode & FKIOCTL)); - } - return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp)); - - case I_FIND: - error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp); - if (error && error != EINVAL) - return (error); - - /* if not found and string was sockmod return 1 */ - if (*rvalp == 0 || error == EINVAL) { - error = ((mode & FKIOCTL) ? 
copystr : copyinstr)( - (void *)arg, mname, sizeof (mname), NULL); - if (error == ENAMETOOLONG) - error = EINVAL; - - if (error == 0 && strcmp(mname, sockmod_name) == 0) - *rvalp = 1; - } - return (error); - - default: - panic("socktpi_plumbioctl: unknown ioctl %d", cmd); - break; - } - - return (0); -} - -/* - * Allow any flags. Record FNDELAY and FNONBLOCK so that they can be inherited - * from listener to acceptor. - */ -/* ARGSUSED */ -int -socktpi_setfl(vnode_t *vp, int oflags, int nflags, cred_t *cr, - caller_context_t *ct) -{ - struct sonode *so; - int error = 0; - - so = VTOSO(vp); - - dprintso(so, 0, ("socktpi_setfl: oflags 0x%x, nflags 0x%x, state %s\n", - oflags, nflags, pr_state(so->so_state, so->so_mode))); - mutex_enter(&so->so_lock); - if (nflags & FNDELAY) - so->so_state |= SS_NDELAY; - else - so->so_state &= ~SS_NDELAY; - if (nflags & FNONBLOCK) - so->so_state |= SS_NONBLOCK; - else - so->so_state &= ~SS_NONBLOCK; - mutex_exit(&so->so_lock); - - /* - * Sets/clears the SS_ASYNC flag based on the presence/absence - * of the FASYNC flag passed to fcntl(F_SETFL). - * This exists solely for BSD fcntl() FASYNC compatibility. - */ - so = VTOSO(vp->v_stream->sd_vnode); - - if (so->so_version != SOV_STREAM) { - mutex_enter(&so->so_lock); - - /* - * SS_ASYNC flag not already set correctly? - * (!(nflags & FASYNC) != !(so->so_state & SS_ASYNC)) - * but some engineers find that too hard to read. - */ - if ((nflags & FASYNC) == 0 && (so->so_state & SS_ASYNC) != 0 || - (nflags & FASYNC) != 0 && (so->so_state & SS_ASYNC) == 0) - error = so_flip_async(so, SOTOV(so), 0, CRED()); - mutex_exit(&so->so_lock); - } - return (error); -} - -/* - * Get the made up attributes for the vnode. - * 4.3BSD returns the current time for all the timestamps. - * 4.4BSD returns 0 for all the timestamps. - * Here we use the access and modified times recorded in the sonode. - * - * Just like in BSD there is not effect on the underlying file system node - * bound to an AF_UNIX pathname. 
- * - * When sockmod has been popped this will act just like a stream. Since - * a socket is always a clone there is no need to inspect the attributes - * of the "realvp". - */ -/* ARGSUSED */ -int -socktpi_getattr( - struct vnode *vp, - struct vattr *vap, - int flags, - struct cred *cr, - caller_context_t *ct) -{ - dev_t fsid; - struct sonode *so; - static int sonode_shift = 0; - - /* - * Calculate the amount of bitshift to a sonode pointer which will - * still keep it unique. See below. - */ - if (sonode_shift == 0) - sonode_shift = highbit(sizeof (struct sonode)); - ASSERT(sonode_shift > 0); - - so = VTOSO(vp); - fsid = so->so_fsid; - - if (so->so_version == SOV_STREAM) { - /* - * The imaginary "sockmod" has been popped - act - * as a stream - */ - vap->va_type = VCHR; - vap->va_mode = 0; - } else { - vap->va_type = vp->v_type; - vap->va_mode = S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP| - S_IROTH|S_IWOTH; - } - vap->va_uid = vap->va_gid = 0; - vap->va_fsid = fsid; - /* - * If the va_nodeid is > MAX_USHORT, then i386 stats might fail. - * So we shift down the sonode pointer to try and get the most - * uniqueness into 16-bits. - */ - vap->va_nodeid = ((ino_t)so >> sonode_shift) & 0xFFFF; - vap->va_nlink = 0; - vap->va_size = 0; - - /* - * We need to zero out the va_rdev to avoid some fstats getting - * EOVERFLOW. This also mimics SunOS 4.x and BSD behavior. - */ - vap->va_rdev = (dev_t)0; - vap->va_blksize = MAXBSIZE; - vap->va_nblocks = btod(vap->va_size); - - mutex_enter(&so->so_lock); - vap->va_atime.tv_sec = so->so_atime; - vap->va_mtime.tv_sec = so->so_mtime; - vap->va_ctime.tv_sec = so->so_ctime; - mutex_exit(&so->so_lock); - - vap->va_atime.tv_nsec = 0; - vap->va_mtime.tv_nsec = 0; - vap->va_ctime.tv_nsec = 0; - vap->va_seq = 0; - - return (0); -} - -/* - * Set attributes. - * Just like in BSD there is not effect on the underlying file system node - * bound to an AF_UNIX pathname. - * - * When sockmod has been popped this will act just like a stream. 
Since - * a socket is always a clone there is no need to modify the attributes - * of the "realvp". - */ -/* ARGSUSED */ -int -socktpi_setattr( - struct vnode *vp, - struct vattr *vap, - int flags, - struct cred *cr, - caller_context_t *ct) -{ - struct sonode *so = VTOSO(vp); - - /* - * If times were changed, update sonode. - */ - mutex_enter(&so->so_lock); - if (vap->va_mask & AT_ATIME) - so->so_atime = vap->va_atime.tv_sec; - if (vap->va_mask & AT_MTIME) { - so->so_mtime = vap->va_mtime.tv_sec; - so->so_ctime = gethrestime_sec(); - } - mutex_exit(&so->so_lock); - - return (0); -} - -int -socktpi_access(struct vnode *vp, int mode, int flags, struct cred *cr, - caller_context_t *ct) -{ - struct vnode *accessvp; - struct sonode *so = VTOSO(vp); - - if ((accessvp = so->so_accessvp) != NULL) - return (VOP_ACCESS(accessvp, mode, flags, cr, ct)); - else - return (0); /* Allow all access. */ -} - -/* - * 4.3BSD and 4.4BSD fail a fsync on a socket with EINVAL. - * This code does the same to be compatible and also to not give an - * application the impression that the data has actually been "synced" - * to the other end of the connection. - */ -/* ARGSUSED */ -int -socktpi_fsync(struct vnode *vp, int syncflag, struct cred *cr, - caller_context_t *ct) -{ - return (EINVAL); -} - -/* ARGSUSED */ -static void -socktpi_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ct) -{ - struct sonode *so = VTOSO(vp); - - mutex_enter(&vp->v_lock); - /* - * If no one has reclaimed the vnode, remove from the - * cache now. 
- */ - if (vp->v_count < 1) - cmn_err(CE_PANIC, "socktpi_inactive: Bad v_count"); - - /* - * Drop the temporary hold by vn_rele now - */ - if (--vp->v_count != 0) { - mutex_exit(&vp->v_lock); - return; - } - mutex_exit(&vp->v_lock); - - /* We are the sole owner of so now */ - - ASSERT(!vn_has_cached_data(vp)); - sockfree(so); -} - -/* ARGSUSED */ -int -socktpi_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct) -{ - return (EINVAL); -} - -/* - * Sockets are not seekable. - * (and there is a bug to fix STREAMS to make them fail this as well). - */ -/*ARGSUSED*/ -int -socktpi_seek(struct vnode *vp, offset_t ooff, offset_t *noffp, - caller_context_t *ct) -{ - return (ESPIPE); -} - -/* - * Wrapper around the streams poll routine that implements socket poll - * semantics. - * Sockfs never calls pollwakeup() itself - the stream head takes care - * of all pollwakeups. Since sockfs never holds so_lock when calling the - * stream head there can never be a deadlock due to holding so_lock across - * pollwakeup and acquiring so_lock in this routine. - * - * However, since the performance of VOP_POLL is critical we avoid - * acquiring so_lock here. This is based on the following assumptions: - * - The poll implementation holds locks to serialize the VOP_POLL call - * and a pollwakeup for the same pollhead. This ensures that should - * so_state etc change during a socktpi_poll() call, the pollwakeup() - * (which strsock_* and strrput() conspire to issue) is issued after - * the state change. Thus the pollwakeup will block until VOP_POLL has - * returned, and then wake up poll and have it call VOP_POLL again. - * - * - The reading of so_state without holding so_lock does not result in - * stale data (older than the latest state change that has dropped - * so_lock). This is ensured as mutex_exit() issues the appropriate - * memory barrier to force the data into the coherency domain. 
- * - * - Whilst so_state may change during the VOP_POLL call, (SS_HASCONNIND - * may have been set by an arriving connection), the above two factors - * guarantee validity of SS_ISCONNECTED/SM_CONNREQUIRED in the entry - * time snapshot. In order to capture the arrival of a connection while - * VOP_POLL was in progress, we then check real so_state, (so->so_state) - * for SS_HASCONNIND and set appropriate events to ensure poll_common() - * will not sleep. - */ -/*ARGSUSED5*/ -static int -socktpi_poll( - struct vnode *vp, - short events, - int anyyet, - short *reventsp, - struct pollhead **phpp, - caller_context_t *ct) -{ - short origevents = events; - struct sonode *so = VTOSO(vp); - int error; - int so_state = so->so_state; /* snapshot */ - - dprintso(so, 0, ("socktpi_poll(%p): state %s err %d\n", - (void *)vp, pr_state(so_state, so->so_mode), so->so_error)); - - ASSERT(vp->v_type == VSOCK); - ASSERT(vp->v_stream != NULL); - - if (so->so_version == SOV_STREAM) { - /* The imaginary "sockmod" has been popped - act as a stream */ - return (strpoll(vp->v_stream, events, anyyet, - reventsp, phpp)); - } - - if (!(so_state & SS_ISCONNECTED) && - (so->so_mode & SM_CONNREQUIRED)) { - /* Not connected yet - turn off write side events */ - events &= ~(POLLOUT|POLLWRBAND); - } - /* - * Check for errors without calling strpoll if the caller wants them. - * In sockets the errors are represented as input/output events - * and there is no need to ask the stream head for this information. - */ - if (so->so_error != 0 && - ((POLLIN|POLLRDNORM|POLLOUT) & origevents) != 0) { - *reventsp = (POLLIN|POLLRDNORM|POLLOUT) & origevents; - return (0); - } - /* - * Ignore M_PROTO only messages such as the T_EXDATA_IND messages. - * These message with only an M_PROTO/M_PCPROTO part and no M_DATA - * will not trigger a POLLIN event with POLLRDDATA set. - * The handling of urgent data (causing POLLRDBAND) is done by - * inspecting SS_OOBPEND below. 
- */ - events |= POLLRDDATA; - - /* - * After shutdown(output) a stream head write error is set. - * However, we should not return output events. - */ - events |= POLLNOERR; - error = strpoll(vp->v_stream, events, anyyet, - reventsp, phpp); - if (error) - return (error); - - ASSERT(!(*reventsp & POLLERR)); - - /* - * Notes on T_CONN_IND handling for sockets. - * - * If strpoll() returned without events, SR_POLLIN is guaranteed - * to be set, ensuring any subsequent strrput() runs pollwakeup(). - * - * Since the so_lock is not held, soqueueconnind() may have run - * and a T_CONN_IND may be waiting. We now check for SS_HASCONNIND - * in the current so_state and set appropriate events to ensure poll - * returns. - * - * However: - * If the T_CONN_IND hasn't arrived by the time strpoll() returns, - * when strrput() does run for an arriving M_PROTO with T_CONN_IND - * the following actions will occur; taken together they ensure the - * syscall will return. - * - * 1. If a socket, soqueueconnind() will set SS_HASCONNIND but if - * the accept() was run on a non-blocking socket sowaitconnind() - * may have already returned EWOULDBLOCK, so not be waiting to - * process the message. Additionally socktpi_poll() has probably - * proceeded past the SS_HASCONNIND check below. - * 2. strrput() runs pollwakeup()->pollnotify()->cv_signal() to wake - * this thread, however that could occur before poll_common() - * has entered cv_wait. - * 3. pollnotify() sets T_POLLWAKE, while holding the pc_lock. - * - * Before proceeding to cv_wait() in poll_common() for an event, - * poll_common() atomically checks for T_POLLWAKE under the pc_lock, - * and if set, re-calls strpoll() to ensure the late arriving - * T_CONN_IND is recognized, and pollsys() returns. 
- */ - if (so->so_state & (SS_HASCONNIND|SS_OOBPEND)) { - if (so->so_state & SS_HASCONNIND) - *reventsp |= (POLLIN|POLLRDNORM) & events; - if (so->so_state & SS_OOBPEND) - *reventsp |= POLLRDBAND & events; - } - - if (so->so_nl7c_rcv_mp != NULL) { - *reventsp |= (POLLIN|POLLRDNORM) & events; - } - if ((so->so_nl7c_flags & NL7C_ENABLED) && - ((POLLIN|POLLRDNORM) & *reventsp)) { - so->so_nl7c_flags |= NL7C_POLLIN; - } - - return (0); -} - -/* - * Wrapper for getmsg. If the socket has been converted to a stream - * pass the request to the stream head. - */ -int -sock_getmsg( - struct vnode *vp, - struct strbuf *mctl, - struct strbuf *mdata, - uchar_t *prip, - int *flagsp, - int fmode, - rval_t *rvp -) -{ - struct sonode *so; - - ASSERT(vp->v_type == VSOCK); - /* - * Use the stream head to find the real socket vnode. - * This is needed when namefs sits above sockfs. Some - * sockets (like SCTP) are not streams. - */ - if (!vp->v_stream) { - return (ENOSTR); - } - ASSERT(vp->v_stream->sd_vnode); - vp = vp->v_stream->sd_vnode; - ASSERT(vn_matchops(vp, socktpi_vnodeops)); - so = VTOSO(vp); - - dprintso(so, 1, ("sock_getmsg(%p) %s\n", - (void *)so, pr_state(so->so_state, so->so_mode))); - - if (so->so_version == SOV_STREAM) { - /* The imaginary "sockmod" has been popped - act as a stream */ - return (strgetmsg(vp, mctl, mdata, prip, flagsp, fmode, rvp)); - } - eprintsoline(so, ENOSTR); - return (ENOSTR); -} - -/* - * Wrapper for putmsg. If the socket has been converted to a stream - * pass the request to the stream head. - * - * Note that a while a regular socket (SOV_SOCKSTREAM) does support the - * streams ioctl set it does not support putmsg and getmsg. - * Allowing putmsg would prevent sockfs from tracking the state of - * the socket/transport and would also invalidate the locking in sockfs. 
- */ -int -sock_putmsg( - struct vnode *vp, - struct strbuf *mctl, - struct strbuf *mdata, - uchar_t pri, - int flag, - int fmode -) -{ - struct sonode *so; - - ASSERT(vp->v_type == VSOCK); - /* - * Use the stream head to find the real socket vnode. - * This is needed when namefs sits above sockfs. - */ - if (!vp->v_stream) { - return (ENOSTR); - } - ASSERT(vp->v_stream->sd_vnode); - vp = vp->v_stream->sd_vnode; - ASSERT(vn_matchops(vp, socktpi_vnodeops)); - so = VTOSO(vp); - - dprintso(so, 1, ("sock_putmsg(%p) %s\n", - (void *)so, pr_state(so->so_state, so->so_mode))); - - if (so->so_version == SOV_STREAM) { - /* The imaginary "sockmod" has been popped - act as a stream */ - return (strputmsg(vp, mctl, mdata, pri, flag, fmode)); - } - eprintsoline(so, ENOSTR); - return (ENOSTR); -} - -/* - * Special function called only from f_getfl(). - * Returns FASYNC if the SS_ASYNC flag is set on a socket, else 0. - * No locks are acquired here, so it is safe to use while uf_lock is held. - * This exists solely for BSD fcntl() FASYNC compatibility. - */ -int -sock_getfasync(vnode_t *vp) -{ - struct sonode *so; - - ASSERT(vp->v_type == VSOCK); - so = VTOSO(vp->v_stream->sd_vnode); - if (so->so_version == SOV_STREAM || !(so->so_state & SS_ASYNC)) - return (0); - return (FASYNC); -} diff --git a/usr/src/uts/common/inet/inetddi.c b/usr/src/uts/common/inet/inetddi.c index 48a9e3aa2e..6b0cd5839a 100644 --- a/usr/src/uts/common/inet/inetddi.c +++ b/usr/src/uts/common/inet/inetddi.c @@ -23,7 +23,6 @@ * Use is subject to license terms. */ - #include <sys/types.h> #include <sys/stat.h> #include <sys/stream.h> @@ -57,12 +56,23 @@ * * Drivers that need to masquerade as IP should set INET_DEVMTFLAGS to * IP_DEVMTFLAGS and set INET_DEVSTRTAB to ipinfo. 
+ * + * The symbols that all socket modules must define are: + * + * INET_SOCKDESC The one-line description for this socket module + * INET_SOCK_PROTO_CREATE_FUNC The function used to create PCBs + * + * In addition, socket modules that can be converted to TPI must define: + * + * INET_SOCK_PROTO_FB_FUNC The function used to fallback to TPI */ #if !defined(INET_NAME) #error inetddi.c: INET_NAME is not defined! -#elif !defined(INET_DEVDESC) && !defined(INET_MODDESC) -#error inetddi.c: at least one of INET_DEVDESC or INET_MODDESC must be defined! +#elif !defined(INET_DEVDESC) && !defined(INET_MODDESC) && \ + !defined(INET_SOCKDESC) +#error inetddi.c: at least one of INET_DEVDESC or INET_MODDESC or \ +INET_SOCKDESC must be defined! #elif defined(INET_DEVDESC) && !defined(INET_DEVSTRTAB) #error inetddi.c: INET_DEVDESC is defined but INET_DEVSTRTAB is not! #elif defined(INET_DEVDESC) && !defined(INET_DEVMTFLAGS) @@ -73,6 +83,11 @@ #error inetddi.c: INET_MODDESC is defined but INET_MODSTRTAB is not! #elif defined(INET_MODDESC) && !defined(INET_MODMTFLAGS) #error inetddi.c: INET_MODDESC is defined but INET_MODMTFLAGS is not! +#elif defined(INET_SOCKDESC) && !defined(SOCKMOD_VERSION) +#error inetddi.c: INET_SOCKDESC is defined but SOCKMOD_VERSION is not! +#elif defined(INET_SOCKDESC) && !defined(INET_SOCK_PROTO_CREATE_FUNC) +#error inetddi.c: INET_SOCKDESC is defined but INET_SOCK_PROTO_CREATE_FUNC \ +is not! 
#endif #ifdef INET_DEVDESC @@ -192,8 +207,39 @@ static struct modlstrmod modlstrmod = { INET_MODDESC, &fsw }; + #endif /* INET_MODDESC */ +#ifdef INET_SOCKDESC + +#ifdef INET_SOCK_PROTO_FB_FUNC +static __smod_priv_t smodpriv = { + NULL, + NULL, + INET_SOCK_PROTO_FB_FUNC +}; +#endif /* INET_SOCK_PROTO_FB_FUNC */ + +static struct smod_reg_s smodreg = { + SOCKMOD_VERSION, + INET_NAME, + SOCK_UC_VERSION, + SOCK_DC_VERSION, + INET_SOCK_PROTO_CREATE_FUNC, +#ifdef INET_SOCK_PROTO_FB_FUNC + &smodpriv +#else + NULL +#endif /* INET_SOCK_PROTO_FB_FUNC */ +}; + +static struct modlsockmod modlsockmod = { + &mod_sockmodops, + INET_SOCKDESC, + &smodreg +}; +#endif /* INET_SOCKDESC */ + static struct modlinkage modlinkage = { MODREV_1, #ifdef INET_DEVDESC @@ -202,5 +248,8 @@ static struct modlinkage modlinkage = { #ifdef INET_MODDESC &modlstrmod, #endif +#ifdef INET_SOCKDESC + &modlsockmod, +#endif NULL }; diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h index c7ccff8a14..323c8fd0de 100644 --- a/usr/src/uts/common/inet/ip.h +++ b/usr/src/uts/common/inet/ip.h @@ -2771,7 +2771,7 @@ typedef struct ip_pktinfo { #define ILL_LOOKUP_FAILED 1 /* Used as error code */ #define IPIF_LOOKUP_FAILED 2 /* Used as error code */ -#define ILL_CAN_LOOKUP(ill) \ +#define ILL_CAN_LOOKUP(ill) \ (!((ill)->ill_state_flags & (ILL_CONDEMNED | ILL_CHANGING)) || \ IAM_WRITER_ILL(ill)) @@ -2781,7 +2781,7 @@ typedef struct ip_pktinfo { #define ILL_CAN_LOOKUP_WALKER(ill) \ (!((ill)->ill_state_flags & ILL_CONDEMNED)) -#define IPIF_CAN_LOOKUP(ipif) \ +#define IPIF_CAN_LOOKUP(ipif) \ (!((ipif)->ipif_state_flags & (IPIF_CONDEMNED | IPIF_CHANGING)) || \ IAM_WRITER_IPIF(ipif)) @@ -3166,11 +3166,15 @@ extern void icmp_unreachable(queue_t *, mblk_t *, uint8_t, zoneid_t, ip_stack_t *); extern mblk_t *ip_add_info(mblk_t *, ill_t *, uint_t, zoneid_t, ip_stack_t *); extern mblk_t *ip_bind_v4(queue_t *, mblk_t *, conn_t *); -extern int ip_bind_connected(conn_t *, mblk_t *, ipaddr_t *, uint16_t, - 
ipaddr_t, uint16_t, boolean_t, boolean_t, boolean_t, boolean_t); -extern boolean_t ip_bind_ipsec_policy_set(conn_t *, mblk_t *); -extern int ip_bind_laddr(conn_t *, mblk_t *, ipaddr_t, uint16_t, - boolean_t, boolean_t, boolean_t); +extern boolean_t ip_bind_ipsec_policy_set(conn_t *, mblk_t *); +extern int ip_bind_laddr_v4(conn_t *, mblk_t **, uint8_t, ipaddr_t, + uint16_t, boolean_t); +extern int ip_proto_bind_laddr_v4(conn_t *, mblk_t **, uint8_t, ipaddr_t, + uint16_t, boolean_t); +extern int ip_proto_bind_connected_v4(conn_t *, mblk_t **, + uint8_t, ipaddr_t *, uint16_t, ipaddr_t, uint16_t, boolean_t, boolean_t); +extern int ip_bind_connected_v4(conn_t *, mblk_t **, uint8_t, ipaddr_t *, + uint16_t, ipaddr_t, uint16_t, boolean_t, boolean_t); extern uint_t ip_cksum(mblk_t *, int, uint32_t); extern int ip_close(queue_t *, int); extern uint16_t ip_csum_hdr(ipha_t *); @@ -3308,7 +3312,7 @@ extern boolean_t ip_md_hcksum_attr(struct multidata_s *, struct pdesc_s *, uint32_t, uint32_t, uint32_t, uint32_t); extern boolean_t ip_md_zcopy_attr(struct multidata_s *, struct pdesc_s *, uint_t); -extern mblk_t *ip_unbind(queue_t *, mblk_t *); +extern void ip_unbind(conn_t *connp); extern phyint_t *phyint_lookup_group(char *, boolean_t, ip_stack_t *); extern phyint_t *phyint_lookup_group_ifindex(uint_t, ip_stack_t *); @@ -3577,7 +3581,6 @@ extern void ip_squeue_quiesce_ring(ill_t *, ill_rx_ring_t *); extern void ip_squeue_restart_ring(ill_t *, ill_rx_ring_t *); extern void ip_squeue_clean_all(ill_t *); -extern void ip_resume_tcp_bind(void *, mblk_t *, void *); extern void tcp_wput(queue_t *, mblk_t *); extern int ip_fill_mtuinfo(struct in6_addr *, in_port_t, @@ -3635,6 +3638,8 @@ typedef void (*ipsq_func_t)(ipsq_t *, queue_t *, mblk_t *, void *); #define SQTAG_IP_INPUT_RX_RING 39 #define SQTAG_SQUEUE_CHANGE 40 #define SQTAG_CONNECT_FINISH 41 +#define SQTAG_SYNCHRONOUS_OP 42 +#define SQTAG_TCP_SHUTDOWN_OUTPUT 43 #define NOT_OVER_IP(ip_wq) \ (ip_wq->q_next != NULL || \ @@ -3643,6 
+3648,7 @@ typedef void (*ipsq_func_t)(ipsq_t *, queue_t *, mblk_t *, void *); IP_MOD_NAME) != 0 || \ ip_wq->q_qinfo->qi_minfo->mi_idnum != IP_MOD_ID) +#define PROTO_FLOW_CNTRLD(connp) (connp->conn_flow_cntrld) #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/inet/ip/icmp.c b/usr/src/uts/common/inet/ip/icmp.c index 90cc6a51d5..c728a687d4 100644 --- a/usr/src/uts/common/inet/ip/icmp.c +++ b/usr/src/uts/common/inet/ip/icmp.c @@ -43,7 +43,9 @@ #include <sys/zone.h> #include <sys/time.h> +#include <sys/sockio.h> #include <sys/socket.h> +#include <sys/socketvar.h> #include <sys/isa_defs.h> #include <sys/suntpi.h> #include <sys/xti_inet.h> @@ -58,7 +60,7 @@ #include <inet/common.h> #include <inet/ip.h> #include <inet/ip6.h> -#include <inet/mi.h> +#include <inet/proto_set.h> #include <inet/nd.h> #include <inet/optcom.h> #include <inet/snmpcom.h> @@ -78,6 +80,7 @@ #include <inet/ip_if.h> #include <inet/ip_impl.h> +#include <sys/disp.h> /* * Synchronization notes: @@ -99,41 +102,34 @@ */ static void icmp_addr_req(queue_t *q, mblk_t *mp); -static void icmp_bind(queue_t *q, mblk_t *mp); -static void icmp_bind_proto(queue_t *q); -static void icmp_bind_result(conn_t *, mblk_t *); -static void icmp_bind_ack(conn_t *, mblk_t *mp); -static void icmp_bind_error(conn_t *, mblk_t *mp); +static void icmp_tpi_bind(queue_t *q, mblk_t *mp); +static int icmp_bind_proto(conn_t *connp); static int icmp_build_hdrs(icmp_t *icmp); static void icmp_capability_req(queue_t *q, mblk_t *mp); -static int icmp_close(queue_t *q); -static void icmp_connect(queue_t *q, mblk_t *mp); -static void icmp_disconnect(queue_t *q, mblk_t *mp); +static int icmp_close(queue_t *q, int flags); +static void icmp_tpi_connect(queue_t *q, mblk_t *mp); +static void icmp_tpi_disconnect(queue_t *q, mblk_t *mp); static void icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error); static void icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive, t_scalar_t t_error, int 
sys_error); -static void icmp_icmp_error(queue_t *q, mblk_t *mp); -static void icmp_icmp_error_ipv6(queue_t *q, mblk_t *mp); +static void icmp_icmp_error(conn_t *connp, mblk_t *mp); +static void icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp); static void icmp_info_req(queue_t *q, mblk_t *mp); static void icmp_input(void *, mblk_t *, void *); -static mblk_t *icmp_ip_bind_mp(icmp_t *icmp, t_scalar_t bind_prim, - t_scalar_t addr_length, in_port_t); -static int icmp_open(queue_t *q, dev_t *devp, int flag, int sflag, - cred_t *credp, boolean_t isv6); +static conn_t *icmp_open(int family, cred_t *credp, int *err, int flags); static int icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp); static int icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp); -static void icmp_output(queue_t *q, mblk_t *mp); static int icmp_unitdata_opt_process(queue_t *q, mblk_t *mp, int *errorp, void *thisdg_attrs); static boolean_t icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name); -int icmp_opt_set(queue_t *q, uint_t optset_context, +int icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, - void *thisdg_attrs, cred_t *cr, mblk_t *mblk); -int icmp_opt_get(queue_t *q, int level, int name, + void *thisdg_attrs, cred_t *cr); +int icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr); static int icmp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr); static boolean_t icmp_param_register(IDP *ndp, icmpparam_t *icmppa, int cnt); @@ -144,10 +140,13 @@ static int icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name, static int icmp_status_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr); static void icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err); -static void icmp_unbind(queue_t *q, mblk_t *mp); +static void icmp_tpi_unbind(queue_t *q, mblk_t *mp); static void icmp_wput(queue_t *q, mblk_t *mp); -static void icmp_wput_ipv6(queue_t 
*q, mblk_t *mp, sin6_t *sin6, - t_scalar_t tudr_optlen); +static void icmp_wput_fallback(queue_t *q, mblk_t *mp); +static int raw_ip_send_data_v6(queue_t *q, conn_t *connp, mblk_t *mp, + sin6_t *sin6, ip6_pkt_t *ipp); +static int raw_ip_send_data_v4(queue_t *q, conn_t *connp, mblk_t *mp, + ipaddr_t v4dst, ip4_pkt_t *pktinfop); static void icmp_wput_other(queue_t *q, mblk_t *mp); static void icmp_wput_iocdata(queue_t *q, mblk_t *mp); static void icmp_wput_restricted(queue_t *q, mblk_t *mp); @@ -158,7 +157,16 @@ static void rawip_stack_fini(netstackid_t stackid, void *arg); static void *rawip_kstat_init(netstackid_t stackid); static void rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp); static int rawip_kstat_update(kstat_t *kp, int rw); +static void rawip_stack_shutdown(netstackid_t stackid, void *arg); +static int rawip_do_getsockname(icmp_t *icmp, struct sockaddr *sa, + uint_t *salenp); +static int rawip_do_getpeername(icmp_t *icmp, struct sockaddr *sa, + uint_t *salenp); +int rawip_getsockname(sock_lower_handle_t, struct sockaddr *, + socklen_t *, cred_t *); +int rawip_getpeername(sock_lower_handle_t, struct sockaddr *, + socklen_t *, cred_t *); static struct module_info icmp_mod_info = { 5707, "icmp", 1, INFPSZ, 512, 128 @@ -177,7 +185,12 @@ static struct qinit icmprinitv6 = { }; static struct qinit icmpwinit = { - (pfi_t)icmp_wput, (pfi_t)ip_wsrv, NULL, NULL, NULL, &icmp_mod_info + (pfi_t)icmp_wput, NULL, NULL, NULL, NULL, &icmp_mod_info +}; + +/* ICMP entry point during fallback */ +static struct qinit icmp_fallback_sock_winit = { + (pfi_t)icmp_wput_fallback, NULL, NULL, NULL, NULL, &icmp_mod_info }; /* For AF_INET aka /dev/icmp */ @@ -233,6 +246,11 @@ static icmpparam_t icmp_param_arr[] = { #define is_recv_hiwat is_param_arr[6].icmp_param_value #define is_max_buf is_param_arr[7].icmp_param_value +static int rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len); +static int rawip_do_connect(conn_t *connp, const struct sockaddr *sa, + socklen_t 
len); +static void rawip_post_ip_bind_connect(icmp_t *icmp, mblk_t *ire_mp, int error); + /* * This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message * passed to icmp_wput. @@ -241,14 +259,17 @@ static icmpparam_t icmp_param_arr[] = { * message is returned by ip_bind_v4/v6. */ static void -icmp_bind(queue_t *q, mblk_t *mp) +icmp_tpi_bind(queue_t *q, mblk_t *mp) { + int error; + struct sockaddr *sa; + struct T_bind_req *tbr; + socklen_t len; sin_t *sin; sin6_t *sin6; - mblk_t *mp1; - struct T_bind_req *tbr; - icmp_t *icmp; + icmp_t *icmp; conn_t *connp = Q_TO_CONN(q); + mblk_t *mp1; icmp = connp->conn_icmp; if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { @@ -258,12 +279,14 @@ icmp_bind(queue_t *q, mblk_t *mp) icmp_err_ack(q, mp, TPROTO, 0); return; } + if (icmp->icmp_state != TS_UNBND) { (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, "icmp_bind: bad state, %d", icmp->icmp_state); icmp_err_ack(q, mp, TOUTSTATE, 0); return; } + /* * Reallocate the message to make sure we have enough room for an * address and the protocol type. @@ -274,9 +297,13 @@ icmp_bind(queue_t *q, mblk_t *mp) return; } mp = mp1; + + /* Reset the message type in preparation for shipping it back. 
*/ + DB_TYPE(mp) = M_PCPROTO; tbr = (struct T_bind_req *)mp->b_rptr; - switch (tbr->ADDR_length) { - case 0: /* Generic request */ + len = tbr->ADDR_length; + switch (len) { + case 0: /* request for a generic port */ tbr->ADDR_offset = sizeof (struct T_bind_req); if (icmp->icmp_family == AF_INET) { tbr->ADDR_length = sizeof (sin_t); @@ -284,6 +311,8 @@ icmp_bind(queue_t *q, mblk_t *mp) *sin = sin_null; sin->sin_family = AF_INET; mp->b_wptr = (uchar_t *)&sin[1]; + sa = (struct sockaddr *)sin; + len = sizeof (sin_t); } else { ASSERT(icmp->icmp_family == AF_INET6); tbr->ADDR_length = sizeof (sin6_t); @@ -291,39 +320,21 @@ icmp_bind(queue_t *q, mblk_t *mp) *sin6 = sin6_null; sin6->sin6_family = AF_INET6; mp->b_wptr = (uchar_t *)&sin6[1]; + sa = (struct sockaddr *)sin6; + len = sizeof (sin6_t); } break; - case sizeof (sin_t): /* Complete IP address */ - sin = (sin_t *)mi_offset_param(mp, tbr->ADDR_offset, + + case sizeof (sin_t): /* Complete IPv4 address */ + sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset, sizeof (sin_t)); - if (sin == NULL || !OK_32PTR((char *)sin)) { - icmp_err_ack(q, mp, TSYSERR, EINVAL); - return; - } - if (icmp->icmp_family != AF_INET || - sin->sin_family != AF_INET) { - icmp_err_ack(q, mp, TSYSERR, EAFNOSUPPORT); - return; - } break; - case sizeof (sin6_t): /* Complete IP address */ - sin6 = (sin6_t *)mi_offset_param(mp, tbr->ADDR_offset, - sizeof (sin6_t)); - if (sin6 == NULL || !OK_32PTR((char *)sin6)) { - icmp_err_ack(q, mp, TSYSERR, EINVAL); - return; - } - if (icmp->icmp_family != AF_INET6 || - sin6->sin6_family != AF_INET6) { - icmp_err_ack(q, mp, TSYSERR, EAFNOSUPPORT); - return; - } - /* No support for mapped addresses on raw sockets */ - if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { - icmp_err_ack(q, mp, TSYSERR, EADDRNOTAVAIL); - return; - } + + case sizeof (sin6_t): /* Complete IPv6 address */ + sa = (struct sockaddr *)mi_offset_param(mp, + tbr->ADDR_offset, sizeof (sin6_t)); break; + default: (void) mi_strlog(q, 1, 
SL_ERROR|SL_TRACE, "icmp_bind: bad ADDR_length %d", tbr->ADDR_length); @@ -331,6 +342,37 @@ icmp_bind(queue_t *q, mblk_t *mp) return; } + error = rawip_do_bind(connp, sa, len); +done: + ASSERT(mp->b_cont == NULL); + if (error != 0) { + if (error > 0) { + icmp_err_ack(q, mp, TSYSERR, error); + } else { + icmp_err_ack(q, mp, -error, 0); + } + } else { + tbr->PRIM_type = T_BIND_ACK; + qreply(q, mp); + } +} + +static int +rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len) +{ + sin_t *sin; + sin6_t *sin6; + icmp_t *icmp; + int error = 0; + mblk_t *ire_mp; + + + icmp = connp->conn_icmp; + + if (sa == NULL || !OK_32PTR((char *)sa)) { + return (EINVAL); + } + /* * The state must be TS_UNBND. TPI mandates that users must send * TPI primitives only 1 at a time and wait for the response before @@ -338,24 +380,53 @@ icmp_bind(queue_t *q, mblk_t *mp) */ rw_enter(&icmp->icmp_rwlock, RW_WRITER); if (icmp->icmp_state != TS_UNBND || icmp->icmp_pending_op != -1) { - rw_exit(&icmp->icmp_rwlock); - (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, - "icmp_bind: bad state, %d", icmp->icmp_state); - icmp_err_ack(q, mp, TOUTSTATE, 0); - return; + error = -TOUTSTATE; + goto done; + } + + ASSERT(len != 0); + switch (len) { + case sizeof (sin_t): /* Complete IPv4 address */ + sin = (sin_t *)sa; + if (sin->sin_family != AF_INET || + icmp->icmp_family != AF_INET) { + /* TSYSERR, EAFNOSUPPORT */ + error = EAFNOSUPPORT; + goto done; + } + break; + case sizeof (sin6_t): /* Complete IPv6 address */ + sin6 = (sin6_t *)sa; + if (sin6->sin6_family != AF_INET6 || + icmp->icmp_family != AF_INET6) { + /* TSYSERR, EAFNOSUPPORT */ + error = EAFNOSUPPORT; + goto done; + } + /* No support for mapped addresses on raw sockets */ + if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { + /* TSYSERR, EADDRNOTAVAIL */ + error = EADDRNOTAVAIL; + goto done; + } + break; + + default: + /* TBADADDR */ + error = EADDRNOTAVAIL; + goto done; } - icmp->icmp_pending_op = tbr->PRIM_type; + icmp->icmp_pending_op = 
T_BIND_REQ; + icmp->icmp_state = TS_IDLE; /* * Copy the source address into our icmp structure. This address * may still be zero; if so, ip will fill in the correct address * each time an outbound packet is passed to it. * If we are binding to a broadcast or multicast address then - * icmp_bind_ack will clear the source address when it receives - * the T_BIND_ACK. + * rawip_post_ip_bind_connect will clear the source address. */ - icmp->icmp_state = TS_IDLE; if (icmp->icmp_family == AF_INET) { ASSERT(sin != NULL); @@ -378,147 +449,136 @@ icmp_bind(queue_t *q, mblk_t *mp) error = icmp_build_hdrs(icmp); if (error != 0) { icmp->icmp_pending_op = -1; - rw_exit(&icmp->icmp_rwlock); - icmp_err_ack(q, mp, TSYSERR, error); - return; + /* + * TSYSERR + */ + goto done; } } - /* - * Place protocol type in the O_T_BIND_REQ/T_BIND_REQ following - * the address. - */ - *mp->b_wptr++ = icmp->icmp_proto; + + ire_mp = NULL; if (!(V6_OR_V4_INADDR_ANY(icmp->icmp_v6src))) { /* - * Append a request for an IRE if src not 0 (INADDR_ANY) + * request an IRE if src not 0 (INADDR_ANY) */ - mp->b_cont = allocb(sizeof (ire_t), BPRI_HI); - if (!mp->b_cont) { + ire_mp = allocb(sizeof (ire_t), BPRI_HI); + if (ire_mp == NULL) { icmp->icmp_pending_op = -1; - rw_exit(&icmp->icmp_rwlock); - icmp_err_ack(q, mp, TSYSERR, ENOMEM); - return; + error = ENOMEM; + goto done; } - mp->b_cont->b_wptr += sizeof (ire_t); - mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE; + DB_TYPE(ire_mp) = IRE_DB_REQ_TYPE; } +done: rw_exit(&icmp->icmp_rwlock); + if (error != 0) + return (error); - /* Pass the O_T_BIND_REQ/T_BIND_REQ to ip. 
*/ - if (icmp->icmp_family == AF_INET6) - mp = ip_bind_v6(q, mp, connp, NULL); - else - mp = ip_bind_v4(q, mp, connp); - - /* The above return NULL if the bind needs to be deferred */ - if (mp != NULL) - icmp_bind_result(connp, mp); - else - CONN_INC_REF(connp); + if (icmp->icmp_family == AF_INET6) { + error = ip_proto_bind_laddr_v6(connp, &ire_mp, icmp->icmp_proto, + &sin6->sin6_addr, sin6->sin6_port, B_TRUE); + } else { + error = ip_proto_bind_laddr_v4(connp, &ire_mp, icmp->icmp_proto, + sin->sin_addr.s_addr, sin->sin_port, B_TRUE); + } + rawip_post_ip_bind_connect(icmp, ire_mp, error); + return (error); } -/* - * Send message to IP to just bind to the protocol. - */ static void -icmp_bind_proto(queue_t *q) +rawip_post_ip_bind_connect(icmp_t *icmp, mblk_t *ire_mp, int error) { - mblk_t *mp; - struct T_bind_req *tbr; - icmp_t *icmp; - conn_t *connp = Q_TO_CONN(q); - - icmp = connp->conn_icmp; - - mp = allocb(sizeof (struct T_bind_req) + sizeof (sin6_t) + 1, - BPRI_MED); - if (!mp) { + rw_enter(&icmp->icmp_rwlock, RW_WRITER); + if (icmp->icmp_state == TS_UNBND) { + /* + * not yet bound - bind sent by icmp_bind_proto. + */ + rw_exit(&icmp->icmp_rwlock); return; } - mp->b_datap->db_type = M_PROTO; - tbr = (struct T_bind_req *)mp->b_rptr; - tbr->PRIM_type = O_T_BIND_REQ; /* change to T_BIND_REQ ? 
*/ - tbr->ADDR_offset = sizeof (struct T_bind_req); - - rw_enter(&icmp->icmp_rwlock, RW_WRITER); - if (icmp->icmp_ipversion == IPV4_VERSION) { - sin_t *sin; + ASSERT(icmp->icmp_pending_op != -1); + icmp->icmp_pending_op = -1; - tbr->ADDR_length = sizeof (sin_t); - sin = (sin_t *)&tbr[1]; - *sin = sin_null; - sin->sin_family = AF_INET; - mp->b_wptr = (uchar_t *)&sin[1]; + if (error != 0) { + if (icmp->icmp_state == TS_DATA_XFER) { + /* Connect failed */ + /* Revert back to the bound source */ + icmp->icmp_v6src = icmp->icmp_bound_v6src; + icmp->icmp_state = TS_IDLE; + if (icmp->icmp_family == AF_INET6) + (void) icmp_build_hdrs(icmp); + } else { + V6_SET_ZERO(icmp->icmp_v6src); + V6_SET_ZERO(icmp->icmp_bound_v6src); + icmp->icmp_state = TS_UNBND; + if (icmp->icmp_family == AF_INET6) + (void) icmp_build_hdrs(icmp); + } } else { - sin6_t *sin6; + if (ire_mp != NULL && ire_mp->b_datap->db_type == IRE_DB_TYPE) { + ire_t *ire; - ASSERT(icmp->icmp_ipversion == IPV6_VERSION); - tbr->ADDR_length = sizeof (sin6_t); - sin6 = (sin6_t *)&tbr[1]; - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - mp->b_wptr = (uchar_t *)&sin6[1]; - } + ire = (ire_t *)ire_mp->b_rptr; + /* + * If a broadcast/multicast address was bound set + * the source address to 0. + * This ensures no datagrams with broadcast address + * as source address are emitted (which would violate + * RFC1122 - Hosts requirements) + * Note: we get IRE_BROADCAST for IPv6 + * to "mark" a multicast local address. + */ - /* Place protocol type in the O_T_BIND_REQ following the address. */ - *mp->b_wptr++ = icmp->icmp_proto; - rw_exit(&icmp->icmp_rwlock); - /* Pass the O_T_BIND_REQ to ip. 
*/ - if (icmp->icmp_family == AF_INET6) - mp = ip_bind_v6(q, mp, connp, NULL); - else - mp = ip_bind_v4(q, mp, connp); + if (ire->ire_type == IRE_BROADCAST && + icmp->icmp_state != TS_DATA_XFER) { + /* + * This was just a local bind to a + * MC/broadcast addr + */ + V6_SET_ZERO(icmp->icmp_v6src); + if (icmp->icmp_family == AF_INET6) + (void) icmp_build_hdrs(icmp); + } + } - /* The above return NULL if the bind needs to be deferred */ - if (mp != NULL) - icmp_bind_result(connp, mp); - else - CONN_INC_REF(connp); + } + rw_exit(&icmp->icmp_rwlock); + if (ire_mp != NULL) + freeb(ire_mp); } /* - * This is called from ip_wput_nondata to handle the results of a - * deferred RAWIP bind. It is called once the bind has been completed. + * Send message to IP to just bind to the protocol. */ -void -rawip_resume_bind(conn_t *connp, mblk_t *mp) +static int +icmp_bind_proto(conn_t *connp) { - ASSERT(connp != NULL && IPCL_IS_RAWIP(connp)); + icmp_t *icmp; + int error; + + icmp = connp->conn_icmp; - icmp_bind_result(connp, mp); + if (icmp->icmp_family == AF_INET6) + error = ip_proto_bind_laddr_v6(connp, NULL, icmp->icmp_proto, + &sin6_null.sin6_addr, 0, B_TRUE); + else + error = ip_proto_bind_laddr_v4(connp, NULL, icmp->icmp_proto, + sin_null.sin_addr.s_addr, 0, B_TRUE); - CONN_OPER_PENDING_DONE(connp); + rawip_post_ip_bind_connect(icmp, NULL, error); + return (error); } -/* - * This routine handles each T_CONN_REQ message passed to icmp. It - * associates a default destination address with the stream. - * - * This routine sends down a T_BIND_REQ to IP with the following mblks: - * T_BIND_REQ - specifying local and remote address. - * IRE_DB_REQ_TYPE - to get an IRE back containing ire_type and src - * T_OK_ACK - for the T_CONN_REQ - * T_CONN_CON - to keep the TPI user happy - * - * The connect completes in icmp_bind_result. - * When a T_BIND_ACK is received information is extracted from the IRE - * and the two appended messages are sent to the TPI user. 
- * Should icmp_bind_result receive T_ERROR_ACK for the T_BIND_REQ it will - * convert it to an error ack for the appropriate primitive. - */ static void -icmp_connect(queue_t *q, mblk_t *mp) +icmp_tpi_connect(queue_t *q, mblk_t *mp) { - sin_t *sin; - sin6_t *sin6; - mblk_t *mp1, *mp2; + conn_t *connp = Q_TO_CONN(q); struct T_conn_req *tcr; icmp_t *icmp; - ipaddr_t v4dst; - in6_addr_t v6dst; - uint32_t flowinfo; - conn_t *connp = Q_TO_CONN(q); + struct sockaddr *sa; + socklen_t len; + int error; icmp = connp->conn_icmp; tcr = (struct T_conn_req *)mp->b_rptr; @@ -533,54 +593,111 @@ icmp_connect(queue_t *q, mblk_t *mp) return; } - switch (tcr->DEST_length) { + len = tcr->DEST_length; + + switch (len) { default: icmp_err_ack(q, mp, TBADADDR, 0); return; - case sizeof (sin_t): - sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset, + sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset, sizeof (sin_t)); - if (sin == NULL || !OK_32PTR((char *)sin)) { - icmp_err_ack(q, mp, TSYSERR, EINVAL); - return; - } - if (icmp->icmp_family != AF_INET || - sin->sin_family != AF_INET) { - icmp_err_ack(q, mp, TSYSERR, EAFNOSUPPORT); - return; - } - v4dst = sin->sin_addr.s_addr; - IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); - ASSERT(icmp->icmp_ipversion == IPV4_VERSION); - icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + - icmp->icmp_ip_snd_options_len; break; - case sizeof (sin6_t): - sin6 = (sin6_t *)mi_offset_param(mp, tcr->DEST_offset, - sizeof (sin6_t)); - if (sin6 == NULL || !OK_32PTR((char *)sin6)) { - icmp_err_ack(q, mp, TSYSERR, EINVAL); - return; + sa = (struct sockaddr *)mi_offset_param(mp, + tcr->DEST_offset, sizeof (sin6_t)); + break; + } + + error = proto_verify_ip_addr(icmp->icmp_family, sa, len); + if (error != 0) { + icmp_err_ack(q, mp, TSYSERR, error); + return; + } + + error = rawip_do_connect(connp, sa, len); + if (error != 0) { + if (error < 0) { + icmp_err_ack(q, mp, -error, 0); + } else { + icmp_err_ack(q, mp, 0, error); } - if (icmp->icmp_family != AF_INET6 || - 
sin6->sin6_family != AF_INET6) { - icmp_err_ack(q, mp, TSYSERR, EAFNOSUPPORT); + } else { + mblk_t *mp1; + + /* + * We have to send a connection confirmation to + * keep TLI happy. + */ + if (icmp->icmp_family == AF_INET) { + mp1 = mi_tpi_conn_con(NULL, (char *)sa, + sizeof (sin_t), NULL, 0); + } else { + ASSERT(icmp->icmp_family == AF_INET6); + mp1 = mi_tpi_conn_con(NULL, (char *)sa, + sizeof (sin6_t), NULL, 0); + } + if (mp1 == NULL) { + rw_exit(&icmp->icmp_rwlock); + icmp_err_ack(q, mp, TSYSERR, ENOMEM); return; } - /* No support for mapped addresses on raw sockets */ - if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { - icmp_err_ack(q, mp, TSYSERR, EADDRNOTAVAIL); + + /* + * Send ok_ack for T_CONN_REQ + */ + mp = mi_tpi_ok_ack_alloc(mp); + if (mp == NULL) { + /* Unable to reuse the T_CONN_REQ for the ack. */ + freemsg(mp1); + icmp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM); return; } - v6dst = sin6->sin6_addr; - ASSERT(icmp->icmp_ipversion == IPV6_VERSION); - icmp->icmp_max_hdr_len = icmp->icmp_sticky_hdrs_len; - flowinfo = sin6->sin6_flowinfo; - break; + putnext(connp->conn_rq, mp); + putnext(connp->conn_rq, mp1); + } +} + +static int +rawip_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len) +{ + icmp_t *icmp; + sin_t *sin; + sin6_t *sin6; + mblk_t *ire_mp; + int error; + ipaddr_t v4dst; + in6_addr_t v6dst; + + icmp = connp->conn_icmp; + + if (sa == NULL || !OK_32PTR((char *)sa)) { + return (EINVAL); + } + + ire_mp = allocb(sizeof (ire_t), BPRI_HI); + if (ire_mp == NULL) + return (ENOMEM); + DB_TYPE(ire_mp) = IRE_DB_REQ_TYPE; + + + ASSERT(sa != NULL && len != 0); + + rw_enter(&icmp->icmp_rwlock, RW_WRITER); + if (icmp->icmp_state == TS_UNBND || icmp->icmp_pending_op != -1) { + rw_exit(&icmp->icmp_rwlock); + freeb(ire_mp); + return (-TOUTSTATE); } - if (icmp->icmp_ipversion == IPV4_VERSION) { + + switch (len) { + case sizeof (sin_t): + sin = (sin_t *)sa; + + ASSERT(icmp->icmp_family == AF_INET); + ASSERT(icmp->icmp_ipversion == 
IPV4_VERSION); + + v4dst = sin->sin_addr.s_addr; /* * Interpret a zero destination to mean loopback. * Update the T_CONN_REQ (sin/sin6) since it is used to @@ -588,15 +705,16 @@ icmp_connect(queue_t *q, mblk_t *mp) */ if (v4dst == INADDR_ANY) { v4dst = htonl(INADDR_LOOPBACK); - IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); - if (icmp->icmp_family == AF_INET) { - sin->sin_addr.s_addr = v4dst; - } else { - sin6->sin6_addr = v6dst; - } } - icmp->icmp_v6dst = v6dst; - icmp->icmp_flowinfo = 0; + + IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); + ASSERT(icmp->icmp_ipversion == IPV4_VERSION); + icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + + icmp->icmp_ip_snd_options_len; + icmp->icmp_v6dst.sin6_addr = v6dst; + icmp->icmp_v6dst.sin6_family = AF_INET6; + icmp->icmp_v6dst.sin6_flowinfo = 0; + icmp->icmp_v6dst.sin6_port = 0; /* * If the destination address is multicast and @@ -610,35 +728,42 @@ icmp_connect(queue_t *q, mblk_t *mp) IN6_IPADDR_TO_V4MAPPED(icmp->icmp_multicast_if_addr, &icmp->icmp_v6src); } - } else { + break; + case sizeof (sin6_t): + sin6 = (sin6_t *)sa; + + /* No support for mapped addresses on raw sockets */ + if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { + rw_exit(&icmp->icmp_rwlock); + freeb(ire_mp); + return (EADDRNOTAVAIL); + } + ASSERT(icmp->icmp_ipversion == IPV6_VERSION); + ASSERT(icmp->icmp_family == AF_INET6); + + icmp->icmp_max_hdr_len = icmp->icmp_sticky_hdrs_len; + + icmp->icmp_v6dst = *sin6; + icmp->icmp_v6dst.sin6_port = 0; + /* * Interpret a zero destination to mean loopback. * Update the T_CONN_REQ (sin/sin6) since it is used to * generate the T_CONN_CON. 
*/ - if (IN6_IS_ADDR_UNSPECIFIED(&v6dst)) { - v6dst = ipv6_loopback; - sin6->sin6_addr = v6dst; + if (IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6dst.sin6_addr)) { + icmp->icmp_v6dst.sin6_addr = ipv6_loopback; } - icmp->icmp_v6dst = v6dst; - icmp->icmp_flowinfo = flowinfo; /* * If the destination address is multicast and * an outgoing multicast interface has been set, * then the ip bind logic will pick the correct source * address (i.e. matching the outgoing multicast interface). */ + break; } - rw_enter(&icmp->icmp_rwlock, RW_WRITER); - if (icmp->icmp_state == TS_UNBND || icmp->icmp_pending_op != -1) { - rw_exit(&icmp->icmp_rwlock); - (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, - "icmp_connect: bad state, %d", icmp->icmp_state); - icmp_err_ack(q, mp, TOUTSTATE, 0); - return; - } icmp->icmp_pending_op = T_CONN_REQ; if (icmp->icmp_state == TS_DATA_XFER) { @@ -647,74 +772,22 @@ icmp_connect(queue_t *q, mblk_t *mp) icmp->icmp_state = TS_IDLE; } - /* - * Send down bind to IP to verify that there is a route - * and to determine the source address. - * This will come back as T_BIND_ACK with an IRE_DB_TYPE in rput. - */ - if (icmp->icmp_family == AF_INET) { - mp1 = icmp_ip_bind_mp(icmp, O_T_BIND_REQ, sizeof (ipa_conn_t), - sin->sin_port); - } else { - ASSERT(icmp->icmp_family == AF_INET6); - mp1 = icmp_ip_bind_mp(icmp, O_T_BIND_REQ, sizeof (ipa6_conn_t), - sin6->sin6_port); - } - if (mp1 == NULL) { - icmp->icmp_pending_op = -1; - rw_exit(&icmp->icmp_rwlock); - icmp_err_ack(q, mp, TSYSERR, ENOMEM); - return; - } - - /* - * We also have to send a connection confirmation to - * keep TLI happy. Prepare it for icmp_bind_result. 
- */ - if (icmp->icmp_family == AF_INET) { - mp2 = mi_tpi_conn_con(NULL, (char *)sin, sizeof (*sin), NULL, - 0); - } else { - ASSERT(icmp->icmp_family == AF_INET6); - mp2 = mi_tpi_conn_con(NULL, (char *)sin6, sizeof (*sin6), NULL, - 0); - } - if (mp2 == NULL) { - freemsg(mp1); - icmp->icmp_pending_op = -1; - rw_exit(&icmp->icmp_rwlock); - icmp_err_ack(q, mp, TSYSERR, ENOMEM); - return; - } - - mp = mi_tpi_ok_ack_alloc(mp); - if (mp == NULL) { - /* Unable to reuse the T_CONN_REQ for the ack. */ - freemsg(mp2); - icmp->icmp_pending_op = -1; - rw_exit(&icmp->icmp_rwlock); - icmp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM); - return; - } - icmp->icmp_state = TS_DATA_XFER; rw_exit(&icmp->icmp_rwlock); - /* Hang onto the T_OK_ACK and T_CONN_CON for later. */ - linkb(mp1, mp); - linkb(mp1, mp2); - - mblk_setcred(mp1, connp->conn_cred); - if (icmp->icmp_family == AF_INET) - mp1 = ip_bind_v4(q, mp1, connp); - else - mp1 = ip_bind_v6(q, mp1, connp, NULL); - - /* The above return NULL if the bind needs to be deferred */ - if (mp1 != NULL) - icmp_bind_result(connp, mp1); - else - CONN_INC_REF(connp); + if (icmp->icmp_family == AF_INET6) { + error = ip_proto_bind_connected_v6(connp, &ire_mp, + icmp->icmp_proto, &icmp->icmp_v6src, 0, + &icmp->icmp_v6dst.sin6_addr, + NULL, sin6->sin6_port, B_TRUE, B_TRUE); + } else { + error = ip_proto_bind_connected_v4(connp, &ire_mp, + icmp->icmp_proto, &V4_PART_OF_V6(icmp->icmp_v6src), 0, + V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr), sin->sin_port, + B_TRUE, B_TRUE); + } + rawip_post_ip_bind_connect(icmp, ire_mp, error); + return (error); } static void @@ -733,6 +806,7 @@ icmp_close_free(conn_t *connp) kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t)); icmp->icmp_filter = NULL; } + /* Free memory associated with sticky options */ if (icmp->icmp_sticky_hdrs_len != 0) { kmem_free(icmp->icmp_sticky_hdrs, @@ -754,16 +828,18 @@ icmp_close_free(conn_t *connp) } static int -icmp_close(queue_t *q) +rawip_do_close(conn_t *connp) { - conn_t 
*connp = (conn_t *)q->q_ptr; - ASSERT(connp != NULL && IPCL_IS_RAWIP(connp)); ip_quiesce_conn(connp); - qprocsoff(connp->conn_rq); + if (!IPCL_IS_NONSTR(connp)) { + qprocsoff(connp->conn_rq); + } + ASSERT(connp->conn_icmp->icmp_fallback_queue_head == NULL && + connp->conn_icmp->icmp_fallback_queue_tail == NULL); icmp_close_free(connp); /* @@ -778,11 +854,36 @@ icmp_close(queue_t *q) */ ASSERT(connp->conn_ref == 1); - inet_minor_free(connp->conn_minor_arena, connp->conn_dev); + if (!IPCL_IS_NONSTR(connp)) { + inet_minor_free(connp->conn_minor_arena, connp->conn_dev); + } else { + ip_close_helper_stream(connp); + } connp->conn_ref--; ipcl_conn_destroy(connp); + return (0); +} + +static int +icmp_close(queue_t *q, int flags) +{ + conn_t *connp; + + if (flags & SO_FALLBACK) { + /* + * stream is being closed while in fallback + * simply free the resources that were allocated + */ + inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr)); + qprocsoff(q); + goto done; + } + + connp = Q_TO_CONN(q); + (void) rawip_do_close(connp); +done: q->q_ptr = WR(q)->q_ptr = NULL; return (0); } @@ -793,88 +894,102 @@ icmp_close(queue_t *q) * in sending a T_BIND_REQ to IP to restore the binding to just * the local address. * - * This routine sends down a T_BIND_REQ to IP with the following mblks: - * T_BIND_REQ - specifying just the local address. - * T_OK_ACK - for the T_DISCON_REQ - * - * The disconnect completes in icmp_bind_result. - * When a T_BIND_ACK is received the appended T_OK_ACK is sent to the TPI user. - * Should icmp_bind_result receive T_ERROR_ACK for the T_BIND_REQ it will - * convert it to an error ack for the appropriate primitive. + * The disconnect completes in rawip_post_ip_bind_connect. 
*/ -static void -icmp_disconnect(queue_t *q, mblk_t *mp) +static int +icmp_do_disconnect(conn_t *connp) { icmp_t *icmp; - mblk_t *mp1; - conn_t *connp = Q_TO_CONN(q); + mblk_t *ire_mp; + int error; icmp = connp->conn_icmp; rw_enter(&icmp->icmp_rwlock, RW_WRITER); if (icmp->icmp_state != TS_DATA_XFER || icmp->icmp_pending_op != -1) { rw_exit(&icmp->icmp_rwlock); - (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, - "icmp_disconnect: bad state, %d", icmp->icmp_state); - icmp_err_ack(q, mp, TOUTSTATE, 0); - return; + return (-TOUTSTATE); } icmp->icmp_pending_op = T_DISCON_REQ; icmp->icmp_v6src = icmp->icmp_bound_v6src; icmp->icmp_state = TS_IDLE; - /* - * Send down bind to IP to remove the full binding and revert - * to the local address binding. - */ - if (icmp->icmp_family == AF_INET) { - mp1 = icmp_ip_bind_mp(icmp, O_T_BIND_REQ, sizeof (sin_t), 0); - } else { - ASSERT(icmp->icmp_family == AF_INET6); - mp1 = icmp_ip_bind_mp(icmp, O_T_BIND_REQ, sizeof (sin6_t), 0); - } - if (mp1 == NULL) { - icmp->icmp_pending_op = -1; - rw_exit(&icmp->icmp_rwlock); - icmp_err_ack(q, mp, TSYSERR, ENOMEM); - return; - } - mp = mi_tpi_ok_ack_alloc(mp); - if (mp == NULL) { - /* Unable to reuse the T_DISCON_REQ for the ack. 
*/ - icmp->icmp_pending_op = -1; - rw_exit(&icmp->icmp_rwlock); - icmp_err_ack_prim(q, mp1, T_DISCON_REQ, TSYSERR, ENOMEM); - return; - } if (icmp->icmp_family == AF_INET6) { - int error; - /* Rebuild the header template */ error = icmp_build_hdrs(icmp); if (error != 0) { icmp->icmp_pending_op = -1; rw_exit(&icmp->icmp_rwlock); - icmp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, error); - freemsg(mp1); - return; + return (error); } } rw_exit(&icmp->icmp_rwlock); - /* Append the T_OK_ACK to the T_BIND_REQ for icmp_bind_result */ - linkb(mp1, mp); + ire_mp = allocb(sizeof (ire_t), BPRI_HI); + if (ire_mp == NULL) { + return (ENOMEM); + } - if (icmp->icmp_family == AF_INET6) - mp1 = ip_bind_v6(q, mp1, connp, NULL); - else - mp1 = ip_bind_v4(q, mp1, connp); + if (icmp->icmp_family == AF_INET6) { + error = ip_proto_bind_laddr_v6(connp, &ire_mp, icmp->icmp_proto, + &icmp->icmp_bound_v6src, 0, B_TRUE); + } else { - /* The above return NULL if the bind needs to be deferred */ - if (mp1 != NULL) - icmp_bind_result(connp, mp1); - else - CONN_INC_REF(connp); + error = ip_proto_bind_laddr_v4(connp, &ire_mp, icmp->icmp_proto, + V4_PART_OF_V6(icmp->icmp_bound_v6src), 0, B_TRUE); + } + + rawip_post_ip_bind_connect(icmp, ire_mp, error); + + return (error); +} + +static void +icmp_tpi_disconnect(queue_t *q, mblk_t *mp) +{ + conn_t *connp = Q_TO_CONN(q); + int error; + + /* + * Allocate the largest primitive we need to send back + * T_error_ack is > than T_ok_ack + */ + mp = reallocb(mp, sizeof (struct T_error_ack), 1); + if (mp == NULL) { + /* Unable to reuse the T_DISCON_REQ for the ack. 
*/ + icmp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, ENOMEM); + return; + } + + error = icmp_do_disconnect(connp); + + if (error != 0) { + if (error > 0) { + icmp_err_ack(q, mp, 0, error); + } else { + icmp_err_ack(q, mp, -error, 0); + } + } else { + mp = mi_tpi_ok_ack_alloc(mp); + ASSERT(mp != NULL); + qreply(q, mp); + } + +} + +static int +icmp_disconnect(conn_t *connp) +{ + int error; + icmp_t *icmp = connp->conn_icmp; + + icmp->icmp_dgram_errind = B_FALSE; + + error = icmp_do_disconnect(connp); + + if (error < 0) + error = proto_tlitosyserr(-error); + return (error); } /* This routine creates a T_ERROR_ACK message and passes it upstream. */ @@ -905,22 +1020,20 @@ icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive, /* * icmp_icmp_error is called by icmp_input to process ICMP * messages passed up by IP. - * Generates the appropriate T_UDERROR_IND for permanent - * (non-transient) errors. + * Generates the appropriate permanent (non-transient) errors. * Assumes that IP has pulled up everything up to and including * the ICMP header. */ static void -icmp_icmp_error(queue_t *q, mblk_t *mp) +icmp_icmp_error(conn_t *connp, mblk_t *mp) { icmph_t *icmph; ipha_t *ipha; int iph_hdr_length; sin_t sin; - sin6_t sin6; mblk_t *mp1; int error = 0; - icmp_t *icmp = Q_TO_ICMP(q); + icmp_t *icmp = connp->conn_icmp; ipha = (ipha_t *)mp->b_rptr; @@ -928,10 +1041,19 @@ icmp_icmp_error(queue_t *q, mblk_t *mp) if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) { ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION); - icmp_icmp_error_ipv6(q, mp); + icmp_icmp_error_ipv6(connp, mp); return; } - ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); + + /* + * icmp does not support v4 mapped addresses + * so we can never be here for a V6 socket + * i.e. 
icmp_family == AF_INET6 + */ + ASSERT((IPH_HDR_VERSION(ipha) == IPV4_VERSION) && + (icmp->icmp_family == AF_INET)); + + ASSERT(icmp->icmp_family == AF_INET); /* Skip past the outer IP and ICMP headers */ iph_hdr_length = IPH_HDR_LENGTH(ipha); @@ -974,25 +1096,32 @@ icmp_icmp_error(queue_t *q, mblk_t *mp) return; } - switch (icmp->icmp_family) { - case AF_INET: - sin = sin_null; - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = ipha->ipha_dst; - mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL, 0, - error); - break; - case AF_INET6: - sin6 = sin6_null; - sin6.sin6_family = AF_INET6; - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &sin6.sin6_addr); + sin = sin_null; + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = ipha->ipha_dst; + if (IPCL_IS_NONSTR(connp)) { + rw_enter(&icmp->icmp_rwlock, RW_WRITER); + if (icmp->icmp_state == TS_DATA_XFER) { + if (sin.sin_addr.s_addr == + V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr)) { + rw_exit(&icmp->icmp_rwlock); + (*connp->conn_upcalls->su_set_error) + (connp->conn_upper_handle, error); + goto done; + } + } else { + icmp->icmp_delayed_error = error; + *((sin_t *)&icmp->icmp_delayed_addr) = sin; + } + rw_exit(&icmp->icmp_rwlock); + } else { - mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t), - NULL, 0, error); - break; + mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL, + 0, error); + if (mp1 != NULL) + putnext(connp->conn_rq, mp1); } - if (mp1) - putnext(q, mp1); +done: freemsg(mp); } @@ -1004,7 +1133,7 @@ icmp_icmp_error(queue_t *q, mblk_t *mp) * as the ICMPv6 header. 
*/ static void -icmp_icmp_error_ipv6(queue_t *q, mblk_t *mp) +icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) { icmp6_t *icmp6; ip6_t *ip6h, *outer_ip6h; @@ -1013,7 +1142,7 @@ icmp_icmp_error_ipv6(queue_t *q, mblk_t *mp) sin6_t sin6; mblk_t *mp1; int error = 0; - icmp_t *icmp = Q_TO_ICMP(q); + icmp_t *icmp = connp->conn_icmp; outer_ip6h = (ip6_t *)mp->b_rptr; if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6) @@ -1085,7 +1214,7 @@ icmp_icmp_error_ipv6(queue_t *q, mblk_t *mp) sin6 = (sin6_t *)&tudi[1]; bzero(sin6, sizeof (sin6_t)); sin6->sin6_family = AF_INET6; - sin6->sin6_addr = icmp->icmp_v6dst; + sin6->sin6_addr = icmp->icmp_v6dst.sin6_addr; toh = (struct T_opthdr *)&sin6[1]; toh->level = IPPROTO_IPV6; @@ -1103,7 +1232,14 @@ icmp_icmp_error_ipv6(queue_t *q, mblk_t *mp) * message. Free it, then send our empty message. */ freemsg(mp); - putnext(q, newmp); + if (!IPCL_IS_NONSTR(connp)) { + putnext(connp->conn_rq, newmp); + } else { + (*connp->conn_upcalls->su_recv) + (connp->conn_upper_handle, newmp, 0, 0, &error, + NULL); + ASSERT(error == 0); + } return; } case ICMP6_TIME_EXCEEDED: @@ -1138,10 +1274,29 @@ icmp_icmp_error_ipv6(queue_t *q, mblk_t *mp) sin6.sin6_addr = ip6h->ip6_dst; sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; - mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t), NULL, 0, - error); - if (mp1) - putnext(q, mp1); + if (IPCL_IS_NONSTR(connp)) { + rw_enter(&icmp->icmp_rwlock, RW_WRITER); + if (icmp->icmp_state == TS_DATA_XFER) { + if (IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr, + &icmp->icmp_v6dst.sin6_addr)) { + rw_exit(&icmp->icmp_rwlock); + (*connp->conn_upcalls->su_set_error) + (connp->conn_upper_handle, error); + goto done; + } + } else { + icmp->icmp_delayed_error = error; + *((sin6_t *)&icmp->icmp_delayed_addr) = sin6; + } + rw_exit(&icmp->icmp_rwlock); + } else { + + mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t), + NULL, 0, error); + if (mp1 != NULL) + putnext(connp->conn_rq, mp1); + } +done: freemsg(mp); } @@ -1249,6 +1404,18 
@@ icmp_copy_info(struct T_info_ack *tap, icmp_t *icmp) tap->OPT_size = icmp_max_optsize; } +static void +icmp_do_capability_ack(icmp_t *icmp, struct T_capability_ack *tcap, + t_uscalar_t cap_bits1) +{ + tcap->CAP_bits1 = 0; + + if (cap_bits1 & TC1_INFO) { + icmp_copy_info(&tcap->INFO_ack, icmp); + tcap->CAP_bits1 |= TC1_INFO; + } +} + /* * This routine responds to T_CAPABILITY_REQ messages. It is called by * icmp_wput. Much of the T_CAPABILITY_ACK information is copied from @@ -1270,12 +1437,8 @@ icmp_capability_req(queue_t *q, mblk_t *mp) return; tcap = (struct T_capability_ack *)mp->b_rptr; - tcap->CAP_bits1 = 0; - if (cap_bits1 & TC1_INFO) { - icmp_copy_info(&tcap->INFO_ack, icmp); - tcap->CAP_bits1 |= TC1_INFO; - } + icmp_do_capability_ack(icmp, tcap, cap_bits1); qreply(q, mp); } @@ -1298,182 +1461,131 @@ icmp_info_req(queue_t *q, mblk_t *mp) qreply(q, mp); } -/* - * IP recognizes seven kinds of bind requests: - * - * - A zero-length address binds only to the protocol number. - * - * - A 4-byte address is treated as a request to - * validate that the address is a valid local IPv4 - * address, appropriate for an application to bind to. - * IP does the verification, but does not make any note - * of the address at this time. - * - * - A 16-byte address contains is treated as a request - * to validate a local IPv6 address, as the 4-byte - * address case above. - * - * - A 16-byte sockaddr_in to validate the local IPv4 address and also - * use it for the inbound fanout of packets. - * - * - A 24-byte sockaddr_in6 to validate the local IPv6 address and also - * use it for the inbound fanout of packets. - * - * - A 12-byte address (ipa_conn_t) containing complete IPv4 fanout - * information consisting of local and remote addresses - * and ports (unused for raw sockets). In this case, the addresses are both - * validated as appropriate for this operation, and, if - * so, the information is retained for use in the - * inbound fanout. 
- * - * - A 36-byte address address (ipa6_conn_t) containing complete IPv6 - * fanout information, like the 12-byte case above. - * - * IP will also fill in the IRE request mblk with information - * regarding our peer. In all cases, we notify IP of our protocol - * type by appending a single protocol byte to the bind request. - */ -static mblk_t * -icmp_ip_bind_mp(icmp_t *icmp, t_scalar_t bind_prim, t_scalar_t addr_length, - in_port_t fport) +/* For /dev/icmp aka AF_INET open */ +static int +icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, + int family) { - char *cp; - mblk_t *mp; - struct T_bind_req *tbr; - ipa_conn_t *ac; - ipa6_conn_t *ac6; - sin_t *sin; - sin6_t *sin6; + conn_t *connp; + dev_t conn_dev; + icmp_stack_t *is; + int error; - ASSERT(bind_prim == O_T_BIND_REQ || bind_prim == T_BIND_REQ); - ASSERT(RW_LOCK_HELD(&icmp->icmp_rwlock)); - mp = allocb(sizeof (*tbr) + addr_length + 1, BPRI_HI); - if (mp == NULL) - return (NULL); - mp->b_datap->db_type = M_PROTO; - tbr = (struct T_bind_req *)mp->b_rptr; - tbr->PRIM_type = bind_prim; - tbr->ADDR_offset = sizeof (*tbr); - tbr->CONIND_number = 0; - tbr->ADDR_length = addr_length; - cp = (char *)&tbr[1]; - switch (addr_length) { - case sizeof (ipa_conn_t): - ASSERT(icmp->icmp_family == AF_INET); - /* Append a request for an IRE */ - mp->b_cont = allocb(sizeof (ire_t), BPRI_HI); - if (mp->b_cont == NULL) { - freemsg(mp); - return (NULL); - } - mp->b_cont->b_wptr += sizeof (ire_t); - mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE; + conn_dev = NULL; - /* cp known to be 32 bit aligned */ - ac = (ipa_conn_t *)cp; - ac->ac_laddr = V4_PART_OF_V6(icmp->icmp_v6src); - ac->ac_faddr = V4_PART_OF_V6(icmp->icmp_v6dst); - ac->ac_fport = fport; - ac->ac_lport = 0; - break; + /* If the stream is already open, return immediately. 
*/ + if (q->q_ptr != NULL) + return (0); - case sizeof (ipa6_conn_t): - ASSERT(icmp->icmp_family == AF_INET6); - /* Append a request for an IRE */ - mp->b_cont = allocb(sizeof (ire_t), BPRI_HI); - if (mp->b_cont == NULL) { - freemsg(mp); - return (NULL); - } - mp->b_cont->b_wptr += sizeof (ire_t); - mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE; + if (sflag == MODOPEN) + return (EINVAL); - /* cp known to be 32 bit aligned */ - ac6 = (ipa6_conn_t *)cp; - ac6->ac6_laddr = icmp->icmp_v6src; - ac6->ac6_faddr = icmp->icmp_v6dst; - ac6->ac6_fport = fport; - ac6->ac6_lport = 0; - break; + /* + * Since ICMP is not used so heavily, allocating from the small + * arena should be sufficient. + */ + if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) { + return (EBUSY); + } - case sizeof (sin_t): - ASSERT(icmp->icmp_family == AF_INET); - /* Append a request for an IRE */ - mp->b_cont = allocb(sizeof (ire_t), BPRI_HI); - if (!mp->b_cont) { - freemsg(mp); - return (NULL); - } - mp->b_cont->b_wptr += sizeof (ire_t); - mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE; + if (flag & SO_FALLBACK) { + /* + * Non streams socket needs a stream to fallback to + */ + RD(q)->q_ptr = (void *)conn_dev; + WR(q)->q_qinfo = &icmp_fallback_sock_winit; + WR(q)->q_ptr = (void *)ip_minor_arena_sa; + qprocson(q); + return (0); + } - sin = (sin_t *)cp; - *sin = sin_null; - sin->sin_family = AF_INET; - sin->sin_addr.s_addr = V4_PART_OF_V6(icmp->icmp_bound_v6src); - break; + connp = icmp_open(family, credp, &error, KM_SLEEP); + if (connp == NULL) { + ASSERT(error != NULL); + inet_minor_free(ip_minor_arena_sa, connp->conn_dev); + return (error); + } - case sizeof (sin6_t): - ASSERT(icmp->icmp_family == AF_INET6); - /* Append a request for an IRE */ - mp->b_cont = allocb(sizeof (ire_t), BPRI_HI); - if (!mp->b_cont) { - freemsg(mp); - return (NULL); - } - mp->b_cont->b_wptr += sizeof (ire_t); - mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE; + *devp = makedevice(getemajor(*devp), (minor_t)conn_dev); 
+ connp->conn_dev = conn_dev; + connp->conn_minor_arena = ip_minor_arena_sa; - sin6 = (sin6_t *)cp; - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - sin6->sin6_addr = icmp->icmp_bound_v6src; - break; + is = connp->conn_icmp->icmp_is; + + /* + * Initialize the icmp_t structure for this stream. + */ + q->q_ptr = connp; + WR(q)->q_ptr = connp; + connp->conn_rq = q; + connp->conn_wq = WR(q); + + if (connp->conn_icmp->icmp_family == AF_INET6) { + /* Build initial header template for transmit */ + rw_enter(&connp->conn_icmp->icmp_rwlock, RW_WRITER); + if ((error = icmp_build_hdrs(connp->conn_icmp)) != 0) { + rw_exit(&connp->conn_icmp->icmp_rwlock); + inet_minor_free(ip_minor_arena_sa, connp->conn_dev); + ipcl_conn_destroy(connp); + return (error); + } + rw_exit(&connp->conn_icmp->icmp_rwlock); } - /* Add protocol number to end */ - cp[addr_length] = icmp->icmp_proto; - mp->b_wptr = (uchar_t *)&cp[addr_length + 1]; - return (mp); + + + q->q_hiwat = is->is_recv_hiwat; + WR(q)->q_hiwat = is->is_xmit_hiwat; + WR(q)->q_lowat = is->is_xmit_lowat; + + qprocson(q); + + /* Set the Stream head write offset. */ + (void) proto_set_tx_wroff(q, connp, + connp->conn_icmp->icmp_max_hdr_len + is->is_wroff_extra); + (void) proto_set_rx_hiwat(connp->conn_rq, connp, q->q_hiwat); + + mutex_enter(&connp->conn_lock); + connp->conn_state_flags &= ~CONN_INCIPIENT; + mutex_exit(&connp->conn_lock); + + return (0); } -/* For /dev/icmp aka AF_INET open */ +/* For /dev/icmp4 aka AF_INET open */ static int icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) { - return (icmp_open(q, devp, flag, sflag, credp, B_FALSE)); + return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET)); } /* For /dev/icmp6 aka AF_INET6 open */ static int icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) { - return (icmp_open(q, devp, flag, sflag, credp, B_TRUE)); + return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET6)); } /* * This is the open routine for icmp. 
It allocates a icmp_t structure for * the stream and, on the first open of the module, creates an ND table. */ -/*ARGSUSED2*/ -static int -icmp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, - boolean_t isv6) +/* ARGSUSED */ +static conn_t * +icmp_open(int family, cred_t *credp, int *err, int flags) { - int err; icmp_t *icmp; conn_t *connp; - dev_t conn_dev; zoneid_t zoneid; netstack_t *ns; icmp_stack_t *is; + boolean_t isv6 = B_FALSE; - /* If the stream is already open, return immediately. */ - if (q->q_ptr != NULL) - return (0); - - if (sflag == MODOPEN) - return (EINVAL); + *err = secpolicy_net_icmpaccess(credp); + if (*err != 0) + return (NULL); + if (family == AF_INET6) + isv6 = B_TRUE; ns = netstack_find_by_cred(credp); ASSERT(ns != NULL); is = ns->netstack_icmp; @@ -1488,20 +1600,11 @@ icmp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, else zoneid = crgetzoneid(credp); - /* - * Since ICMP is not used so heavily, allocating from the small - * arena should be sufficient. - */ - if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) { - netstack_rele(ns); - return (EBUSY); - } - *devp = makedevice(getemajor(*devp), (minor_t)conn_dev); + ASSERT(flags == KM_SLEEP || flags == KM_NOSLEEP); - connp = ipcl_conn_create(IPCL_RAWIPCONN, KM_SLEEP, ns); - connp->conn_dev = conn_dev; - connp->conn_minor_arena = ip_minor_arena_sa; + connp = ipcl_conn_create(IPCL_RAWIPCONN, flags, ns); icmp = connp->conn_icmp; + icmp->icmp_v6dst = sin6_null; /* * ipcl_conn_create did a netstack_hold. Undo the hold that was @@ -1509,14 +1612,6 @@ icmp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, */ netstack_rele(ns); - /* - * Initialize the icmp_t structure for this stream. 
- */ - q->q_ptr = connp; - WR(q)->q_ptr = connp; - connp->conn_rq = q; - connp->conn_wq = WR(q); - rw_enter(&icmp->icmp_rwlock, RW_WRITER); ASSERT(connp->conn_ulp == IPPROTO_ICMP); ASSERT(connp->conn_icmp == icmp); @@ -1561,37 +1656,14 @@ icmp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, icmp->icmp_is = is; - q->q_hiwat = is->is_recv_hiwat; - WR(q)->q_hiwat = is->is_xmit_hiwat; - WR(q)->q_lowat = is->is_xmit_lowat; - connp->conn_recv = icmp_input; crhold(credp); connp->conn_cred = credp; - mutex_enter(&connp->conn_lock); - connp->conn_state_flags &= ~CONN_INCIPIENT; - mutex_exit(&connp->conn_lock); - - qprocson(q); - - if (icmp->icmp_family == AF_INET6) { - /* Build initial header template for transmit */ - if ((err = icmp_build_hdrs(icmp)) != 0) { - rw_exit(&icmp->icmp_rwlock); - qprocsoff(q); - ipcl_conn_destroy(connp); - return (err); - } - } rw_exit(&icmp->icmp_rwlock); - /* Set the Stream head write offset. */ - (void) mi_set_sth_wroff(q, - icmp->icmp_max_hdr_len + is->is_wroff_extra); - (void) mi_set_sth_hiwat(q, q->q_hiwat); - - return (0); + connp->conn_flow_cntrld = B_FALSE; + return (connp); } /* @@ -1657,14 +1729,15 @@ icmp_opt_default(queue_t *q, int level, int name, uchar_t *ptr) * It returns the size of the option retrieved. 
*/ int -icmp_opt_get_locked(queue_t *q, int level, int name, uchar_t *ptr) +icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) { - conn_t *connp = Q_TO_CONN(q); - icmp_t *icmp = connp->conn_icmp; - icmp_stack_t *is = icmp->icmp_is; - int *i1 = (int *)ptr; + icmp_t *icmp = connp->conn_icmp; + icmp_stack_t *is = icmp->icmp_is; + int *i1 = (int *)ptr; ip6_pkt_t *ipp = &icmp->icmp_sticky_ipp; + int ret = 0; + ASSERT(RW_READ_HELD(&icmp->icmp_rwlock)); switch (level) { case SOL_SOCKET: switch (name) { @@ -1696,12 +1769,12 @@ icmp_opt_get_locked(queue_t *q, int level, int name, uchar_t *ptr) break; case SO_SNDBUF: - ASSERT(q->q_hiwat <= INT_MAX); - *i1 = (int)q->q_hiwat; + ASSERT(icmp->icmp_xmit_hiwat <= INT_MAX); + *i1 = icmp->icmp_xmit_hiwat; break; case SO_RCVBUF: - ASSERT(RD(q)->q_hiwat <= INT_MAX); - *i1 = (int)RD(q)->q_hiwat; + ASSERT(icmp->icmp_recv_hiwat <= INT_MAX); + *i1 = icmp->icmp_recv_hiwat; break; case SO_DGRAM_ERRIND: *i1 = icmp->icmp_dgram_errind; @@ -1726,21 +1799,25 @@ icmp_opt_get_locked(queue_t *q, int level, int name, uchar_t *ptr) * case SO_ALLZONES: */ default: - return (-1); + ret = -1; + goto done; } break; case IPPROTO_IP: /* * Only allow IPv4 option processing on IPv4 sockets. 
*/ - if (icmp->icmp_family != AF_INET) - return (-1); + if (icmp->icmp_family != AF_INET) { + ret = -1; + goto done; + } switch (name) { case IP_OPTIONS: case T_IP_OPTIONS: /* Options are passed up with each packet */ - return (0); + ret = 0; + goto done; case IP_HDRINCL: *i1 = (int)icmp->icmp_hdrincl; break; @@ -1754,13 +1831,16 @@ icmp_opt_get_locked(queue_t *q, int level, int name, uchar_t *ptr) case IP_MULTICAST_IF: /* 0 address if not set */ *(ipaddr_t *)ptr = icmp->icmp_multicast_if_addr; - return (sizeof (ipaddr_t)); + ret = sizeof (ipaddr_t); + goto done; case IP_MULTICAST_TTL: *(uchar_t *)ptr = icmp->icmp_multicast_ttl; - return (sizeof (uchar_t)); + ret = sizeof (uchar_t); + goto done; case IP_MULTICAST_LOOP: *ptr = connp->conn_multicast_loop; - return (sizeof (uint8_t)); + ret = sizeof (uint8_t); + goto done; case IP_BOUND_IF: /* Zero if not set */ *i1 = icmp->icmp_bound_if; @@ -1768,12 +1848,12 @@ icmp_opt_get_locked(queue_t *q, int level, int name, uchar_t *ptr) case IP_UNSPEC_SRC: *ptr = icmp->icmp_unspec_source; break; /* goto sizeof (int) option return */ - case IP_BROADCAST_TTL: - *(uchar_t *)ptr = connp->conn_broadcast_ttl; - return (sizeof (uchar_t)); case IP_RECVIF: *ptr = icmp->icmp_recvif; break; /* goto sizeof (int) option return */ + case IP_BROADCAST_TTL: + *(uchar_t *)ptr = connp->conn_broadcast_ttl; + return (sizeof (uchar_t)); case IP_RECVPKTINFO: /* * This also handles IP_PKTINFO. @@ -1784,7 +1864,8 @@ icmp_opt_get_locked(queue_t *q, int level, int name, uchar_t *ptr) * error for IP_PKTINFO as it's not supported as a * sticky option. */ - return (-EINVAL); + ret = -EINVAL; + goto done; /* * Cannot "get" the value of following options * at this level. Action is same as "default" to @@ -1815,15 +1896,18 @@ icmp_opt_get_locked(queue_t *q, int level, int name, uchar_t *ptr) * case IP_NEXTHOP: */ default: - return (-1); + ret = -1; + goto done; } break; case IPPROTO_IPV6: /* * Only allow IPv6 option processing on native IPv6 sockets. 
*/ - if (icmp->icmp_family != AF_INET6) - return (-1); + if (icmp->icmp_family != AF_INET6) { + ret = -1; + goto done; + } switch (name) { case IPV6_UNICAST_HOPS: *i1 = (unsigned int)icmp->icmp_ttl; @@ -1850,8 +1934,10 @@ icmp_opt_get_locked(queue_t *q, int level, int name, uchar_t *ptr) * Return offset or -1 if no checksum offset. * Does not apply to IPPROTO_ICMPV6 */ - if (icmp->icmp_proto == IPPROTO_ICMPV6) - return (-1); + if (icmp->icmp_proto == IPPROTO_ICMPV6) { + ret = -1; + goto done; + } if (icmp->icmp_raw_checksum) { *i1 = icmp->icmp_checksum_off; @@ -1868,7 +1954,8 @@ icmp_opt_get_locked(queue_t *q, int level, int name, uchar_t *ptr) case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: /* cannot "get" the value for these */ - return (-1); + ret = -1; + goto done; case IPV6_RECVPKTINFO: *i1 = icmp->icmp_ip_recvpktinfo; break; @@ -1912,7 +1999,8 @@ icmp_opt_get_locked(queue_t *q, int level, int name, uchar_t *ptr) pkti->ipi6_addr = ipp->ipp_addr; else pkti->ipi6_addr = ipv6_all_zeros; - return (sizeof (struct in6_pktinfo)); + ret = sizeof (struct in6_pktinfo); + goto done; } case IPV6_NEXTHOP: { sin6_t *sin6 = (sin6_t *)ptr; @@ -1922,7 +2010,8 @@ icmp_opt_get_locked(queue_t *q, int level, int name, uchar_t *ptr) *sin6 = sin6_null; sin6->sin6_family = AF_INET6; sin6->sin6_addr = ipp->ipp_nexthop; - return (sizeof (sin6_t)); + ret = (sizeof (sin6_t)); + goto done; } case IPV6_HOPOPTS: if (!(ipp->ipp_fields & IPPF_HOPOPTS)) @@ -1937,28 +2026,38 @@ icmp_opt_get_locked(queue_t *q, int level, int name, uchar_t *ptr) ptr[1] = (ipp->ipp_hopoptslen - icmp->icmp_label_len_v6 + 7) / 8 - 1; } - return (ipp->ipp_hopoptslen - icmp->icmp_label_len_v6); + ret = (ipp->ipp_hopoptslen - icmp->icmp_label_len_v6); + goto done; case IPV6_RTHDRDSTOPTS: if (!(ipp->ipp_fields & IPPF_RTDSTOPTS)) return (0); bcopy(ipp->ipp_rtdstopts, ptr, ipp->ipp_rtdstoptslen); - return (ipp->ipp_rtdstoptslen); + ret = ipp->ipp_rtdstoptslen; + goto done; case IPV6_RTHDR: if 
(!(ipp->ipp_fields & IPPF_RTHDR)) return (0); bcopy(ipp->ipp_rthdr, ptr, ipp->ipp_rthdrlen); - return (ipp->ipp_rthdrlen); + ret = ipp->ipp_rthdrlen; + goto done; case IPV6_DSTOPTS: - if (!(ipp->ipp_fields & IPPF_DSTOPTS)) - return (0); + if (!(ipp->ipp_fields & IPPF_DSTOPTS)) { + ret = 0; + goto done; + } bcopy(ipp->ipp_dstopts, ptr, ipp->ipp_dstoptslen); - return (ipp->ipp_dstoptslen); + ret = ipp->ipp_dstoptslen; + goto done; case IPV6_PATHMTU: - if (!(ipp->ipp_fields & IPPF_PATHMTU)) - return (0); - - return (ip_fill_mtuinfo(&icmp->icmp_v6dst, 0, - (struct ip6_mtuinfo *)ptr, is->is_netstack)); + if (!(ipp->ipp_fields & IPPF_PATHMTU)) { + ret = 0; + } else { + ret = ip_fill_mtuinfo( + &icmp->icmp_v6dst.sin6_addr, 0, + (struct ip6_mtuinfo *)ptr, + is->is_netstack); + } + goto done; case IPV6_TCLASS: if (ipp->ipp_fields & IPPF_TCLASS) *i1 = ipp->ipp_tclass; @@ -1967,18 +2066,21 @@ icmp_opt_get_locked(queue_t *q, int level, int name, uchar_t *ptr) IPV6_DEFAULT_VERS_AND_FLOW); break; default: - return (-1); + ret = -1; + goto done; } break; case IPPROTO_ICMPV6: /* * Only allow IPv6 option processing on native IPv6 sockets. */ - if (icmp->icmp_family != AF_INET6) - return (-1); + if (icmp->icmp_family != AF_INET6) { + ret = -1; + } - if (icmp->icmp_proto != IPPROTO_ICMPV6) - return (-1); + if (icmp->icmp_proto != IPPROTO_ICMPV6) { + ret = -1; + } switch (name) { case ICMP6_FILTER: @@ -1989,14 +2091,19 @@ icmp_opt_get_locked(queue_t *q, int level, int name, uchar_t *ptr) (void) bcopy(icmp->icmp_filter, ptr, sizeof (icmp6_filter_t)); } - return (sizeof (icmp6_filter_t)); + ret = sizeof (icmp6_filter_t); + goto done; default: - return (-1); + ret = -1; + goto done; } default: - return (-1); + ret = -1; + goto done; } - return (sizeof (int)); + ret = sizeof (int); +done: + return (ret); } /* @@ -2004,84 +2111,36 @@ icmp_opt_get_locked(queue_t *q, int level, int name, uchar_t *ptr) * It returns the size of the option retrieved. 
*/ int -icmp_opt_get(queue_t *q, int level, int name, uchar_t *ptr) +icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr) { - icmp_t *icmp = Q_TO_ICMP(q); + conn_t *connp = Q_TO_CONN(q); + icmp_t *icmp = connp->conn_icmp; int err; rw_enter(&icmp->icmp_rwlock, RW_READER); - err = icmp_opt_get_locked(q, level, name, ptr); + err = icmp_opt_get(connp, level, name, ptr); rw_exit(&icmp->icmp_rwlock); return (err); } - -/* This routine sets socket options. */ -/* ARGSUSED */ int -icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name, - uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, - void *thisdg_attrs, cred_t *cr, mblk_t *mblk) +icmp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen, + uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, cred_t *cr, + void *thisdg_attrs, boolean_t checkonly) { - conn_t *connp = Q_TO_CONN(q); - icmp_t *icmp = connp->conn_icmp; - icmp_stack_t *is = icmp->icmp_is; + int *i1 = (int *)invalp; boolean_t onoff = (*i1 == 0) ? 0 : 1; - boolean_t checkonly; + icmp_t *icmp = connp->conn_icmp; + icmp_stack_t *is = icmp->icmp_is; int error; - switch (optset_context) { - case SETFN_OPTCOM_CHECKONLY: - checkonly = B_TRUE; - /* - * Note: Implies T_CHECK semantics for T_OPTCOM_REQ - * inlen != 0 implies value supplied and - * we have to "pretend" to set it. - * inlen == 0 implies that there is no - * value part in T_CHECK request and just validation - * done elsewhere should be enough, we just return here. - */ - if (inlen == 0) { - *outlenp = 0; - return (0); - } - break; - case SETFN_OPTCOM_NEGOTIATE: - checkonly = B_FALSE; - break; - case SETFN_UD_NEGOTIATE: - case SETFN_CONN_NEGOTIATE: - checkonly = B_FALSE; - /* - * Negotiating local and "association-related" options - * through T_UNITDATA_REQ. - * - * Following routine can filter out ones we do not - * want to be "set" this way. 
- */ - if (!icmp_opt_allow_udr_set(level, name)) { - *outlenp = 0; - return (EINVAL); - } - break; - default: - /* - * We should never get here - */ - *outlenp = 0; - return (EINVAL); - } - - ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || - (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); - + ASSERT(RW_WRITE_HELD(&icmp->icmp_rwlock)); /* * For fixed length options, no sanity check * of passed in length is done. It is assumed *_optcom_req() * routines do the right thing. */ - switch (level) { case SOL_SOCKET: switch (name) { @@ -2161,12 +2220,14 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name, /* Drop lock across the bind operation */ rw_exit(&icmp->icmp_rwlock); - icmp_bind_proto(q); + (void) icmp_bind_proto(connp); rw_enter(&icmp->icmp_rwlock, RW_WRITER); return (0); case SO_REUSEADDR: - if (!checkonly) + if (!checkonly) { icmp->icmp_reuseaddr = onoff; + PASS_OPT_TO_IP(connp); + } break; /* @@ -2174,16 +2235,22 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name, * but are only meaningful to IP. 
*/ case SO_DONTROUTE: - if (!checkonly) + if (!checkonly) { icmp->icmp_dontroute = onoff; + PASS_OPT_TO_IP(connp); + } break; case SO_USELOOPBACK: - if (!checkonly) + if (!checkonly) { icmp->icmp_useloopback = onoff; + PASS_OPT_TO_IP(connp); + } break; case SO_BROADCAST: - if (!checkonly) + if (!checkonly) { icmp->icmp_broadcast = onoff; + PASS_OPT_TO_IP(connp); + } break; case SO_SNDBUF: @@ -2192,7 +2259,10 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name, return (ENOBUFS); } if (!checkonly) { - q->q_hiwat = *i1; + if (!IPCL_IS_NONSTR(connp)) { + connp->conn_wq->q_hiwat = *i1; + } + icmp->icmp_xmit_hiwat = *i1; } break; case SO_RCVBUF: @@ -2201,9 +2271,10 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name, return (ENOBUFS); } if (!checkonly) { - RD(q)->q_hiwat = *i1; + icmp->icmp_recv_hiwat = *i1; rw_exit(&icmp->icmp_rwlock); - (void) mi_set_sth_hiwat(RD(q), *i1); + (void) proto_set_rx_hiwat(connp->conn_rq, connp, + *i1); rw_enter(&icmp->icmp_rwlock, RW_WRITER); } break; @@ -2273,8 +2344,9 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name, icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + icmp->icmp_ip_snd_options_len; rw_exit(&icmp->icmp_rwlock); - (void) mi_set_sth_wroff(RD(q), icmp->icmp_max_hdr_len + - is->is_wroff_extra); + (void) proto_set_tx_wroff(connp->conn_rq == NULL ? NULL: + RD(connp->conn_rq), connp, + icmp->icmp_max_hdr_len + is->is_wroff_extra); rw_enter(&icmp->icmp_rwlock, RW_WRITER); break; case IP_HDRINCL: @@ -2297,8 +2369,10 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name, * TODO should check OPTMGMT reply and undo this if * there is an error. 
*/ - if (!checkonly) + if (!checkonly) { icmp->icmp_multicast_if_addr = *i1; + PASS_OPT_TO_IP(connp); + } break; case IP_MULTICAST_TTL: if (!checkonly) @@ -2308,23 +2382,29 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name, if (!checkonly) { connp->conn_multicast_loop = (*invalp == 0) ? 0 : 1; + PASS_OPT_TO_IP(connp); } break; case IP_BOUND_IF: - if (!checkonly) + if (!checkonly) { icmp->icmp_bound_if = *i1; + PASS_OPT_TO_IP(connp); + } break; case IP_UNSPEC_SRC: - if (!checkonly) + if (!checkonly) { icmp->icmp_unspec_source = onoff; + PASS_OPT_TO_IP(connp); + } break; case IP_BROADCAST_TTL: if (!checkonly) connp->conn_broadcast_ttl = *invalp; break; case IP_RECVIF: - if (!checkonly) + if (!checkonly) { icmp->icmp_recvif = onoff; + } /* * pass to ip */ @@ -2354,8 +2434,9 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name, } - if (inlen != sizeof (struct in_pktinfo)) + if (inlen != sizeof (struct in_pktinfo)) { return (EINVAL); + } if ((attr_pktinfop = (ip4_pkt_t *)thisdg_attrs) == NULL) { @@ -2436,8 +2517,10 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name, switch (name) { case IPV6_MULTICAST_IF: - if (!checkonly) + if (!checkonly) { icmp->icmp_multicast_if_index = *i1; + PASS_OPT_TO_IP(connp); + } break; case IPV6_UNICAST_HOPS: /* -1 means use default */ @@ -2492,8 +2575,10 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name, *outlenp = 0; return (EINVAL); } - if (!checkonly) + if (!checkonly) { connp->conn_multicast_loop = *i1; + PASS_OPT_TO_IP(connp); + } break; case IPV6_CHECKSUM: /* @@ -2544,51 +2629,71 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name, */ return (-EINVAL); case IPV6_BOUND_IF: - if (!checkonly) + if (!checkonly) { icmp->icmp_bound_if = *i1; + PASS_OPT_TO_IP(connp); + } break; case IPV6_UNSPEC_SRC: - if (!checkonly) + if (!checkonly) { icmp->icmp_unspec_source = onoff; + PASS_OPT_TO_IP(connp); + } break; case 
IPV6_RECVTCLASS: - if (!checkonly) + if (!checkonly) { icmp->icmp_ipv6_recvtclass = onoff; + PASS_OPT_TO_IP(connp); + } break; /* * Set boolean switches for ancillary data delivery */ case IPV6_RECVPKTINFO: - if (!checkonly) + if (!checkonly) { icmp->icmp_ip_recvpktinfo = onoff; + PASS_OPT_TO_IP(connp); + } break; case IPV6_RECVPATHMTU: - if (!checkonly) + if (!checkonly) { icmp->icmp_ipv6_recvpathmtu = onoff; + PASS_OPT_TO_IP(connp); + } break; case IPV6_RECVHOPLIMIT: - if (!checkonly) + if (!checkonly) { icmp->icmp_ipv6_recvhoplimit = onoff; + PASS_OPT_TO_IP(connp); + } break; case IPV6_RECVHOPOPTS: - if (!checkonly) + if (!checkonly) { icmp->icmp_ipv6_recvhopopts = onoff; + PASS_OPT_TO_IP(connp); + } break; case IPV6_RECVDSTOPTS: - if (!checkonly) + if (!checkonly) { icmp->icmp_ipv6_recvdstopts = onoff; + PASS_OPT_TO_IP(connp); + } break; case _OLD_IPV6_RECVDSTOPTS: if (!checkonly) icmp->icmp_old_ipv6_recvdstopts = onoff; break; case IPV6_RECVRTHDRDSTOPTS: - if (!checkonly) + if (!checkonly) { icmp->icmp_ipv6_recvrtdstopts = onoff; + PASS_OPT_TO_IP(connp); + } break; case IPV6_RECVRTHDR: - if (!checkonly) + if (!checkonly) { icmp->icmp_ipv6_recvrthdr = onoff; + PASS_OPT_TO_IP(connp); + } break; /* * Set sticky options or ancillary data. @@ -2601,8 +2706,10 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name, * in ip_opt_set(). For ancillary data the * source address is checked in ip_wput_v6. 
*/ - if (inlen != 0 && inlen != sizeof (struct in6_pktinfo)) + if (inlen != 0 && inlen != + sizeof (struct in6_pktinfo)) { return (EINVAL); + } if (checkonly) break; @@ -2630,6 +2737,7 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name, error = icmp_build_hdrs(icmp); if (error != 0) return (error); + PASS_OPT_TO_IP(connp); } break; case IPV6_HOPLIMIT: @@ -2660,8 +2768,9 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name, * IPV6_RECVTCLASS accepts -1 as use kernel default * and [0, 255] as the actualy traffic class. */ - if (inlen != 0 && inlen != sizeof (int)) + if (inlen != 0 && inlen != sizeof (int)) { return (EINVAL); + } if (checkonly) break; @@ -2691,8 +2800,9 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name, * IP will verify that the nexthop is reachable * and fail for sticky options. */ - if (inlen != 0 && inlen != sizeof (sin6_t)) + if (inlen != 0 && inlen != sizeof (sin6_t)) { return (EINVAL); + } if (checkonly) break; @@ -2702,10 +2812,12 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name, } else { sin6_t *sin6 = (sin6_t *)invalp; - if (sin6->sin6_family != AF_INET6) + if (sin6->sin6_family != AF_INET6) { return (EAFNOSUPPORT); - if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) + } + if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { return (EADDRNOTAVAIL); + } ipp->ipp_nexthop = sin6->sin6_addr; if (!IN6_IS_ADDR_UNSPECIFIED( &ipp->ipp_nexthop)) @@ -2717,6 +2829,7 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name, error = icmp_build_hdrs(icmp); if (error != 0) return (error); + PASS_OPT_TO_IP(connp); } break; case IPV6_HOPOPTS: { @@ -2726,8 +2839,9 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name, * eight bytes, and matching size passed in. 
*/ if (inlen != 0 && - inlen != (8 * (hopts->ip6h_len + 1))) + inlen != (8 * (hopts->ip6h_len + 1))) { return (EINVAL); + } if (checkonly) break; @@ -2974,23 +3088,89 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name, *outlenp = inlen; return (0); } + /* This routine sets socket options. */ /* ARGSUSED */ int -icmp_opt_set(queue_t *q, uint_t optset_context, int level, int name, +icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, + uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, + void *thisdg_attrs, cred_t *cr) +{ + boolean_t checkonly; + int error; + + error = 0; + switch (optset_context) { + case SETFN_OPTCOM_CHECKONLY: + checkonly = B_TRUE; + /* + * Note: Implies T_CHECK semantics for T_OPTCOM_REQ + * inlen != 0 implies value supplied and + * we have to "pretend" to set it. + * inlen == 0 implies that there is no + * value part in T_CHECK request and just validation + * done elsewhere should be enough, we just return here. + */ + if (inlen == 0) { + *outlenp = 0; + error = 0; + goto done; + } + break; + case SETFN_OPTCOM_NEGOTIATE: + checkonly = B_FALSE; + break; + case SETFN_UD_NEGOTIATE: + case SETFN_CONN_NEGOTIATE: + checkonly = B_FALSE; + /* + * Negotiating local and "association-related" options + * through T_UNITDATA_REQ. + * + * Following routine can filter out ones we do not + * want to be "set" this way. + */ + if (!icmp_opt_allow_udr_set(level, name)) { + *outlenp = 0; + error = EINVAL; + goto done; + } + break; + default: + /* + * We should never get here + */ + *outlenp = 0; + error = EINVAL; + goto done; + } + + ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || + (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); + error = icmp_do_opt_set(connp, level, name, inlen, invalp, outlenp, + outvalp, cr, thisdg_attrs, checkonly); + +done: + return (error); +} + +/* This routine sets socket options. 
*/ +/* ARGSUSED */ +int +icmp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk) { + conn_t *connp = Q_TO_CONN(q); icmp_t *icmp; - int err; - - icmp = Q_TO_ICMP(q); + int error; + icmp = connp->conn_icmp; rw_enter(&icmp->icmp_rwlock, RW_WRITER); - err = icmp_opt_set_locked(q, optset_context, level, name, inlen, invalp, - outlenp, outvalp, thisdg_attrs, cr, mblk); + error = icmp_opt_set(connp, optset_context, level, name, inlen, invalp, + outlenp, outvalp, thisdg_attrs, cr); rw_exit(&icmp->icmp_rwlock); - return (err); + return (error); } /* @@ -3055,7 +3235,8 @@ icmp_build_hdrs(icmp_t *icmp) if (hdrs_len > icmp->icmp_max_hdr_len) { icmp->icmp_max_hdr_len = hdrs_len; rw_exit(&icmp->icmp_rwlock); - (void) mi_set_sth_wroff(icmp->icmp_connp->conn_rq, + (void) proto_set_tx_wroff(icmp->icmp_connp->conn_rq, + icmp->icmp_connp, icmp->icmp_max_hdr_len + is->is_wroff_extra); rw_enter(&icmp->icmp_rwlock, RW_WRITER); } @@ -3123,6 +3304,33 @@ icmp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr) icmppa->icmp_param_value = new_value; return (0); } +static void +icmp_queue_fallback(icmp_t *icmp, mblk_t *mp) +{ + ASSERT(MUTEX_HELD(&icmp->icmp_recv_lock)); + if (IPCL_IS_NONSTR(icmp->icmp_connp)) { + /* + * fallback has started but messages have not been moved yet + */ + if (icmp->icmp_fallback_queue_head == NULL) { + ASSERT(icmp->icmp_fallback_queue_tail == NULL); + icmp->icmp_fallback_queue_head = mp; + icmp->icmp_fallback_queue_tail = mp; + } else { + ASSERT(icmp->icmp_fallback_queue_tail != NULL); + icmp->icmp_fallback_queue_tail->b_next = mp; + icmp->icmp_fallback_queue_tail = mp; + } + mutex_exit(&icmp->icmp_recv_lock); + } else { + /* + * no more fallbacks possible, ok to drop lock. 
+ */ + mutex_exit(&icmp->icmp_recv_lock); + putnext(icmp->icmp_connp->conn_rq, mp); + } +} + /*ARGSUSED2*/ static void icmp_input(void *arg1, mblk_t *mp, void *arg2) @@ -3148,6 +3356,7 @@ icmp_input(void *arg1, mblk_t *mp, void *arg2) uint_t icmp_opt = 0; boolean_t icmp_ipv6_recvhoplimit = B_FALSE; uint_t hopstrip; + int error; ASSERT(connp->conn_flags & IPCL_RAWIPCONN); @@ -3189,7 +3398,7 @@ icmp_input(void *arg1, mblk_t *mp, void *arg2) /* * ICMP messages. */ - icmp_icmp_error(connp->conn_rq, mp); + icmp_icmp_error(connp, mp); return; } } @@ -3388,8 +3597,7 @@ icmp_input(void *arg1, mblk_t *mp, void *arg2) freeb(options_mp); BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams); - putnext(connp->conn_rq, mp); - return; + goto deliver; } /* @@ -3707,7 +3915,7 @@ icmp_input(void *arg1, mblk_t *mp, void *arg2) udi_size -= toh->len; } if (icmp->icmp_timestamp) { - struct T_opthdr *toh; + struct T_opthdr *toh; toh = (struct T_opthdr *)dstopt; toh->level = SOL_SOCKET; @@ -3723,6 +3931,7 @@ icmp_input(void *arg1, mblk_t *mp, void *arg2) dstopt = (uchar_t *)toh + toh->len; udi_size -= toh->len; } + if (icmp_opt & IPPF_HOPOPTS) { struct T_opthdr *toh; @@ -3792,235 +4001,37 @@ icmp_input(void *arg1, mblk_t *mp, void *arg2) ASSERT(udi_size == 0); } BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams); - putnext(connp->conn_rq, mp); -} - -/* - * Handle the results of a T_BIND_REQ whether deferred by IP or handled - * immediately. - */ -static void -icmp_bind_result(conn_t *connp, mblk_t *mp) -{ - struct T_error_ack *tea; - - switch (mp->b_datap->db_type) { - case M_PROTO: - case M_PCPROTO: - /* M_PROTO messages contain some type of TPI message. 
*/ - if ((mp->b_wptr - mp->b_rptr) < sizeof (t_scalar_t)) { - freemsg(mp); - return; - } - tea = (struct T_error_ack *)mp->b_rptr; - - switch (tea->PRIM_type) { - case T_ERROR_ACK: - switch (tea->ERROR_prim) { - case O_T_BIND_REQ: - case T_BIND_REQ: - icmp_bind_error(connp, mp); - return; - default: - break; - } - ASSERT(0); - freemsg(mp); - return; - - case T_BIND_ACK: - icmp_bind_ack(connp, mp); - return; - - default: - break; - } - freemsg(mp); - return; - default: - /* FIXME: other cases? */ - ASSERT(0); - freemsg(mp); - return; - } -} - -/* - * Process a T_BIND_ACK - */ -static void -icmp_bind_ack(conn_t *connp, mblk_t *mp) -{ - icmp_t *icmp = connp->conn_icmp; - mblk_t *mp1; - ire_t *ire; - struct T_bind_ack *tba; - uchar_t *addrp; - ipa_conn_t *ac; - ipa6_conn_t *ac6; - - rw_enter(&icmp->icmp_rwlock, RW_WRITER); - /* - * We know if headers are included or not so we can - * safely do this. - */ - if (icmp->icmp_state == TS_UNBND) { - /* - * TPI has not yet bound - bind sent by - * icmp_bind_proto. - */ - freemsg(mp); - rw_exit(&icmp->icmp_rwlock); - return; - } - ASSERT(icmp->icmp_pending_op != -1); - - /* - * If a broadcast/multicast address was bound set - * the source address to 0. - * This ensures no datagrams with broadcast address - * as source address are emitted (which would violate - * RFC1122 - Hosts requirements) - * - * Note that when connecting the returned IRE is - * for the destination address and we only perform - * the broadcast check for the source address (it - * is OK to connect to a broadcast/multicast address.) - */ - mp1 = mp->b_cont; - if (mp1 != NULL && mp1->b_datap->db_type == IRE_DB_TYPE) { - ire = (ire_t *)mp1->b_rptr; - /* - * Note: we get IRE_BROADCAST for IPv6 to "mark" a multicast - * local address. 
- */ - if (ire->ire_type == IRE_BROADCAST && - icmp->icmp_state != TS_DATA_XFER) { - ASSERT(icmp->icmp_pending_op == T_BIND_REQ || - icmp->icmp_pending_op == O_T_BIND_REQ); - /* This was just a local bind to a MC/broadcast addr */ - V6_SET_ZERO(icmp->icmp_v6src); - if (icmp->icmp_family == AF_INET6) - (void) icmp_build_hdrs(icmp); - } else if (V6_OR_V4_INADDR_ANY(icmp->icmp_v6src)) { - /* - * Local address not yet set - pick it from the - * T_bind_ack - */ - tba = (struct T_bind_ack *)mp->b_rptr; - addrp = &mp->b_rptr[tba->ADDR_offset]; - switch (icmp->icmp_family) { - case AF_INET: - if (tba->ADDR_length == sizeof (ipa_conn_t)) { - ac = (ipa_conn_t *)addrp; - } else { - ASSERT(tba->ADDR_length == - sizeof (ipa_conn_x_t)); - ac = &((ipa_conn_x_t *)addrp)->acx_conn; - } - IN6_IPADDR_TO_V4MAPPED(ac->ac_laddr, - &icmp->icmp_v6src); - break; - case AF_INET6: - if (tba->ADDR_length == sizeof (ipa6_conn_t)) { - ac6 = (ipa6_conn_t *)addrp; - } else { - ASSERT(tba->ADDR_length == - sizeof (ipa6_conn_x_t)); - ac6 = &((ipa6_conn_x_t *) - addrp)->ac6x_conn; +deliver: + if (IPCL_IS_NONSTR(connp)) { + if ((*connp->conn_upcalls->su_recv) + (connp->conn_upper_handle, mp, msgdsize(mp), 0, &error, + NULL) < 0) { + mutex_enter(&icmp->icmp_recv_lock); + if (error == ENOSPC) { + /* + * let's confirm while holding the lock + */ + if ((*connp->conn_upcalls->su_recv) + (connp->conn_upper_handle, NULL, 0, 0, + &error, NULL) < 0) { + if (error == ENOSPC) { + connp->conn_flow_cntrld = + B_TRUE; + } else { + ASSERT(error == EOPNOTSUPP); + } } - icmp->icmp_v6src = ac6->ac6_laddr; - (void) icmp_build_hdrs(icmp); + mutex_exit(&icmp->icmp_recv_lock); + } else { + ASSERT(error == EOPNOTSUPP); + icmp_queue_fallback(icmp, mp); } } - mp1 = mp1->b_cont; - } - icmp->icmp_pending_op = -1; - rw_exit(&icmp->icmp_rwlock); - /* - * Look for one or more appended ACK message added by - * icmp_connect or icmp_disconnect. - * If none found just send up the T_BIND_ACK. 
- * icmp_connect has appended a T_OK_ACK and a - * T_CONN_CON. - * icmp_disconnect has appended a T_OK_ACK. - */ - if (mp1 != NULL) { - if (mp->b_cont == mp1) - mp->b_cont = NULL; - else { - ASSERT(mp->b_cont->b_cont == mp1); - mp->b_cont->b_cont = NULL; - } - freemsg(mp); - mp = mp1; - while (mp != NULL) { - mp1 = mp->b_cont; - mp->b_cont = NULL; - putnext(connp->conn_rq, mp); - mp = mp1; - } - return; - } - freemsg(mp->b_cont); - mp->b_cont = NULL; - putnext(connp->conn_rq, mp); -} - -static void -icmp_bind_error(conn_t *connp, mblk_t *mp) -{ - icmp_t *icmp = connp->conn_icmp; - struct T_error_ack *tea; - - tea = (struct T_error_ack *)mp->b_rptr; - /* - * If our O_T_BIND_REQ/T_BIND_REQ fails, - * clear out the source address before - * passing the message upstream. - * If this was caused by a T_CONN_REQ - * revert back to bound state. - */ - rw_enter(&icmp->icmp_rwlock, RW_WRITER); - if (icmp->icmp_state == TS_UNBND) { - /* - * TPI has not yet bound - bind sent by icmp_bind_proto. - */ - freemsg(mp); - rw_exit(&icmp->icmp_rwlock); - return; - } - ASSERT(icmp->icmp_pending_op != -1); - tea->ERROR_prim = icmp->icmp_pending_op; - icmp->icmp_pending_op = -1; - - switch (tea->ERROR_prim) { - case T_CONN_REQ: - ASSERT(icmp->icmp_state == TS_DATA_XFER); - /* Connect failed */ - /* Revert back to the bound source */ - icmp->icmp_v6src = icmp->icmp_bound_v6src; - icmp->icmp_state = TS_IDLE; - if (icmp->icmp_family == AF_INET6) - (void) icmp_build_hdrs(icmp); - break; - - case T_DISCON_REQ: - case T_BIND_REQ: - case O_T_BIND_REQ: - V6_SET_ZERO(icmp->icmp_v6src); - V6_SET_ZERO(icmp->icmp_bound_v6src); - icmp->icmp_state = TS_UNBND; - if (icmp->icmp_family == AF_INET6) - (void) icmp_build_hdrs(icmp); - break; - default: - break; + } else { + putnext(connp->conn_rq, mp); } - rw_exit(&icmp->icmp_rwlock); - putnext(connp->conn_rq, mp); + ASSERT(MUTEX_NOT_HELD(&icmp->icmp_recv_lock)); } /* @@ -4121,7 +4132,8 @@ icmp_status_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 
(void) mi_mpprintf(mp, MI_COL_PTRFMT_STR "%s %s %s", (void *)icmp, - inet_ntop(AF_INET6, &icmp->icmp_v6dst, faddrbuf, + inet_ntop(AF_INET6, &icmp->icmp_v6dst.sin6_addr, + faddrbuf, sizeof (faddrbuf)), inet_ntop(AF_INET6, &icmp->icmp_v6src, laddrbuf, sizeof (laddrbuf)), @@ -4152,32 +4164,26 @@ icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err) freemsg(mp); } -/* - * This routine is called by icmp_wput to handle T_UNBIND_REQ messages. - * After some error checking, the message is passed downstream to ip. - */ -static void -icmp_unbind(queue_t *q, mblk_t *mp) + +static int +rawip_do_unbind(conn_t *connp) { - icmp_t *icmp = Q_TO_ICMP(q); + icmp_t *icmp = connp->conn_icmp; rw_enter(&icmp->icmp_rwlock, RW_WRITER); /* If a bind has not been done, we can't unbind. */ if (icmp->icmp_state == TS_UNBND || icmp->icmp_pending_op != -1) { rw_exit(&icmp->icmp_rwlock); - icmp_err_ack(q, mp, TOUTSTATE, 0); - return; + return (-TOUTSTATE); } icmp->icmp_pending_op = T_UNBIND_REQ; rw_exit(&icmp->icmp_rwlock); /* - * Pass the unbind to IP; T_UNBIND_REQ is larger than T_OK_ACK - * and therefore ip_unbind must never return NULL. + * Call ip to unbind */ - mp = ip_unbind(q, mp); - ASSERT(mp != NULL); - ASSERT(((struct T_ok_ack *)mp->b_rptr)->PRIM_type == T_OK_ACK); + + ip_unbind(connp); /* * Once we're unbound from IP, the pending operation may be cleared @@ -4191,17 +4197,54 @@ icmp_unbind(queue_t *q, mblk_t *mp) if (icmp->icmp_family == AF_INET6) (void) icmp_build_hdrs(icmp); rw_exit(&icmp->icmp_rwlock); + return (0); +} + +/* + * This routine is called by icmp_wput to handle T_UNBIND_REQ messages. + * After some error checking, the message is passed downstream to ip. 
+ */ +static void +icmp_tpi_unbind(queue_t *q, mblk_t *mp) +{ + conn_t *connp = Q_TO_CONN(q); + int error; + + ASSERT(mp->b_cont == NULL); + error = rawip_do_unbind(connp); + if (error) { + if (error < 0) { + icmp_err_ack(q, mp, -error, 0); + } else { + icmp_err_ack(q, mp, 0, error); + } + return; + } + + /* + * Convert mp into a T_OK_ACK + */ + + mp = mi_tpi_ok_ack_alloc(mp); + /* + * should not happen in practice... T_OK_ACK is smaller than the + * original message. + */ + ASSERT(mp != NULL); + ASSERT(((struct T_ok_ack *)mp->b_rptr)->PRIM_type == T_OK_ACK); qreply(q, mp); } + /* * Process IPv4 packets that already include an IP header. * Used when IP_HDRINCL has been set (implicit for IPPROTO_RAW and * IPPROTO_IGMP). */ -static void -icmp_wput_hdrincl(queue_t *q, mblk_t *mp, icmp_t *icmp, ip4_pkt_t *pktinfop) +static int +icmp_wput_hdrincl(queue_t *q, conn_t *connp, mblk_t *mp, icmp_t *icmp, + ip4_pkt_t *pktinfop) { icmp_stack_t *is = icmp->icmp_is; ipha_t *ipha; @@ -4210,7 +4253,6 @@ icmp_wput_hdrincl(queue_t *q, mblk_t *mp, icmp_t *icmp, ip4_pkt_t *pktinfop) mblk_t *mp1; uint_t pkt_len; ip_opt_info_t optinfo; - conn_t *connp = icmp->icmp_connp; optinfo.ip_opt_flags = 0; optinfo.ip_opt_ill_index = 0; @@ -4221,7 +4263,7 @@ icmp_wput_hdrincl(queue_t *q, mblk_t *mp, icmp_t *icmp, ip4_pkt_t *pktinfop) ASSERT(icmp != NULL); BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); freemsg(mp); - return; + return (0); } ipha = (ipha_t *)mp->b_rptr; } @@ -4266,7 +4308,7 @@ icmp_wput_hdrincl(queue_t *q, mblk_t *mp, icmp_t *icmp, ip4_pkt_t *pktinfop) BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); freemsg(mp); - return; + return (0); } ipha = (ipha_t *)mp->b_rptr; } @@ -4278,13 +4320,11 @@ icmp_wput_hdrincl(queue_t *q, mblk_t *mp, icmp_t *icmp, ip4_pkt_t *pktinfop) pkt_len = ntohs(ipha->ipha_length) + icmp->icmp_ip_snd_options_len; if (pkt_len > IP_MAXPACKET) { - icmp_ud_err(q, mp, EMSGSIZE); - return; + return (EMSGSIZE); } if (!(mp1 = allocb(ip_hdr_length + is->is_wroff_extra + 
tp_hdr_len, BPRI_LO))) { - icmp_ud_err(q, mp, ENOMEM); - return; + return (ENOMEM); } mp1->b_rptr += is->is_wroff_extra; mp1->b_wptr = mp1->b_rptr + ip_hdr_length; @@ -4329,10 +4369,11 @@ icmp_wput_hdrincl(queue_t *q, mblk_t *mp, icmp_t *icmp, ip4_pkt_t *pktinfop) mblk_setcred(mp, connp->conn_cred); ip_output_options(connp, mp, q, IP_WPUT, &optinfo); + return (0); } -static boolean_t -icmp_update_label(queue_t *q, icmp_t *icmp, mblk_t *mp, ipaddr_t dst) +static int +icmp_update_label(icmp_t *icmp, mblk_t *mp, ipaddr_t dst) { int err; uchar_t opt_storage[IP_MAX_OPT_LENGTH]; @@ -4351,13 +4392,12 @@ icmp_update_label(queue_t *q, icmp_t *icmp, mblk_t *mp, ipaddr_t dst) BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); DTRACE_PROBE4( tx__ip__log__drop__updatelabel__icmp, - char *, "queue(1) failed to update options(2) on mp(3)", - queue_t *, q, char *, opt_storage, mblk_t *, mp); - icmp_ud_err(q, mp, err); - return (B_FALSE); + char *, "icmp(1) failed to update options(2) on mp(3)", + icmp_t *, icmp, char *, opt_storage, mblk_t *, mp); + return (err); } IN6_IPADDR_TO_V4MAPPED(dst, &icmp->icmp_v6lastdst); - return (B_TRUE); + return (0); } /* @@ -4371,7 +4411,6 @@ icmp_wput(queue_t *q, mblk_t *mp) uchar_t *rptr = mp->b_rptr; ipha_t *ipha; mblk_t *mp1; - int ip_hdr_length; #define tudr ((struct T_unitdata_req *)rptr) size_t ip_len; conn_t *connp = Q_TO_CONN(q); @@ -4382,7 +4421,12 @@ icmp_wput(queue_t *q, mblk_t *mp) ipaddr_t v4dst; ip4_pkt_t pktinfo; ip4_pkt_t *pktinfop = &pktinfo; - ip_opt_info_t optinfo; + ip6_pkt_t ipp_s; /* For ancillary data options */ + ip6_pkt_t *ipp = &ipp_s; + int error; + + ipp->ipp_fields = 0; + ipp->ipp_sticky_ignored = 0; switch (mp->b_datap->db_type) { case M_DATA: @@ -4406,11 +4450,17 @@ icmp_wput(queue_t *q, mblk_t *mp) if (is_system_labeled() && (!IN6_IS_ADDR_V4MAPPED(&icmp->icmp_v6lastdst) || V4_PART_OF_V6(icmp->icmp_v6lastdst) != - ipha->ipha_dst) && - !icmp_update_label(q, icmp, mp, ipha->ipha_dst)) { - return; + ipha->ipha_dst)) { + error 
= icmp_update_label(icmp, mp, + ipha->ipha_dst); + if (error != 0) { + icmp_ud_err(q, mp, error); + return; + } } - icmp_wput_hdrincl(q, mp, icmp, NULL); + error = icmp_wput_hdrincl(q, connp, mp, icmp, NULL); + if (error != 0) + icmp_ud_err(q, mp, error); return; } freemsg(mp); @@ -4432,14 +4482,6 @@ icmp_wput(queue_t *q, mblk_t *mp) /* Handle T_UNITDATA_REQ messages here. */ - - - if (icmp->icmp_state == TS_UNBND) { - /* If a port has not been bound to the stream, fail. */ - BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - icmp_ud_err(q, mp, EPROTO); - return; - } mp1 = mp->b_cont; if (mp1 == NULL) { BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); @@ -4475,8 +4517,22 @@ icmp_wput(queue_t *q, mblk_t *mp) * Destination is a native IPv6 address. * Send out an IPv6 format packet. */ - icmp_wput_ipv6(q, mp, sin6, tudr->OPT_length); - return; + if (tudr->OPT_length != 0) { + int error; + + error = 0; + if (icmp_unitdata_opt_process(q, mp, &error, + (void *)ipp) < 0) { + /* failure */ + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + icmp_ud_err(q, mp, error); + return; + } + ASSERT(error == 0); + } + + error = raw_ip_send_data_v6(q, connp, mp1, sin6, ipp); + goto done; case AF_INET: sin = (sin_t *)&rptr[tudr->DEST_offset]; @@ -4497,9 +4553,6 @@ icmp_wput(queue_t *q, mblk_t *mp) pktinfop->ip4_ill_index = 0; pktinfop->ip4_addr = INADDR_ANY; - optinfo.ip_opt_flags = 0; - optinfo.ip_opt_ill_index = 0; - /* * If options passed in, feed it for verification and handling @@ -4522,7 +4575,48 @@ icmp_wput(queue_t *q, mblk_t *mp) * OPT_length/offset now potentially modified * and contain option setting results */ + } + error = raw_ip_send_data_v4(q, connp, mp1, v4dst, pktinfop); +done: + if (error != 0) { + icmp_ud_err(q, mp, error); + return; + } else { + mp->b_cont = NULL; + freeb(mp); + } +} + + +/* ARGSUSED */ +static void +icmp_wput_fallback(queue_t *q, mblk_t *mp) +{ +#ifdef DEBUG + cmn_err(CE_CONT, "icmp_wput_fallback: Message during fallback \n"); +#endif + freemsg(mp); +} + 
+static int +raw_ip_send_data_v4(queue_t *q, conn_t *connp, mblk_t *mp, ipaddr_t v4dst, + ip4_pkt_t *pktinfop) +{ + ipha_t *ipha; + size_t ip_len; + icmp_t *icmp = connp->conn_icmp; + icmp_stack_t *is = icmp->icmp_is; + int ip_hdr_length; + ip_opt_info_t optinfo; + + optinfo.ip_opt_flags = 0; + optinfo.ip_opt_ill_index = 0; + + if (icmp->icmp_state == TS_UNBND) { + /* If a port has not been bound to the stream, fail. */ + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + return (EPROTO); } if (v4dst == INADDR_ANY) @@ -4531,35 +4625,34 @@ icmp_wput(queue_t *q, mblk_t *mp) /* Check if our saved options are valid; update if not */ if (is_system_labeled() && (!IN6_IS_ADDR_V4MAPPED(&icmp->icmp_v6lastdst) || - V4_PART_OF_V6(icmp->icmp_v6lastdst) != v4dst) && - !icmp_update_label(q, icmp, mp, v4dst)) { - return; - } + V4_PART_OF_V6(icmp->icmp_v6lastdst) != v4dst)) { + int error = icmp_update_label(icmp, mp, v4dst); - /* Protocol 255 contains full IP headers */ - if (icmp->icmp_hdrincl) { - freeb(mp); - icmp_wput_hdrincl(q, mp1, icmp, pktinfop); - return; + if (error != 0) + return (error); } + /* Protocol 255 contains full IP headers */ + if (icmp->icmp_hdrincl) + return (icmp_wput_hdrincl(q, connp, mp, icmp, pktinfop)); /* Add an IP header */ ip_hdr_length = IP_SIMPLE_HDR_LENGTH + icmp->icmp_ip_snd_options_len; - ipha = (ipha_t *)&mp1->b_rptr[-ip_hdr_length]; - if ((uchar_t *)ipha < mp1->b_datap->db_base || - mp1->b_datap->db_ref != 1 || + ipha = (ipha_t *)&mp->b_rptr[-ip_hdr_length]; + if ((uchar_t *)ipha < mp->b_datap->db_base || + mp->b_datap->db_ref != 1 || !OK_32PTR(ipha)) { + mblk_t *mp1; if (!(mp1 = allocb(ip_hdr_length + is->is_wroff_extra, BPRI_LO))) { BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - icmp_ud_err(q, mp, ENOMEM); - return; + return (ENOMEM); } - mp1->b_cont = mp->b_cont; + mp1->b_cont = mp; ipha = (ipha_t *)mp1->b_datap->db_lim; mp1->b_wptr = (uchar_t *)ipha; ipha = (ipha_t *)((uchar_t *)ipha - ip_hdr_length); + mp = mp1; } #ifdef _BIG_ENDIAN /* Set 
version, header length, and tos */ @@ -4604,11 +4697,11 @@ icmp_wput(queue_t *q, mblk_t *mp) ipha->ipha_ident = IP_HDR_INCLUDED; /* Finish common formatting of the packet. */ - mp1->b_rptr = (uchar_t *)ipha; + mp->b_rptr = (uchar_t *)ipha; - ip_len = mp1->b_wptr - (uchar_t *)ipha; - if (mp1->b_cont != NULL) - ip_len += msgdsize(mp1->b_cont); + ip_len = mp->b_wptr - (uchar_t *)ipha; + if (mp->b_cont != NULL) + ip_len += msgdsize(mp->b_cont); /* * Set the length into the IP header. @@ -4618,13 +4711,11 @@ icmp_wput(queue_t *q, mblk_t *mp) */ if (ip_len > IP_MAXPACKET) { BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - icmp_ud_err(q, mp, EMSGSIZE); - return; + return (EMSGSIZE); } ipha->ipha_length = htons((uint16_t)ip_len); /* - * Copy in the destination address from the T_UNITDATA - * request + * Copy in the destination address request */ ipha->ipha_dst = v4dst; @@ -4645,16 +4736,14 @@ icmp_wput(queue_t *q, mblk_t *mp) (void) ip_massage_options(ipha, is->is_netstack); } - freeb(mp); BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); - mblk_setcred(mp1, connp->conn_cred); - ip_output_options(Q_TO_CONN(q), mp1, q, IP_WPUT, &optinfo); -#undef ipha -#undef tudr + mblk_setcred(mp, connp->conn_cred); + ip_output_options(connp, mp, q, IP_WPUT, &optinfo); + return (0); } -static boolean_t -icmp_update_label_v6(queue_t *wq, icmp_t *icmp, mblk_t *mp, in6_addr_t *dst) +static int +icmp_update_label_v6(icmp_t *icmp, mblk_t *mp, in6_addr_t *dst) { int err; uchar_t opt_storage[TSOL_MAX_IPV6_OPTION]; @@ -4672,33 +4761,30 @@ icmp_update_label_v6(queue_t *wq, icmp_t *icmp, mblk_t *mp, in6_addr_t *dst) BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); DTRACE_PROBE4( tx__ip__log__drop__updatelabel__icmp6, - char *, "queue(1) failed to update options(2) on mp(3)", - queue_t *, wq, char *, opt_storage, mblk_t *, mp); - icmp_ud_err(wq, mp, err); - return (B_FALSE); + char *, "icmp(1) failed to update options(2) on mp(3)", + icmp_t *, icmp, char *, opt_storage, mblk_t *, mp); + return (err); } 
icmp->icmp_v6lastdst = *dst; - return (B_TRUE); + return (0); } /* - * icmp_wput_ipv6(): + * raw_ip_send_data_v6(): * Assumes that icmp_wput did some sanity checking on the destination * address, but that the label may not yet be correct. */ -void -icmp_wput_ipv6(queue_t *q, mblk_t *mp, sin6_t *sin6, t_scalar_t tudr_optlen) +static int +raw_ip_send_data_v6(queue_t *q, conn_t *connp, mblk_t *mp, sin6_t *sin6, + ip6_pkt_t *ipp) { ip6_t *ip6h; - ip6i_t *ip6i; /* mp1->b_rptr even if no ip6i_t */ - mblk_t *mp1; + ip6i_t *ip6i; /* mp->b_rptr even if no ip6i_t */ int ip_hdr_len = IPV6_HDR_LEN; size_t ip_len; - icmp_t *icmp = Q_TO_ICMP(q); + icmp_t *icmp = connp->conn_icmp; icmp_stack_t *is = icmp->icmp_is; - ip6_pkt_t ipp_s; /* For ancillary data options */ - ip6_pkt_t *ipp = &ipp_s; ip6_pkt_t *tipp; uint32_t csum = 0; uint_t ignore = 0; @@ -4716,30 +4802,10 @@ icmp_wput_ipv6(queue_t *q, mblk_t *mp, sin6_t *sin6, t_scalar_t tudr_optlen) */ if (IN6_IS_ADDR_V4MAPPED(&icmp->icmp_v6src)) { BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - icmp_ud_err(q, mp, EADDRNOTAVAIL); - return; - } - - ipp->ipp_fields = 0; - ipp->ipp_sticky_ignored = 0; - - /* - * If TPI options passed in, feed it for verification and handling - */ - if (tudr_optlen != 0) { - int error; - - if (icmp_unitdata_opt_process(q, mp, &error, - (void *)ipp) < 0) { - /* failure */ - BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - icmp_ud_err(q, mp, error); - return; - } - ignore = ipp->ipp_sticky_ignored; - ASSERT(error == 0); + return (EADDRNOTAVAIL); } + ignore = ipp->ipp_sticky_ignored; if (sin6->sin6_scope_id != 0 && IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) { /* @@ -4763,9 +4829,12 @@ icmp_wput_ipv6(queue_t *q, mblk_t *mp, sin6_t *sin6, t_scalar_t tudr_optlen) * avoid blowing up our stack here. 
*/ if (is_system_labeled() && - !IN6_ARE_ADDR_EQUAL(&icmp->icmp_v6lastdst, &ip6_dst) && - !icmp_update_label_v6(q, icmp, mp, &ip6_dst)) { - return; + !IN6_ARE_ADDR_EQUAL(&icmp->icmp_v6lastdst, &ip6_dst)) { + int error = 0; + + error = icmp_update_label_v6(icmp, mp, &ip6_dst); + if (error != 0) + return (error); } /* @@ -4933,28 +5002,30 @@ no_options: ip_hdr_len += sizeof (ip6i_t); /* check/fix buffer config, setup pointers into it */ - mp1 = mp->b_cont; - ip6h = (ip6_t *)&mp1->b_rptr[-ip_hdr_len]; - if ((mp1->b_datap->db_ref != 1) || - ((unsigned char *)ip6h < mp1->b_datap->db_base) || + ip6h = (ip6_t *)&mp->b_rptr[-ip_hdr_len]; + if ((mp->b_datap->db_ref != 1) || + ((unsigned char *)ip6h < mp->b_datap->db_base) || !OK_32PTR(ip6h)) { + mblk_t *mp1; + /* Try to get everything in a single mblk next time */ if (ip_hdr_len > icmp->icmp_max_hdr_len) { icmp->icmp_max_hdr_len = ip_hdr_len; - (void) mi_set_sth_wroff(RD(q), + + (void) proto_set_tx_wroff(q == NULL ? NULL:RD(q), connp, icmp->icmp_max_hdr_len + is->is_wroff_extra); } mp1 = allocb(ip_hdr_len + is->is_wroff_extra, BPRI_LO); if (!mp1) { BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - icmp_ud_err(q, mp, ENOMEM); - return; + return (ENOMEM); } - mp1->b_cont = mp->b_cont; + mp1->b_cont = mp; mp1->b_wptr = mp1->b_datap->db_lim; ip6h = (ip6_t *)(mp1->b_wptr - ip_hdr_len); + mp = mp1; } - mp1->b_rptr = (unsigned char *)ip6h; + mp->b_rptr = (unsigned char *)ip6h; ip6i = (ip6i_t *)ip6h; #define ANCIL_OR_STICKY_PTR(f) ((is_sticky & f) ? &icmp->icmp_sticky_ipp : ipp) @@ -5140,27 +5211,25 @@ no_options: * We know that all extension headers will be in the same mblk * as the IPv6 header. */ - rth = ip_find_rthdr_v6(ip6h, mp1->b_wptr); + rth = ip_find_rthdr_v6(ip6h, mp->b_wptr); if (rth != NULL && rth->ip6r_segleft != 0) { if (rth->ip6r_type != IPV6_RTHDR_TYPE_0) { /* * Drop packet - only support Type 0 routing. * Notify the application as well. 
*/ - icmp_ud_err(q, mp, EPROTO); BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - return; + return (EPROTO); } /* * rth->ip6r_len is twice the number of * addresses in the header */ if (rth->ip6r_len & 0x1) { - icmp_ud_err(q, mp, EPROTO); BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - return; + return (EPROTO); } /* * Shuffle the routing header and ip6_dst @@ -5176,17 +5245,16 @@ no_options: * for subsequent hops. */ if (IN6_IS_ADDR_V4MAPPED(&ip6h->ip6_dst)) { - icmp_ud_err(q, mp, EADDRNOTAVAIL); BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - return; + return (EADDRNOTAVAIL); } } } - ip_len = mp1->b_wptr - (uchar_t *)ip6h - IPV6_HDR_LEN; - if (mp1->b_cont != NULL) - ip_len += msgdsize(mp1->b_cont); + ip_len = mp->b_wptr - (uchar_t *)ip6h - IPV6_HDR_LEN; + if (mp->b_cont != NULL) + ip_len += msgdsize(mp->b_cont); /* * Set the length into the IP header. @@ -5196,11 +5264,10 @@ no_options: */ if (ip_len > IP_MAXPACKET) { BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - icmp_ud_err(q, mp, EMSGSIZE); - return; + return (EMSGSIZE); } if (icmp->icmp_proto == IPPROTO_ICMPV6 || icmp->icmp_raw_checksum) { - uint_t cksum_off; /* From ip6i == mp1->b_rptr */ + uint_t cksum_off; /* From ip6i == mp->b_rptr */ uint16_t *cksum_ptr; uint_t ext_hdrs_len; @@ -5216,14 +5283,14 @@ no_options: * Note: ICMPv6 must always checksum the packet. */ cksum_off = ip_hdr_len + icmp->icmp_checksum_off; - if (cksum_off + sizeof (uint16_t) > mp1->b_wptr - mp1->b_rptr) { - if (!pullupmsg(mp1, cksum_off + sizeof (uint16_t))) { + if (cksum_off + sizeof (uint16_t) > mp->b_wptr - mp->b_rptr) { + if (!pullupmsg(mp, cksum_off + sizeof (uint16_t))) { BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); freemsg(mp); - return; + return (0); } - ip6i = (ip6i_t *)mp1->b_rptr; + ip6i = (ip6i_t *)mp->b_rptr; if (ip6i->ip6i_nxt == IPPROTO_RAW) ip6h = (ip6_t *)&ip6i[1]; else @@ -5244,11 +5311,10 @@ no_options: #endif ip6h->ip6_plen = (uint16_t)ip_len; - freeb(mp); - /* We're done. 
Pass the packet to IP */ BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); - ip_output_v6(icmp->icmp_connp, mp1, q, IP_WPUT); + ip_output_v6(icmp->icmp_connp, mp, q, IP_WPUT); + return (0); } static void @@ -5281,10 +5347,10 @@ icmp_wput_other(queue_t *q, mblk_t *mp) return; case O_T_BIND_REQ: case T_BIND_REQ: - icmp_bind(q, mp); + icmp_tpi_bind(q, mp); return; case T_CONN_REQ: - icmp_connect(q, mp); + icmp_tpi_connect(q, mp); return; case T_CAPABILITY_REQ: icmp_capability_req(q, mp); @@ -5301,7 +5367,7 @@ icmp_wput_other(queue_t *q, mblk_t *mp) icmp_ud_err(q, mp, EADDRNOTAVAIL); return; case T_UNBIND_REQ: - icmp_unbind(q, mp); + icmp_tpi_unbind(q, mp); return; case T_SVR4_OPTMGMT_REQ: @@ -5319,7 +5385,7 @@ icmp_wput_other(queue_t *q, mblk_t *mp) return; case T_DISCON_REQ: - icmp_disconnect(q, mp); + icmp_tpi_disconnect(q, mp); return; /* The following TPI message is not supported by icmp. */ @@ -5375,6 +5441,15 @@ icmp_wput_other(queue_t *q, mblk_t *mp) return; } break; + case _SIOCSOCKFALLBACK: + /* + * socket is falling back to be a + * streams socket. Nothing to do + */ + iocp->ioc_count = 0; + iocp->ioc_rval = 0; + qreply(q, mp); + return; default: break; } @@ -5398,10 +5473,8 @@ icmp_wput_iocdata(queue_t *q, mblk_t *mp) mblk_t *mp1; STRUCT_HANDLE(strbuf, sb); icmp_t *icmp; - in6_addr_t v6addr; - ipaddr_t v4addr; - uint32_t flowinfo = 0; - int addrlen; + uint_t addrlen; + uint_t error; /* Make sure it is one of ours. 
*/ switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { @@ -5458,81 +5531,34 @@ icmp_wput_iocdata(queue_t *q, mblk_t *mp) mi_copy_done(q, mp, EINVAL); return; } + + mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE); + + if (mp1 == NULL) + return; + + rw_enter(&icmp->icmp_rwlock, RW_READER); switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { case TI_GETMYNAME: - if (icmp->icmp_family == AF_INET) { - ASSERT(icmp->icmp_ipversion == IPV4_VERSION); - if (!IN6_IS_ADDR_V4MAPPED_ANY(&icmp->icmp_v6src) && - !IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) { - v4addr = V4_PART_OF_V6(icmp->icmp_v6src); - } else { - /* - * INADDR_ANY - * icmp_v6src is not set, we might be bound to - * broadcast/multicast. Use icmp_bound_v6src as - * local address instead (that could - * also still be INADDR_ANY) - */ - v4addr = V4_PART_OF_V6(icmp->icmp_bound_v6src); - } - } else { - /* icmp->icmp_family == AF_INET6 */ - if (!IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) { - v6addr = icmp->icmp_v6src; - } else { - /* - * UNSPECIFIED - * icmp_v6src is not set, we might be bound to - * broadcast/multicast. 
Use icmp_bound_v6src as - * local address instead (that could - * also still be UNSPECIFIED) - */ - v6addr = icmp->icmp_bound_v6src; - } - } + error = rawip_do_getsockname(icmp, (void *)mp1->b_rptr, + &addrlen); break; case TI_GETPEERNAME: - if (icmp->icmp_family == AF_INET) { - ASSERT(icmp->icmp_ipversion == IPV4_VERSION); - v4addr = V4_PART_OF_V6(icmp->icmp_v6dst); - } else { - /* icmp->icmp_family == AF_INET6) */ - v6addr = icmp->icmp_v6dst; - flowinfo = icmp->icmp_flowinfo; - } + error = rawip_do_getpeername(icmp, (void *)mp1->b_rptr, + &addrlen); break; - default: - mi_copy_done(q, mp, EPROTO); - return; } - mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE); - if (!mp1) - return; - - if (icmp->icmp_family == AF_INET) { - sin_t *sin; + rw_exit(&icmp->icmp_rwlock); - STRUCT_FSET(sb, len, (int)sizeof (sin_t)); - sin = (sin_t *)mp1->b_rptr; - mp1->b_wptr = (uchar_t *)&sin[1]; - *sin = sin_null; - sin->sin_family = AF_INET; - sin->sin_addr.s_addr = v4addr; + if (error != 0) { + mi_copy_done(q, mp, error); } else { - /* icmp->icmp_family == AF_INET6 */ - sin6_t *sin6; + mp1->b_wptr += addrlen; + STRUCT_FSET(sb, len, addrlen); - ASSERT(icmp->icmp_family == AF_INET6); - STRUCT_FSET(sb, len, (int)sizeof (sin6_t)); - sin6 = (sin6_t *)mp1->b_rptr; - mp1->b_wptr = (uchar_t *)&sin6[1]; - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - sin6->sin6_flowinfo = flowinfo; - sin6->sin6_addr = v6addr; + /* Copy out the address */ + mi_copyout(q, mp); } - /* Copy out the address */ - mi_copyout(q, mp); } static int @@ -5565,7 +5591,7 @@ icmp_unitdata_opt_process(queue_t *q, mblk_t *mp, int *errorp, } void -icmp_ddi_init(void) +icmp_ddi_g_init(void) { icmp_max_optsize = optcom_max_optsize(icmp_opt_obj.odb_opt_des_arr, icmp_opt_obj.odb_opt_arr_cnt); @@ -5579,11 +5605,13 @@ icmp_ddi_init(void) } void -icmp_ddi_destroy(void) +icmp_ddi_g_destroy(void) { netstack_unregister(NS_ICMP); } +#define INET_NAME "ip" + /* * Initialize the ICMP stack instance. 
*/ @@ -5592,6 +5620,8 @@ rawip_stack_init(netstackid_t stackid, netstack_t *ns) { icmp_stack_t *is; icmpparam_t *pa; + int error = 0; + major_t major; is = (icmp_stack_t *)kmem_zalloc(sizeof (*is), KM_SLEEP); is->is_netstack = ns; @@ -5603,6 +5633,10 @@ rawip_stack_init(netstackid_t stackid, netstack_t *ns) (void) icmp_param_register(&is->is_nd, is->is_param_arr, A_CNT(icmp_param_arr)); is->is_ksp = rawip_kstat_init(stackid); + + major = mod_name_to_major(INET_NAME); + error = ldi_ident_from_major(major, &is->is_ldi_ident); + ASSERT(error == 0); return (is); } @@ -5620,6 +5654,7 @@ rawip_stack_fini(netstackid_t stackid, void *arg) rawip_kstat_fini(stackid, is->is_ksp); is->is_ksp = NULL; + ldi_ident_release(is->is_ldi_ident); kmem_free(is, sizeof (*is)); } @@ -5691,3 +5726,848 @@ rawip_kstat_update(kstat_t *ksp, int rw) netstack_rele(ns); return (0); } + +/* ARGSUSED */ +int +rawip_accept(sock_lower_handle_t lproto_handle, + sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle, + cred_t *cr) +{ + return (EOPNOTSUPP); +} + +/* ARGSUSED */ +int +rawip_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa, + socklen_t len, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + int error; + + /* Binding to a NULL address really means unbind */ + if (sa == NULL) + error = rawip_do_unbind(connp); + else + error = rawip_do_bind(connp, sa, len); + + if (error < 0) { + if (error == -TOUTSTATE) + error = EINVAL; + else + error = proto_tlitosyserr(-error); + } + return (error); +} + +static int +rawip_implicit_bind(conn_t *connp) +{ + sin6_t sin6addr; + sin_t *sin; + sin6_t *sin6; + socklen_t len; + int error; + + if (connp->conn_icmp->icmp_family == AF_INET) { + len = sizeof (struct sockaddr_in); + sin = (sin_t *)&sin6addr; + *sin = sin_null; + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = INADDR_ANY; + } else { + ASSERT(connp->conn_icmp->icmp_family == AF_INET6); + len = sizeof (sin6_t); + sin6 = (sin6_t *)&sin6addr; + *sin6 = sin6_null; + 
sin6->sin6_family = AF_INET6; + V6_SET_ZERO(sin6->sin6_addr); + } + + error = rawip_do_bind(connp, (struct sockaddr *)&sin6addr, len); + + return ((error < 0) ? proto_tlitosyserr(-error) : error); +} + +static int +rawip_unbind(conn_t *connp) +{ + int error; + + error = rawip_do_unbind(connp); + if (error < 0) { + error = proto_tlitosyserr(-error); + } + return (error); +} + +/* ARGSUSED */ +int +rawip_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr) +{ + return (EOPNOTSUPP); +} + +/* ARGSUSED */ +int +rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, + socklen_t len, sock_connid_t *id, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + icmp_t *icmp = connp->conn_icmp; + int error; + boolean_t did_bind = B_FALSE; + + if (sa == NULL) { + /* + * Disconnect + * Make sure we are connected + */ + if (icmp->icmp_state != TS_DATA_XFER) + return (EINVAL); + + error = icmp_disconnect(connp); + return (error); + } + + error = proto_verify_ip_addr(icmp->icmp_family, sa, len); + if (error != 0) + return (error); + + /* do an implicit bind if necessary */ + if (icmp->icmp_state == TS_UNBND) { + error = rawip_implicit_bind(connp); + /* + * We could be racing with an actual bind, in which case + * we would see EPROTO. We cross our fingers and try + * to connect. 
+ */ + if (!(error == 0 || error == EPROTO)) + return (error); + did_bind = B_TRUE; + } + + /* + * set SO_DGRAM_ERRIND + */ + icmp->icmp_dgram_errind = B_TRUE; + + error = rawip_do_connect(connp, sa, len); + + if (error != 0 && did_bind) { + int unbind_err; + + unbind_err = rawip_unbind(connp); + ASSERT(unbind_err == 0); + } + + if (error == 0) { + *id = 0; + (*connp->conn_upcalls->su_connected) + (connp->conn_upper_handle, 0, NULL, -1); + } else if (error < 0) { + error = proto_tlitosyserr(-error); + } + return (error); +} + +/* ARGSUSED */ +void +rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q, + boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb) +{ + conn_t *connp = (conn_t *)proto_handle; + icmp_t *icmp; + struct T_capability_ack tca; + struct sockaddr_in6 laddr, faddr; + socklen_t laddrlen, faddrlen; + short opts; + struct stroptions *stropt; + mblk_t *stropt_mp; + int error; + + icmp = connp->conn_icmp; + + stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, STR_NOSIG, NULL); + + /* + * setup the fallback stream that was allocated + */ + connp->conn_dev = (dev_t)RD(q)->q_ptr; + connp->conn_minor_arena = WR(q)->q_ptr; + + RD(q)->q_ptr = WR(q)->q_ptr = connp; + + WR(q)->q_qinfo = &icmpwinit; + + connp->conn_rq = RD(q); + connp->conn_wq = WR(q); + + /* Notify stream head about options before sending up data */ + stropt_mp->b_datap->db_type = M_SETOPTS; + stropt_mp->b_wptr += sizeof (*stropt); + stropt = (struct stroptions *)stropt_mp->b_rptr; + stropt->so_flags = SO_WROFF | SO_HIWAT; + stropt->so_wroff = + (ushort_t)(icmp->icmp_max_hdr_len + icmp->icmp_is->is_wroff_extra); + stropt->so_hiwat = icmp->icmp_recv_hiwat; + putnext(RD(q), stropt_mp); + + /* + * free helper stream + */ + ip_close_helper_stream(connp); + + /* + * Collect the information needed to sync with the sonode + */ + icmp_do_capability_ack(icmp, &tca, TC1_INFO); + + laddrlen = faddrlen = sizeof (sin6_t); + (void) rawip_getsockname((sock_lower_handle_t)connp, + (struct 
sockaddr *)&laddr, &laddrlen, NULL); + error = rawip_getpeername((sock_lower_handle_t)connp, + (struct sockaddr *)&faddr, &faddrlen, NULL); + if (error != 0) + faddrlen = 0; + opts = 0; + if (icmp->icmp_dgram_errind) + opts |= SO_DGRAM_ERRIND; + if (icmp->icmp_dontroute) + opts |= SO_DONTROUTE; + + /* + * Once we grab the drain lock, no data will be send up + * to the socket. So we notify the socket that the endpoint + * is quiescent and it's therefore safe move data from + * the socket to the stream head. + */ + (*quiesced_cb)(connp->conn_upper_handle, q, &tca, + (struct sockaddr *)&laddr, laddrlen, + (struct sockaddr *)&faddr, faddrlen, opts); + + /* + * push up any packets that were queued in icmp_t + */ + + mutex_enter(&icmp->icmp_recv_lock); + while (icmp->icmp_fallback_queue_head != NULL) { + mblk_t *mp; + + mp = icmp->icmp_fallback_queue_head; + icmp->icmp_fallback_queue_head = mp->b_next; + mp->b_next = NULL; + mutex_exit(&icmp->icmp_recv_lock); + putnext(RD(q), mp); + mutex_enter(&icmp->icmp_recv_lock); + } + icmp->icmp_fallback_queue_tail = icmp->icmp_fallback_queue_head; + /* + * No longer a streams less socket + */ + connp->conn_flags &= ~IPCL_NONSTR; + mutex_exit(&icmp->icmp_recv_lock); + ASSERT(icmp->icmp_fallback_queue_head == NULL && + icmp->icmp_fallback_queue_tail == NULL); + + ASSERT(connp->conn_ref >= 1); +} + +/* ARGSUSED */ +sock_lower_handle_t +rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, + uint_t *smodep, int *errorp, int flags, cred_t *credp) +{ + conn_t *connp; + + if (type != SOCK_RAW || (family != AF_INET && family != AF_INET6)) { + *errorp = EPROTONOSUPPORT; + return (NULL); + } + + connp = icmp_open(family, credp, errorp, flags); + if (connp != NULL) { + icmp_stack_t *is; + + is = connp->conn_icmp->icmp_is; + connp->conn_flags |= IPCL_NONSTR; + + if (connp->conn_icmp->icmp_family == AF_INET6) { + /* Build initial header template for transmit */ + rw_enter(&connp->conn_icmp->icmp_rwlock, RW_WRITER); + 
if ((*errorp = + icmp_build_hdrs(connp->conn_icmp)) != 0) { + rw_exit(&connp->conn_icmp->icmp_rwlock); + ipcl_conn_destroy(connp); + return (NULL); + } + rw_exit(&connp->conn_icmp->icmp_rwlock); + } + + connp->conn_icmp->icmp_recv_hiwat = is->is_recv_hiwat; + connp->conn_icmp->icmp_xmit_hiwat = is->is_xmit_hiwat; + + if ((*errorp = ip_create_helper_stream(connp, + is->is_ldi_ident)) != 0) { + cmn_err(CE_CONT, "create of IP helper stream failed\n"); + (void) rawip_do_close(connp); + return (NULL); + } + + mutex_enter(&connp->conn_lock); + connp->conn_state_flags &= ~CONN_INCIPIENT; + mutex_exit(&connp->conn_lock); + *sock_downcalls = &sock_rawip_downcalls; + *smodep = SM_ATOMIC; + } else { + ASSERT(*errorp != 0); + } + + return ((sock_lower_handle_t)connp); +} + +/* ARGSUSED */ +void +rawip_activate(sock_lower_handle_t proto_handle, + sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, int flags, + cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + icmp_stack_t *is = connp->conn_icmp->icmp_is; + struct sock_proto_props sopp; + + connp->conn_upcalls = sock_upcalls; + connp->conn_upper_handle = sock_handle; + + sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | + SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ; + sopp.sopp_wroff = connp->conn_icmp->icmp_max_hdr_len + + is->is_wroff_extra; + sopp.sopp_rxhiwat = is->is_recv_hiwat; + sopp.sopp_rxlowat = icmp_mod_info.mi_lowat; + sopp.sopp_maxblk = INFPSZ; + sopp.sopp_maxpsz = IP_MAXPACKET; + sopp.sopp_minpsz = (icmp_mod_info.mi_minpsz == 1) ? 
0 : + icmp_mod_info.mi_minpsz; + + (*connp->conn_upcalls->su_set_proto_props) + (connp->conn_upper_handle, &sopp); +} + +static int +rawip_do_getsockname(icmp_t *icmp, struct sockaddr *sa, uint_t *salenp) +{ + sin_t *sin = (sin_t *)sa; + sin6_t *sin6 = (sin6_t *)sa; + + ASSERT(icmp != NULL); + ASSERT(RW_LOCK_HELD(&icmp->icmp_rwlock)); + + switch (icmp->icmp_family) { + case AF_INET: + ASSERT(icmp->icmp_ipversion == IPV4_VERSION); + if (*salenp < sizeof (sin_t)) + return (EINVAL); + + *salenp = sizeof (sin_t); + *sin = sin_null; + sin->sin_family = AF_INET; + if (icmp->icmp_state == TS_UNBND) { + break; + } + + if (!IN6_IS_ADDR_V4MAPPED_ANY(&icmp->icmp_v6src) && + !IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) { + sin->sin_addr.s_addr = V4_PART_OF_V6(icmp->icmp_v6src); + } else { + /* + * INADDR_ANY + * icmp_v6src is not set, we might be bound to + * broadcast/multicast. Use icmp_bound_v6src as + * local address instead (that could + * also still be INADDR_ANY) + */ + sin->sin_addr.s_addr = + V4_PART_OF_V6(icmp->icmp_bound_v6src); + } + break; + case AF_INET6: + + if (*salenp < sizeof (sin6_t)) + return (EINVAL); + + *salenp = sizeof (sin6_t); + *sin6 = sin6_null; + sin6->sin6_family = AF_INET6; + if (icmp->icmp_state == TS_UNBND) { + break; + } + if (!IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) { + sin6->sin6_addr = icmp->icmp_v6src; + } else { + /* + * UNSPECIFIED + * icmp_v6src is not set, we might be bound to + * broadcast/multicast. 
Use icmp_bound_v6src as + * local address instead (that could + * also still be UNSPECIFIED) + */ + + sin6->sin6_addr = icmp->icmp_bound_v6src; + } + break; + } + return (0); +} + +static int +rawip_do_getpeername(icmp_t *icmp, struct sockaddr *sa, uint_t *salenp) +{ + sin_t *sin = (sin_t *)sa; + sin6_t *sin6 = (sin6_t *)sa; + + ASSERT(icmp != NULL); + ASSERT(RW_LOCK_HELD(&icmp->icmp_rwlock)); + + if (icmp->icmp_state != TS_DATA_XFER) + return (ENOTCONN); + + sa->sa_family = icmp->icmp_family; + switch (icmp->icmp_family) { + case AF_INET: + ASSERT(icmp->icmp_ipversion == IPV4_VERSION); + + if (*salenp < sizeof (sin_t)) + return (EINVAL); + + *salenp = sizeof (sin_t); + *sin = sin_null; + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = + V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr); + break; + case AF_INET6: + if (*salenp < sizeof (sin6_t)) + return (EINVAL); + + *salenp = sizeof (sin6_t); + *sin6 = sin6_null; + *sin6 = icmp->icmp_v6dst; + break; + } + return (0); +} + +/* ARGSUSED */ +int +rawip_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa, + socklen_t *salenp, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + icmp_t *icmp = connp->conn_icmp; + int error; + + ASSERT(icmp != NULL); + + rw_enter(&icmp->icmp_rwlock, RW_READER); + + error = rawip_do_getpeername(icmp, sa, salenp); + + rw_exit(&icmp->icmp_rwlock); + + return (error); +} + +/* ARGSUSED */ +int +rawip_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa, + socklen_t *salenp, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + icmp_t *icmp = connp->conn_icmp; + int error; + + ASSERT(icmp != NULL); + rw_enter(&icmp->icmp_rwlock, RW_READER); + + error = rawip_do_getsockname(icmp, sa, salenp); + + rw_exit(&icmp->icmp_rwlock); + + return (error); +} + +int +rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, + const void *optvalp, socklen_t optlen, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + icmp_t *icmp = 
connp->conn_icmp; + int error; + + error = proto_opt_check(level, option_name, optlen, NULL, + icmp_opt_obj.odb_opt_des_arr, + icmp_opt_obj.odb_opt_arr_cnt, + icmp_opt_obj.odb_topmost_tpiprovider, + B_TRUE, B_FALSE, cr); + + if (error != 0) { + /* + * option not recognized + */ + if (error < 0) { + error = proto_tlitosyserr(-error); + } + return (error); + } + + rw_enter(&icmp->icmp_rwlock, RW_WRITER); + error = icmp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, + option_name, optlen, (uchar_t *)optvalp, (uint_t *)&optlen, + (uchar_t *)optvalp, NULL, cr); + rw_exit(&icmp->icmp_rwlock); + + if (error < 0) { + /* + * Pass on to ip + */ + error = ip_set_options(connp, level, option_name, optvalp, + optlen, cr); + } + + ASSERT(error >= 0); + + return (error); +} + +int +rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, + void *optvalp, socklen_t *optlen, cred_t *cr) +{ + int error; + conn_t *connp = (conn_t *)proto_handle; + icmp_t *icmp = connp->conn_icmp; + t_uscalar_t max_optbuf_len; + void *optvalp_buf; + int len; + + error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len, + icmp_opt_obj.odb_opt_des_arr, + icmp_opt_obj.odb_opt_arr_cnt, + icmp_opt_obj.odb_topmost_tpiprovider, + B_FALSE, B_TRUE, cr); + + if (error != 0) { + if (error < 0) { + error = proto_tlitosyserr(-error); + } + return (error); + } + + optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP); + rw_enter(&icmp->icmp_rwlock, RW_READER); + len = icmp_opt_get(connp, level, option_name, optvalp_buf); + rw_exit(&icmp->icmp_rwlock); + + if (len < 0) { + /* + * Pass on to IP + */ + kmem_free(optvalp_buf, max_optbuf_len); + return (ip_get_options(connp, level, option_name, optvalp, + optlen, cr)); + } else { + /* + * update optlen and copy option value + */ + t_uscalar_t size = MIN(len, *optlen); + bcopy(optvalp_buf, optvalp, size); + bcopy(&size, optlen, sizeof (size)); + + kmem_free(optvalp_buf, max_optbuf_len); + return (0); + } +} + +/* ARGSUSED */ +int 
+rawip_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + (void) rawip_do_close(connp); + return (0); +} + +/* ARGSUSED */ +int +rawip_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + + /* shut down the send side */ + if (how != SHUT_RD) + (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, + SOCK_OPCTL_SHUT_SEND, 0); + /* shut down the recv side */ + if (how != SHUT_WR) + (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, + SOCK_OPCTL_SHUT_RECV, 0); + return (0); +} + +void +rawip_clr_flowctrl(sock_lower_handle_t proto_handle) +{ + conn_t *connp = (conn_t *)proto_handle; + icmp_t *icmp = connp->conn_icmp; + + mutex_enter(&icmp->icmp_recv_lock); + connp->conn_flow_cntrld = B_FALSE; + mutex_exit(&icmp->icmp_recv_lock); +} + +int +rawip_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, + int mode, int32_t *rvalp, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + int error; + + switch (cmd) { + case ND_SET: + case ND_GET: + case _SIOCSOCKFALLBACK: + case TI_GETPEERNAME: + case TI_GETMYNAME: +#ifdef DEBUG + cmn_err(CE_CONT, "icmp_ioctl cmd 0x%x on non streams" + " socket", cmd); +#endif + error = EINVAL; + break; + default: + /* + * Pass on to IP using helper stream + */ + error = ldi_ioctl( + connp->conn_helper_info->ip_helper_stream_handle, + cmd, arg, mode, cr, rvalp); + break; + } + return (error); +} + +/* ARGSUSED */ +int +rawip_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, + cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + icmp_t *icmp = connp->conn_icmp; + icmp_stack_t *is = icmp->icmp_is; + int error = 0; + boolean_t bypass_dgram_errind = B_FALSE; + + ASSERT(DB_TYPE(mp) == M_DATA); + + if (is_system_labeled()) + msg_setcredpid(mp, cr, curproc->p_pid); + + /* do an implicit bind if necessary */ + if (icmp->icmp_state == TS_UNBND) { + error = rawip_implicit_bind(connp); + 
/* + * We could be racing with an actual bind, in which case + * we would see EPROTO. We cross our fingers and try + * to connect. + */ + if (!(error == 0 || error == EPROTO)) { + freemsg(mp); + return (error); + } + } + + rw_enter(&icmp->icmp_rwlock, RW_WRITER); + + if (msg->msg_name != NULL && icmp->icmp_state == TS_DATA_XFER) { + error = EISCONN; + goto done_lock; + } + + switch (icmp->icmp_family) { + case AF_INET6: { + sin6_t *sin6; + ip6_pkt_t ipp_s; /* For ancillary data options */ + ip6_pkt_t *ipp = &ipp_s; + + sin6 = (sin6_t *)msg->msg_name; + if (sin6 != NULL) { + error = proto_verify_ip_addr(icmp->icmp_family, + (struct sockaddr *)msg->msg_name, msg->msg_namelen); + if (error != 0) { + bypass_dgram_errind = B_TRUE; + goto done_lock; + } + if (icmp->icmp_delayed_error != 0) { + sin6_t *sin1 = (sin6_t *)msg->msg_name; + sin6_t *sin2 = (sin6_t *) + &icmp->icmp_delayed_addr; + + error = icmp->icmp_delayed_error; + icmp->icmp_delayed_error = 0; + + /* Compare IP address and port */ + + if (sin1->sin6_port == sin2->sin6_port && + IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr, + &sin2->sin6_addr)) { + goto done_lock; + } + } + } else { + /* + * Use connected address + */ + if (icmp->icmp_state != TS_DATA_XFER) { + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + error = EDESTADDRREQ; + bypass_dgram_errind = B_TRUE; + goto done_lock; + } + sin6 = &icmp->icmp_v6dst; + } + + /* No support for mapped addresses on raw sockets */ + if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + error = EADDRNOTAVAIL; + goto done_lock; + } + + ipp->ipp_fields = 0; + ipp->ipp_sticky_ignored = 0; + + /* + * If options passed in, feed it for verification and handling + */ + if (msg->msg_controllen != 0) { + error = process_auxiliary_options(connp, + msg->msg_control, msg->msg_controllen, + ipp, &icmp_opt_obj, icmp_opt_set); + if (error != 0) { + goto done_lock; + } + } + + rw_exit(&icmp->icmp_rwlock); + + /* + * Destination is a native IPv6 address. 
+ * Send out an IPv6 format packet. + */ + + error = raw_ip_send_data_v6(connp->conn_wq, connp, mp, sin6, + ipp); + } + break; + case AF_INET: { + sin_t *sin; + ip4_pkt_t pktinfo; + ip4_pkt_t *pktinfop = &pktinfo; + ipaddr_t v4dst; + + sin = (sin_t *)msg->msg_name; + if (sin != NULL) { + error = proto_verify_ip_addr(icmp->icmp_family, + (struct sockaddr *)msg->msg_name, msg->msg_namelen); + if (error != 0) { + bypass_dgram_errind = B_TRUE; + goto done_lock; + } + v4dst = sin->sin_addr.s_addr; + if (icmp->icmp_delayed_error != 0) { + sin_t *sin1 = (sin_t *)msg->msg_name; + sin_t *sin2 = (sin_t *)&icmp->icmp_delayed_addr; + + error = icmp->icmp_delayed_error; + icmp->icmp_delayed_error = 0; + + /* Compare IP address and port */ + if (sin1->sin_port == sin2->sin_port && + sin1->sin_addr.s_addr == + sin2->sin_addr.s_addr) { + goto done_lock; + } + + } + } else { + /* + * Use connected address + */ + if (icmp->icmp_state != TS_DATA_XFER) { + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + error = EDESTADDRREQ; + bypass_dgram_errind = B_TRUE; + goto done_lock; + } + v4dst = V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr); + } + + + pktinfop->ip4_ill_index = 0; + pktinfop->ip4_addr = INADDR_ANY; + + /* + * If options passed in, feed it for verification and handling + */ + if (msg->msg_controllen != 0) { + error = process_auxiliary_options(connp, + msg->msg_control, msg->msg_controllen, + pktinfop, &icmp_opt_obj, icmp_opt_set); + if (error != 0) { + goto done_lock; + } + } + rw_exit(&icmp->icmp_rwlock); + + error = raw_ip_send_data_v4(connp->conn_wq, connp, mp, + v4dst, pktinfop); + break; + } + + default: + ASSERT(0); + } + + goto done; + +done_lock: + rw_exit(&icmp->icmp_rwlock); + if (error != 0) { + ASSERT(mp != NULL); + freemsg(mp); + } +done: + if (bypass_dgram_errind) + return (error); + return (icmp->icmp_dgram_errind ? 
error : 0); +} + +sock_downcalls_t sock_rawip_downcalls = { + rawip_activate, + rawip_accept, + rawip_bind, + rawip_listen, + rawip_connect, + rawip_getpeername, + rawip_getsockname, + rawip_getsockopt, + rawip_setsockopt, + rawip_send, + NULL, + NULL, + NULL, + rawip_shutdown, + rawip_clr_flowctrl, + rawip_ioctl, + rawip_close +}; diff --git a/usr/src/uts/common/inet/ip/icmp_opt_data.c b/usr/src/uts/common/inet/ip/icmp_opt_data.c index 8769a7d3d4..4f15801dfb 100644 --- a/usr/src/uts/common/inet/ip/icmp_opt_data.c +++ b/usr/src/uts/common/inet/ip/icmp_opt_data.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/stream.h> #define _SUN_TPI_VERSION 2 @@ -52,8 +50,8 @@ extern int icmp_opt_default(queue_t *, int, int, uchar_t *); -extern int icmp_opt_get(queue_t *, int, int, uchar_t *); -extern int icmp_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *, +extern int icmp_tpi_opt_get(queue_t *, int, int, uchar_t *); +extern int icmp_tpi_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *, uint_t *, uchar_t *, void *, cred_t *, mblk_t *); /* @@ -96,10 +94,10 @@ opdes_t icmp_opt_arr[] = { { IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), - 40, -1 /* not initialized */ }, + IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, { T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), - 40, -1 /* not initialized */ }, + IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, { IP_HDRINCL, IPPROTO_IP, OA_R, OA_RW, OP_RAW, OP_PASSNEXT, sizeof (int), 0 }, @@ -347,8 +345,8 @@ uint_t icmp_max_optsize; /* initialized when ICMP driver is loaded */ optdb_obj_t icmp_opt_obj = { icmp_opt_default, /* ICMP default value function pointer */ - icmp_opt_get, /* ICMP get function 
pointer */ - icmp_opt_set, /* ICMP set function pointer */ + icmp_tpi_opt_get, /* ICMP get function pointer */ + icmp_tpi_opt_set, /* ICMP set function pointer */ B_TRUE, /* ICMP is tpi provider */ ICMP_OPT_ARR_CNT, /* ICMP option database count of entries */ icmp_opt_arr, /* ICMP option database */ diff --git a/usr/src/uts/common/inet/ip/icmpddi.c b/usr/src/uts/common/inet/ip/icmpddi.c index a5861d9120..dd0023c0c8 100644 --- a/usr/src/uts/common/inet/ip/icmpddi.c +++ b/usr/src/uts/common/inet/ip/icmpddi.c @@ -29,6 +29,9 @@ #include <sys/modctl.h> #include <inet/common.h> #include <inet/ip.h> +#include <inet/rawip_impl.h> +#include <sys/strsubr.h> +#include <sys/socketvar.h> #define INET_NAME "icmp" #define INET_MODDESC "ICMP dummy STREAMS module" @@ -36,6 +39,9 @@ #define INET_DEVMINOR 0 #define INET_DEVSTRTAB icmpinfov4 #define INET_MODSTRTAB dummymodinfo +#define INET_SOCKDESC "Rawip socket module" +#define INET_SOCK_PROTO_CREATE_FUNC (*rawip_create) +#define INET_SOCK_PROTO_FB_FUNC (*rawip_fallback) #define INET_DEVMTFLAGS D_MP #define INET_MODMTFLAGS D_MP diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c index b0eaa51983..3141cd914e 100644 --- a/usr/src/uts/common/inet/ip/ip.c +++ b/usr/src/uts/common/inet/ip/ip.c @@ -38,7 +38,6 @@ #include <sys/tihdr.h> #include <sys/xti_inet.h> #include <sys/ddi.h> -#include <sys/sunddi.h> #include <sys/cmn_err.h> #include <sys/debug.h> #include <sys/kobj.h> @@ -120,7 +119,6 @@ #include <inet/udp_impl.h> #include <inet/rawip_impl.h> #include <inet/rts_impl.h> -#include <sys/sunddi.h> #include <sys/tsol/label.h> #include <sys/tsol/tnet.h> @@ -625,7 +623,7 @@ uint_t ip_max_frag_dups = 10; #define IS_SIMPLE_IPH(ipha) \ ((ipha)->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION) -/* RFC1122 Conformance */ +/* RFC 1122 Conformance */ #define IP_FORWARD_DEFAULT IP_FORWARD_NEVER #define ILL_MAX_NAMELEN LIFNAMSIZ @@ -658,8 +656,7 @@ static void icmp_send_redirect(queue_t *, mblk_t *, ipaddr_t, 
ip_stack_t *); static void ip_arp_news(queue_t *, mblk_t *); -static boolean_t ip_bind_insert_ire(mblk_t *, ire_t *, iulp_t *, - ip_stack_t *); +static boolean_t ip_bind_get_ire_v4(mblk_t **, ire_t *, iulp_t *, ip_stack_t *); mblk_t *ip_dlpi_alloc(size_t, t_uscalar_t); char *ip_dot_addr(ipaddr_t, char *); mblk_t *ip_carve_mp(mblk_t **, ssize_t); @@ -770,6 +767,8 @@ static void ip_multirt_bad_mtu(ire_t *, uint32_t); static int ip_cgtp_filter_get(queue_t *, mblk_t *, caddr_t, cred_t *); static int ip_cgtp_filter_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); +extern int ip_helper_stream_setup(queue_t *, dev_t *, int, int, + cred_t *, boolean_t); static int ip_input_proc_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr); static int ip_int_set(queue_t *, mblk_t *, char *, caddr_t, @@ -1318,6 +1317,7 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { ip_sioctl_set_ipmpfailback, NULL }, /* SIOCSENABLESDP is handled by SDP */ /* 183 */ { IPI_DONTCARE /* SIOCSENABLESDP */, 0, 0, 0, NULL, NULL }, + /* 184 */ { IPI_DONTCARE /* SIOCSQPTR */, 0, 0, 0, NULL, NULL }, }; int ip_ndx_ioctl_count = sizeof (ip_ndx_ioctl_table) / sizeof (ip_ioctl_cmd_t); @@ -1373,7 +1373,8 @@ static ipha_t icmp_ipha = { }; struct module_info ip_mod_info = { - IP_MOD_ID, IP_MOD_NAME, 1, INFPSZ, 65536, 1024 + IP_MOD_ID, IP_MOD_NAME, IP_MOD_MINPSZ, IP_MOD_MAXPSZ, IP_MOD_HIWAT, + IP_MOD_LOWAT }; /* @@ -4334,6 +4335,23 @@ ip_bind_ipsec_policy_set(conn_t *connp, mblk_t *policy_mp) return (B_TRUE); } +static void +ip_bind_post_handling(conn_t *connp, mblk_t *mp, boolean_t ire_requested) +{ + /* + * Pass the IPsec headers size in ire_ipsec_overhead. + * We can't do this in ip_bind_get_ire because the policy + * may not have been inherited at that point in time and hence + * conn_out_enforce_policy may not be set. 
+ */ + if (ire_requested && connp->conn_out_enforce_policy && + mp != NULL && DB_TYPE(mp) == IRE_DB_REQ_TYPE) { + ire_t *ire = (ire_t *)mp->b_rptr; + ASSERT(MBLKL(mp) >= sizeof (ire_t)); + ire->ire_ipsec_overhead = conn_ipsec_length(connp); + } +} + /* * Upper level protocols (ULP) pass through bind requests to IP for inspection * and to arrange for power-fanout assist. The ULP is identified by @@ -4374,7 +4392,6 @@ ip_bind_v4(queue_t *q, mblk_t *mp, conn_t *connp) uchar_t *ucp; mblk_t *mp1; boolean_t ire_requested; - boolean_t ipsec_policy_set = B_FALSE; int error = 0; int protocol; ipa_conn_x_t *acx; @@ -4453,7 +4470,6 @@ ip_bind_v4(queue_t *q, mblk_t *mp, conn_t *connp) mp1 = mp->b_cont; ire_requested = (mp1 != NULL && DB_TYPE(mp1) == IRE_DB_REQ_TYPE); - ipsec_policy_set = (mp1 != NULL && DB_TYPE(mp1) == IPSEC_POLICY_SET); switch (tbr->ADDR_length) { default: @@ -4463,14 +4479,14 @@ ip_bind_v4(queue_t *q, mblk_t *mp, conn_t *connp) case IP_ADDR_LEN: /* Verification of local address only */ - error = ip_bind_laddr(connp, mp, *(ipaddr_t *)ucp, 0, - ire_requested, ipsec_policy_set, B_FALSE); + error = ip_bind_laddr_v4(connp, &mp1, protocol, + *(ipaddr_t *)ucp, 0, B_FALSE); break; case sizeof (sin_t): sin = (sin_t *)ucp; - error = ip_bind_laddr(connp, mp, sin->sin_addr.s_addr, - sin->sin_port, ire_requested, ipsec_policy_set, B_TRUE); + error = ip_bind_laddr_v4(connp, &mp1, protocol, + sin->sin_addr.s_addr, sin->sin_port, B_TRUE); break; case sizeof (ipa_conn_t): @@ -4479,9 +4495,9 @@ ip_bind_v4(queue_t *q, mblk_t *mp, conn_t *connp) if (ac->ac_lport == 0) ac->ac_lport = connp->conn_lport; /* Always verify destination reachability. 
*/ - error = ip_bind_connected(connp, mp, &ac->ac_laddr, - ac->ac_lport, ac->ac_faddr, ac->ac_fport, ire_requested, - ipsec_policy_set, B_TRUE, B_TRUE); + error = ip_bind_connected_v4(connp, &mp1, protocol, + &ac->ac_laddr, ac->ac_lport, ac->ac_faddr, ac->ac_fport, + B_TRUE, B_TRUE); break; case sizeof (ipa_conn_x_t): @@ -4490,29 +4506,17 @@ ip_bind_v4(queue_t *q, mblk_t *mp, conn_t *connp) * Whether or not to verify destination reachability depends * on the setting of the ACX_VERIFY_DST flag in acx->acx_flags. */ - error = ip_bind_connected(connp, mp, &acx->acx_conn.ac_laddr, - acx->acx_conn.ac_lport, acx->acx_conn.ac_faddr, - acx->acx_conn.ac_fport, ire_requested, ipsec_policy_set, + error = ip_bind_connected_v4(connp, &mp1, protocol, + &acx->acx_conn.ac_laddr, acx->acx_conn.ac_lport, + acx->acx_conn.ac_faddr, acx->acx_conn.ac_fport, B_TRUE, (acx->acx_flags & ACX_VERIFY_DST) != 0); break; } - if (error == EINPROGRESS) - return (NULL); - else if (error != 0) + ASSERT(error != EINPROGRESS); + if (error != 0) goto bad_addr; - /* - * Pass the IPsec headers size in ire_ipsec_overhead. - * We can't do this in ip_bind_insert_ire because the policy - * may not have been inherited at that point in time and hence - * conn_out_enforce_policy may not be set. - */ - mp1 = mp->b_cont; - if (ire_requested && connp->conn_out_enforce_policy && - mp1 != NULL && DB_TYPE(mp1) == IRE_DB_REQ_TYPE) { - ire_t *ire = (ire_t *)mp1->b_rptr; - ASSERT(MBLKL(mp1) >= sizeof (ire_t)); - ire->ire_ipsec_overhead = conn_ipsec_length(connp); - } + + ip_bind_post_handling(connp, mp->b_cont, ire_requested); /* Send it home. 
*/ mp->b_datap->db_type = M_PCPROTO; @@ -4539,7 +4543,7 @@ bad_addr: * upper protocol is expected to reset the src address * to 0 if it sees a IRE_BROADCAST type returned so that * no packets are emitted with broadcast/multicast address as - * source address (that violates hosts requirements RFC1122) + * source address (that violates hosts requirements RFC 1122) * The addresses valid for bind are: * (1) - INADDR_ANY (0) * (2) - IP address of an UP interface @@ -4561,21 +4565,26 @@ bad_addr: * matching IREs so bind has to look up based on the zone. * * Note: lport is in network byte order. + * */ int -ip_bind_laddr(conn_t *connp, mblk_t *mp, ipaddr_t src_addr, uint16_t lport, - boolean_t ire_requested, boolean_t ipsec_policy_set, - boolean_t fanout_insert) +ip_bind_laddr_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol, + ipaddr_t src_addr, uint16_t lport, boolean_t fanout_insert) { int error = 0; ire_t *src_ire; - mblk_t *policy_mp; - ipif_t *ipif; zoneid_t zoneid; ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + mblk_t *mp = NULL; + boolean_t ire_requested = B_FALSE; + boolean_t ipsec_policy_set = B_FALSE; - if (ipsec_policy_set) { - policy_mp = mp->b_cont; + if (mpp) + mp = *mpp; + + if (mp != NULL) { + ire_requested = (DB_TYPE(mp) == IRE_DB_REQ_TYPE); + ipsec_policy_set = (DB_TYPE(mp) == IPSEC_POLICY_SET); } /* @@ -4585,7 +4594,6 @@ ip_bind_laddr(conn_t *connp, mblk_t *mp, ipaddr_t src_addr, uint16_t lport, connp->conn_fully_bound = B_FALSE; src_ire = NULL; - ipif = NULL; zoneid = IPCL_ZONEID(connp); @@ -4598,7 +4606,7 @@ ip_bind_laddr(conn_t *connp, mblk_t *mp, ipaddr_t src_addr, uint16_t lport, * Note: Following code is in if-else-if form for * readability compared to a condition check. 
*/ - /* LINTED - statement has no consequent */ + /* LINTED - statement has no consequence */ if (IRE_IS_LOCAL(src_ire)) { /* * (2) Bind to address of local UP interface @@ -4617,20 +4625,10 @@ ip_bind_laddr(conn_t *connp, mblk_t *mp, ipaddr_t src_addr, uint16_t lport, * (ipif_lookup_addr() looks up all interfaces * but we do not get here for UP interfaces * - case (2) above) - * We put the protocol byte back into the mblk - * since we may come back via ip_wput_nondata() - * later with this mblk if ipif_lookup_addr chooses - * to defer processing. - */ - *mp->b_wptr++ = (char)connp->conn_ulp; - if ((ipif = ipif_lookup_addr(src_addr, NULL, zoneid, - CONNP_TO_WQ(connp), mp, ip_wput_nondata, - &error, ipst)) != NULL) { - ipif_refrele(ipif); - } else if (error == EINPROGRESS) { - if (src_ire != NULL) - ire_refrele(src_ire); - return (EINPROGRESS); + */ + /* LINTED - statement has no consequent */ + if (ip_addr_exists(src_addr, zoneid, ipst)) { + /* The address exists */ } else if (CLASSD(src_addr)) { error = 0; if (src_ire != NULL) @@ -4653,20 +4651,16 @@ ip_bind_laddr(conn_t *connp, mblk_t *mp, ipaddr_t src_addr, uint16_t lport, */ error = EADDRNOTAVAIL; } - /* - * Just to keep it consistent with the processing in - * ip_bind_v4() - */ - mp->b_wptr--; } if (error) { /* Red Alert! Attempting to be a bogon! */ - ip1dbg(("ip_bind: bad src address 0x%x\n", + ip1dbg(("ip_bind_laddr_v4: bad src address 0x%x\n", ntohl(src_addr))); goto bad_addr; } } + /* * Allow setting new policies. For example, disconnects come * down as ipa_t bind. 
As we would have set conn_policy_cached @@ -4690,17 +4684,17 @@ ip_bind_laddr(conn_t *connp, mblk_t *mp, ipaddr_t src_addr, uint16_t lport, /* * Do we need to add a check to reject Multicast packets */ - error = ipcl_bind_insert(connp, *mp->b_wptr, src_addr, lport); + error = ipcl_bind_insert(connp, protocol, src_addr, lport); } if (error == 0) { if (ire_requested) { - if (!ip_bind_insert_ire(mp, src_ire, NULL, ipst)) { + if (!ip_bind_get_ire_v4(mpp, src_ire, NULL, ipst)) { error = -1; /* Falls through to bad_addr */ } } else if (ipsec_policy_set) { - if (!ip_bind_ipsec_policy_set(connp, policy_mp)) { + if (!ip_bind_ipsec_policy_set(connp, mp)) { error = -1; /* Falls through to bad_addr */ } @@ -4717,15 +4711,32 @@ bad_addr: } if (src_ire != NULL) IRE_REFRELE(src_ire); - if (ipsec_policy_set) { - ASSERT(policy_mp == mp->b_cont); - ASSERT(policy_mp != NULL); - freeb(policy_mp); - /* - * As of now assume that nothing else accompanies - * IPSEC_POLICY_SET. - */ - mp->b_cont = NULL; + return (error); +} + +int +ip_proto_bind_laddr_v4(conn_t *connp, mblk_t **ire_mpp, uint8_t protocol, + ipaddr_t src_addr, uint16_t lport, boolean_t fanout_insert) +{ + int error; + mblk_t *mp = NULL; + boolean_t ire_requested; + + if (ire_mpp) + mp = *ire_mpp; + ire_requested = (mp != NULL && DB_TYPE(mp) == IRE_DB_REQ_TYPE); + + ASSERT(!connp->conn_af_isv6); + connp->conn_pkt_isv6 = B_FALSE; + connp->conn_ulp = protocol; + + error = ip_bind_laddr_v4(connp, ire_mpp, protocol, src_addr, lport, + fanout_insert); + if (error == 0) { + ip_bind_post_handling(connp, ire_mpp ? *ire_mpp : NULL, + ire_requested); + } else if (error < 0) { + error = -TBADADDR; } return (error); } @@ -4746,16 +4757,14 @@ bad_addr: * Note: lport and fport are in network byte order. 
*/ int -ip_bind_connected(conn_t *connp, mblk_t *mp, ipaddr_t *src_addrp, - uint16_t lport, ipaddr_t dst_addr, uint16_t fport, - boolean_t ire_requested, boolean_t ipsec_policy_set, +ip_bind_connected_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol, + ipaddr_t *src_addrp, uint16_t lport, ipaddr_t dst_addr, uint16_t fport, boolean_t fanout_insert, boolean_t verify_dst) { + ire_t *src_ire; ire_t *dst_ire; int error = 0; - int protocol; - mblk_t *policy_mp; ire_t *sire = NULL; ire_t *md_dst_ire = NULL; ire_t *lso_dst_ire = NULL; @@ -4763,25 +4772,33 @@ ip_bind_connected(conn_t *connp, mblk_t *mp, ipaddr_t *src_addrp, zoneid_t zoneid; ipaddr_t src_addr = *src_addrp; ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + mblk_t *mp = NULL; + boolean_t ire_requested = B_FALSE; + boolean_t ipsec_policy_set = B_FALSE; + ts_label_t *tsl = NULL; + + if (mpp) + mp = *mpp; + + if (mp != NULL) { + ire_requested = (DB_TYPE(mp) == IRE_DB_REQ_TYPE); + ipsec_policy_set = (DB_TYPE(mp) == IPSEC_POLICY_SET); + tsl = MBLK_GETLABEL(mp); + } src_ire = dst_ire = NULL; - protocol = *mp->b_wptr & 0xFF; /* * If we never got a disconnect before, clear it now. 
*/ connp->conn_fully_bound = B_FALSE; - if (ipsec_policy_set) { - policy_mp = mp->b_cont; - } - zoneid = IPCL_ZONEID(connp); if (CLASSD(dst_addr)) { /* Pick up an IRE_BROADCAST */ dst_ire = ire_route_lookup(ip_g_all_ones, 0, 0, 0, NULL, - NULL, zoneid, MBLK_GETLABEL(mp), + NULL, zoneid, tsl, (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE | MATCH_IRE_SECATTR), ipst); @@ -4804,11 +4821,11 @@ ip_bind_connected(conn_t *connp, mblk_t *mp, ipaddr_t *src_addrp, if (connp->conn_nexthop_set) { dst_ire = ire_route_lookup(connp->conn_nexthop_v4, 0, - 0, 0, NULL, NULL, zoneid, MBLK_GETLABEL(mp), + 0, 0, NULL, NULL, zoneid, tsl, MATCH_IRE_SECATTR, ipst); } else { dst_ire = ire_route_lookup(dst_addr, 0, 0, 0, NULL, - &sire, zoneid, MBLK_GETLABEL(mp), + &sire, zoneid, tsl, (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_PARENT | MATCH_IRE_RJ_BHOLE | MATCH_IRE_SECATTR), ipst); @@ -4840,8 +4857,9 @@ ip_bind_connected(conn_t *connp, mblk_t *mp, ipaddr_t *src_addrp, */ if (verify_dst || (dst_ire != NULL)) { if (ip_debug > 2) { - pr_addr_dbg("ip_bind_connected: bad connected " - "dst %s\n", AF_INET, &dst_addr); + pr_addr_dbg("ip_bind_connected_v4:" + "bad connected dst %s\n", + AF_INET, &dst_addr); } if (dst_ire == NULL || !(dst_ire->ire_type & IRE_HOST)) error = ENETUNREACH; @@ -4872,7 +4890,8 @@ ip_bind_connected(conn_t *connp, mblk_t *mp, ipaddr_t *src_addrp, connp->conn_mac_exempt, ipst) != 0) { error = EHOSTUNREACH; if (ip_debug > 2) { - pr_addr_dbg("ip_bind_connected: no label for dst %s\n", + pr_addr_dbg("ip_bind_connected_v4:" + " no label for dst %s\n", AF_INET, &dst_addr); } goto bad_addr; @@ -5056,7 +5075,7 @@ ip_bind_connected(conn_t *connp, mblk_t *mp, ipaddr_t *src_addrp, /* src_ire must be a local|loopback */ if (!IRE_IS_LOCAL(src_ire)) { if (ip_debug > 2) { - pr_addr_dbg("ip_bind_connected: bad connected " + pr_addr_dbg("ip_bind_connected_v4: bad connected " "src %s\n", AF_INET, &src_addr); } error = EADDRNOTAVAIL; @@ -5071,7 +5090,7 @@ 
ip_bind_connected(conn_t *connp, mblk_t *mp, ipaddr_t *src_addrp, */ if (src_ire->ire_type == IRE_LOOPBACK && !(IRE_IS_LOCAL(dst_ire) || CLASSD(dst_addr))) { - ip1dbg(("ip_bind_connected: bad connected loopback\n")); + ip1dbg(("ip_bind_connected_v4: bad connected loopback\n")); error = -1; goto bad_addr; } @@ -5114,12 +5133,13 @@ ip_bind_connected(conn_t *connp, mblk_t *mp, ipaddr_t *src_addrp, if (sire != NULL) { ulp_info = &(sire->ire_uinfo); } - if (!ip_bind_insert_ire(mp, dst_ire, ulp_info, ipst)) { + if (!ip_bind_get_ire_v4(mpp, dst_ire, ulp_info, ipst)) { error = -1; goto bad_addr; } + mp = *mpp; } else if (ipsec_policy_set) { - if (!ip_bind_ipsec_policy_set(connp, policy_mp)) { + if (!ip_bind_ipsec_policy_set(connp, mp)) { error = -1; goto bad_addr; } @@ -5171,27 +5191,36 @@ ip_bind_connected(conn_t *connp, mblk_t *mp, ipaddr_t *src_addrp, ASSERT(ill->ill_lso_capab != NULL); if ((lsoinfo_mp = ip_lsoinfo_return(lso_dst_ire, connp, - ill->ill_name, ill->ill_lso_capab)) != NULL) - linkb(mp, lsoinfo_mp); + ill->ill_name, ill->ill_lso_capab)) != NULL) { + if (mp == NULL) { + *mpp = lsoinfo_mp; + } else { + linkb(mp, lsoinfo_mp); + } + } } else if (md_dst_ire != NULL) { mblk_t *mdinfo_mp; ASSERT(ill->ill_mdt_capab != NULL); if ((mdinfo_mp = ip_mdinfo_return(md_dst_ire, connp, - ill->ill_name, ill->ill_mdt_capab)) != NULL) - linkb(mp, mdinfo_mp); + ill->ill_name, ill->ill_mdt_capab)) != NULL) { + if (mp == NULL) { + *mpp = mdinfo_mp; + } else { + linkb(mp, mdinfo_mp); + } + } } } bad_addr: if (ipsec_policy_set) { - ASSERT(policy_mp == mp->b_cont); - ASSERT(policy_mp != NULL); - freeb(policy_mp); + ASSERT(mp != NULL); + freeb(mp); /* * As of now assume that nothing else accompanies * IPSEC_POLICY_SET. 
*/ - mp->b_cont = NULL; + *mpp = NULL; } if (src_ire != NULL) IRE_REFRELE(src_ire); @@ -5206,32 +5235,62 @@ bad_addr: return (error); } +int +ip_proto_bind_connected_v4(conn_t *connp, mblk_t **ire_mpp, uint8_t protocol, + ipaddr_t *src_addrp, uint16_t lport, ipaddr_t dst_addr, uint16_t fport, + boolean_t fanout_insert, boolean_t verify_dst) +{ + int error; + mblk_t *mp = NULL; + boolean_t ire_requested; + + if (ire_mpp) + mp = *ire_mpp; + ire_requested = (mp != NULL && DB_TYPE(mp) == IRE_DB_REQ_TYPE); + + ASSERT(!connp->conn_af_isv6); + connp->conn_pkt_isv6 = B_FALSE; + connp->conn_ulp = protocol; + + /* For raw socket, the local port is not set. */ + if (lport == 0) + lport = connp->conn_lport; + error = ip_bind_connected_v4(connp, ire_mpp, protocol, + src_addrp, lport, dst_addr, fport, fanout_insert, verify_dst); + if (error == 0) { + ip_bind_post_handling(connp, ire_mpp ? *ire_mpp : NULL, + ire_requested); + } else if (error < 0) { + error = -TBADADDR; + } + return (error); +} + /* - * Insert the ire in b_cont. Returns false if it fails (due to lack of space). + * Get the ire in *mpp. Returns false if it fails (due to lack of space). * Prefers dst_ire over src_ire. */ static boolean_t -ip_bind_insert_ire(mblk_t *mp, ire_t *ire, iulp_t *ulp_info, ip_stack_t *ipst) +ip_bind_get_ire_v4(mblk_t **mpp, ire_t *ire, iulp_t *ulp_info, ip_stack_t *ipst) { - mblk_t *mp1; - ire_t *ret_ire = NULL; + mblk_t *mp = *mpp; + ire_t *ret_ire; - mp1 = mp->b_cont; - ASSERT(mp1 != NULL); + ASSERT(mp != NULL); if (ire != NULL) { /* - * mp1 initialized above to IRE_DB_REQ_TYPE + * mp initialized above to IRE_DB_REQ_TYPE * appended mblk. Its <upper protocol>'s * job to make sure there is room. 
*/ - if ((mp1->b_datap->db_lim - mp1->b_rptr) < sizeof (ire_t)) - return (0); + if ((mp->b_datap->db_lim - mp->b_rptr) < sizeof (ire_t)) + return (B_FALSE); - mp1->b_datap->db_type = IRE_DB_TYPE; - mp1->b_wptr = mp1->b_rptr + sizeof (ire_t); - bcopy(ire, mp1->b_rptr, sizeof (ire_t)); - ret_ire = (ire_t *)mp1->b_rptr; + mp->b_datap->db_type = IRE_DB_TYPE; + mp->b_wptr = mp->b_rptr + sizeof (ire_t); + bcopy(ire, mp->b_rptr, sizeof (ire_t)); + ret_ire = (ire_t *)mp->b_rptr; /* * Pass the latest setting of the ip_path_mtu_discovery and * copy the ulp info if any. @@ -5242,16 +5301,15 @@ ip_bind_insert_ire(mblk_t *mp, ire_t *ire, iulp_t *ulp_info, ip_stack_t *ipst) bcopy(ulp_info, &(ret_ire->ire_uinfo), sizeof (iulp_t)); } - ret_ire->ire_mp = mp1; + ret_ire->ire_mp = mp; } else { /* * No IRE was found. Remove IRE mblk. */ - mp->b_cont = mp1->b_cont; - freeb(mp1); + *mpp = mp->b_cont; + freeb(mp); } - - return (1); + return (B_TRUE); } /* @@ -5645,9 +5703,9 @@ ip_ddi_destroy(void) { tnet_fini(); - icmp_ddi_destroy(); - rts_ddi_destroy(); - udp_ddi_destroy(); + icmp_ddi_g_destroy(); + rts_ddi_g_destroy(); + udp_ddi_g_destroy(); sctp_ddi_g_destroy(); tcp_ddi_g_destroy(); ipsec_policy_g_destroy(); @@ -5814,6 +5872,7 @@ ip_stack_fini(netstackid_t stackid, void *arg) kmem_free(ipst->ips_ill_g_heads, sizeof (ill_g_head_t) * MAX_G_HEADS); ipst->ips_ill_g_heads = NULL; + ldi_ident_release(ipst->ips_ldi_ident); kmem_free(ipst, sizeof (*ipst)); } @@ -5898,9 +5957,9 @@ ip_ddi_init(void) tnet_init(); - udp_ddi_init(); - rts_ddi_init(); - icmp_ddi_init(); + udp_ddi_g_init(); + rts_ddi_g_init(); + icmp_ddi_g_init(); } /* @@ -5912,6 +5971,7 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns) ip_stack_t *ipst; ipparam_t *pa; ipndp_t *na; + major_t major; #ifdef NS_DEBUG printf("ip_stack_init(stack %d)\n", stackid); @@ -6011,6 +6071,8 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns) list_create(&ipst->ips_capab_taskq_list, sizeof (mblk_t), offsetof(mblk_t, b_next)); + major = 
mod_name_to_major(INET_NAME); + (void) ldi_ident_from_major(major, &ipst->ips_ldi_ident); return (ipst); } @@ -6353,7 +6415,7 @@ ip_fanout_proto(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, uint_t flags, } } - if (connp == NULL || connp->conn_upq == NULL) { + if (connp == NULL) { /* * No one bound to these addresses. Is * there a client that wants all @@ -6392,6 +6454,9 @@ ip_fanout_proto(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, uint_t flags, } return; } + + ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL); + CONN_INC_REF(connp); first_connp = connp; @@ -6415,7 +6480,7 @@ ip_fanout_proto(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, uint_t flags, /* * Copy the packet. */ - if (connp == NULL || connp->conn_upq == NULL || + if (connp == NULL || (((first_mp1 = dupmsg(first_mp)) == NULL) && ((first_mp1 = ip_copymsg(first_mp)) == NULL))) { /* @@ -6425,11 +6490,17 @@ ip_fanout_proto(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, uint_t flags, connp = first_connp; break; } + ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL); mp1 = mctl_present ? 
first_mp1->b_cont : first_mp1; CONN_INC_REF(connp); mutex_exit(&connfp->connf_lock); rq = connp->conn_rq; - if (!canputnext(rq)) { + + /* + * Check flow control + */ + if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) || + (!IPCL_IS_NONSTR(connp) && !canputnext(rq))) { if (flags & IP_FF_RAWIP) { BUMP_MIB(mibptr, rawipIfStatsInOverflows); } else { @@ -6527,7 +6598,11 @@ ip_fanout_proto(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, uint_t flags, } rq = connp->conn_rq; - if (!canputnext(rq)) { + /* + * Check flow control + */ + if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) || + (!IPCL_IS_NONSTR(connp) && !canputnext(rq))) { if (flags & IP_FF_RAWIP) { BUMP_MIB(mibptr, rawipIfStatsInOverflows); } else { @@ -6975,7 +7050,8 @@ ip_fanout_udp_conn(conn_t *connp, mblk_t *first_mp, mblk_t *mp, else first_mp = mp; - if (CONN_UDP_FLOWCTLD(connp)) { + if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) || + (!IPCL_IS_NONSTR(connp) && CONN_UDP_FLOWCTLD(connp))) { BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows); freemsg(first_mp); return; @@ -7166,9 +7242,12 @@ ip_fanout_udp(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, connp = connp->conn_next; } - if (connp == NULL || connp->conn_upq == NULL) + if (connp == NULL || + !IPCL_IS_NONSTR(connp) && connp->conn_upq == NULL) goto notfound; + ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL); + if (is_system_labeled() && !tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr, connp)) @@ -7202,9 +7281,12 @@ ip_fanout_udp(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, connp = connp->conn_next; } - if (connp == NULL || connp->conn_upq == NULL) + if (connp == NULL || + !IPCL_IS_NONSTR(connp) && connp->conn_upq == NULL) goto notfound; + ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL); + first_connp = connp; /* * When SO_REUSEADDR is not set, send the packet only to the first @@ -7321,7 +7403,8 @@ notfound: connp)) connp = NULL; - if (connp == NULL || connp->conn_upq == NULL) { + if (connp == 
NULL || + !IPCL_IS_NONSTR(connp) && connp->conn_upq == NULL) { /* * No one bound to this port. Is * there a client that wants all @@ -7349,6 +7432,7 @@ notfound: } return; } + ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL); CONN_INC_REF(connp); mutex_exit(&connfp->connf_lock); @@ -7377,7 +7461,8 @@ notfound: connp = connp->conn_next; } - if (connp == NULL || connp->conn_upq == NULL) { + if (connp == NULL || + !IPCL_IS_NONSTR(connp) && connp->conn_upq == NULL) { /* * No one bound to this port. Is * there a client that wants all @@ -7406,6 +7491,7 @@ notfound: } return; } + ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL); first_connp = connp; @@ -9774,6 +9860,15 @@ ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, return (ip_modopen(q, devp, flag, sflag, credp)); } + if ((flag & ~(FKLYR)) == IP_HELPER_STR) { + /* + * Non streams based socket looking for a stream + * to access IP + */ + return (ip_helper_stream_setup(q, devp, flag, sflag, + credp, isv6)); + } + ns = netstack_find_by_cred(credp); ASSERT(ns != NULL); ipst = ns->netstack_ip; @@ -10344,7 +10439,7 @@ ip_opt_set_ipif(conn_t *connp, ipaddr_t addr, boolean_t checkonly, int option, if (ipif == NULL) { if (error == EINPROGRESS) return (error); - else if ((option == IP_MULTICAST_IF) || + if ((option == IP_MULTICAST_IF) || (option == IP_NEXTHOP)) return (EHOSTUNREACH); else @@ -11611,7 +11706,6 @@ ip_opt_get(queue_t *q, int level, int name, uchar_t *ptr) } return (-1); } - /* Named Dispatch routine to get a current value out of our parameter table. 
*/ /* ARGSUSED */ static int @@ -12806,10 +12900,11 @@ ip_udp_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, if ((connp = ipcl_classify_v4(mp, IPPROTO_UDP, IP_SIMPLE_HDR_LENGTH, ire->ire_zoneid, ipst)) != NULL) { - ASSERT(connp->conn_upq != NULL); + ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL); IP_STAT(ipst, ip_udp_fast_path); - if (CONN_UDP_FLOWCTLD(connp)) { + if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) || + (!IPCL_IS_NONSTR(connp) && CONN_UDP_FLOWCTLD(connp))) { freemsg(mp); BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows); } else { @@ -20373,11 +20468,9 @@ ip_trash_ire_reclaim_stack(ip_stack_t *ipst) * upper level protocol. We remove this conn from any fanout hash list it is * on, and zero out the bind information. No reply is expected up above. */ -mblk_t * -ip_unbind(queue_t *q, mblk_t *mp) +void +ip_unbind(conn_t *connp) { - conn_t *connp = Q_TO_CONN(q); - ASSERT(!MUTEX_HELD(&connp->conn_lock)); if (is_system_labeled() && connp->conn_anon_port) { @@ -20390,20 +20483,6 @@ ip_unbind(queue_t *q, mblk_t *mp) ipcl_hash_remove(connp); - ASSERT(mp->b_cont == NULL); - /* - * Convert mp into a T_OK_ACK - */ - mp = mi_tpi_ok_ack_alloc(mp); - - /* - * should not happen in practice... T_OK_ACK is smaller than the - * original message. - */ - if (mp == NULL) - return (NULL); - - return (mp); } /* @@ -20475,11 +20554,13 @@ ip_output_options(void *arg, mblk_t *mp, void *arg2, int caller, ASSERT(connp != NULL); zoneid = connp->conn_zoneid; ipst = connp->conn_netstack->netstack_ip; + ASSERT(ipst != NULL); /* is queue flow controlled? */ if ((q->q_first != NULL || connp->conn_draining) && (caller == IP_WPUT)) { ASSERT(!need_decref); + ASSERT(!IP_FLOW_CONTROLLED_ULP(connp->conn_ulp)); (void) putq(q, mp); return; } @@ -21514,7 +21595,6 @@ dontroute: * connectivity. 
*/ ipha->ipha_ttl = 1; - /* If suitable ipif not found, drop packet */ dst_ipif = ipif_lookup_onlink_addr(dst, zoneid, ipst); if (dst_ipif == NULL) { @@ -23244,6 +23324,7 @@ blocked: * ip_wsrv will be scheduled or * is already running. */ + (void) putq(connp->conn_wq, first_mp); } @@ -27522,26 +27603,6 @@ ip_ioctl_finish(queue_t *q, mblk_t *mp, int err, int mode, ipsq_t *ipsq) ipsq_current_finish(ipsq); } -/* - * This is called from ip_wput_nondata to resume a deferred TCP bind. - */ -/* ARGSUSED */ -void -ip_resume_tcp_bind(void *arg, mblk_t *mp, void *arg2) -{ - conn_t *connp = arg; - tcp_t *tcp; - - ASSERT(connp != NULL && IPCL_IS_TCP(connp) && connp->conn_tcp != NULL); - tcp = connp->conn_tcp; - - if (connp->conn_tcp->tcp_state == TCPS_CLOSED) - freemsg(mp); - else - tcp_rput_other(tcp, mp); - CONN_OPER_PENDING_DONE(connp); -} - /* Called from ip_wput for all non data messages */ /* ARGSUSED */ void @@ -27782,8 +27843,9 @@ nak: case M_PROTO: case M_PCPROTO: /* - * The only PROTO messages we expect are ULP binds and - * copies of option negotiation acknowledgements. + * The only PROTO messages we expect are copies of option + * negotiation acknowledgements, AH and ESP bind requests + * are also expected. */ switch (((union T_primitives *)mp->b_rptr)->type) { case O_T_BIND_REQ: @@ -27809,37 +27871,15 @@ nak: mp = connp->conn_af_isv6 ? 
ip_bind_v6(q, mp, connp, NULL) : ip_bind_v4(q, mp, connp); - if (mp == NULL) - return; - if (IPCL_IS_TCP(connp)) { - /* - * In the case of TCP endpoint we - * come here only for bind retries - */ - ASSERT(ipsq != NULL); - CONN_INC_REF(connp); - SQUEUE_ENTER_ONE(connp->conn_sqp, mp, - ip_resume_tcp_bind, connp, - SQ_FILL, SQTAG_BIND_RETRY); - } else if (IPCL_IS_UDP(connp)) { - /* - * In the case of UDP endpoint we - * come here only for bind retries - */ - ASSERT(ipsq != NULL); - udp_resume_bind(connp, mp); - } else if (IPCL_IS_RAWIP(connp)) { - /* - * In the case of RAWIP endpoint we - * come here only for bind retries - */ - ASSERT(ipsq != NULL); - rawip_resume_bind(connp, mp); - } else { - /* The case of AH and ESP */ - qreply(q, mp); - CONN_OPER_PENDING_DONE(connp); - } + ASSERT(mp != NULL); + + ASSERT(!IPCL_IS_TCP(connp)); + ASSERT(!IPCL_IS_UDP(connp)); + ASSERT(!IPCL_IS_RAWIP(connp)); + + /* The case of AH and ESP */ + qreply(q, mp); + CONN_OPER_PENDING_DONE(connp); return; } case T_SVR4_OPTMGMT_REQ: @@ -27908,7 +27948,8 @@ nak: proto_str = "T_UNBIND_REQ"; goto protonak; } - mp = ip_unbind(q, mp); + ip_unbind(Q_TO_CONN(q)); + mp = mi_tpi_ok_ack_alloc(mp); qreply(q, mp); return; default: @@ -28582,6 +28623,11 @@ conn_drain_insert(conn_t *connp) head->conn_drain_prev->conn_drain_next = connp; head->conn_drain_prev = connp; } + /* + * For non streams based sockets assert flow control. + */ + (*connp->conn_upcalls->su_txq_full) + (connp->conn_upper_handle, B_TRUE); mutex_exit(CONN_DRAIN_LIST_LOCK(connp)); } @@ -28695,7 +28741,16 @@ conn_drain_tail(conn_t *connp, boolean_t closing) } connp->conn_drain_next = NULL; connp->conn_drain_prev = NULL; + + /* + * For non streams based sockets open up flow control. 
+ */ + if (IPCL_IS_NONSTR(connp)) { + (*connp->conn_upcalls->su_txq_full) + (connp->conn_upper_handle, B_FALSE); + } } + mutex_exit(CONN_DRAIN_LIST_LOCK(connp)); } @@ -28779,6 +28834,7 @@ ip_wsrv(queue_t *q) */ connp->conn_draining = 0; enableok(q); + } /* Enable the next conn for draining */ @@ -28941,7 +28997,7 @@ ip_conn_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr) "CONN " MI_COL_HDRPAD_STR "rfq " MI_COL_HDRPAD_STR "stq " MI_COL_HDRPAD_STR - " zone local remote"); + " zone local remote"); /* * Because of the ndd constraint, at most we can have 64K buffer @@ -29339,7 +29395,6 @@ ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t, ipaddr_t, ipaddr_t, return (or->or_private == CGTP_MCAST_SUCCESS ? 0 : error); } - /* * Issue a warning regarding a route crossing an interface with an * incorrect MTU. Only one message every 'ip_multirt_log_interval' diff --git a/usr/src/uts/common/inet/ip/ip6.c b/usr/src/uts/common/inet/ip/ip6.c index a1d97627b2..fe326778c2 100644 --- a/usr/src/uts/common/inet/ip/ip6.c +++ b/usr/src/uts/common/inet/ip/ip6.c @@ -191,13 +191,15 @@ static void icmp_inbound_too_big_v6(queue_t *, mblk_t *, ill_t *ill, static void icmp_pkt_v6(queue_t *, mblk_t *, void *, size_t, const in6_addr_t *, boolean_t, zoneid_t, ip_stack_t *); static void icmp_redirect_v6(queue_t *, mblk_t *, ill_t *ill); -static int ip_bind_connected_v6(conn_t *, mblk_t *, in6_addr_t *, +static int ip_bind_connected_v6(conn_t *, mblk_t **, uint8_t, in6_addr_t *, uint16_t, const in6_addr_t *, ip6_pkt_t *, uint16_t, - boolean_t, boolean_t, boolean_t, boolean_t); -static boolean_t ip_bind_insert_ire_v6(mblk_t *, ire_t *, const in6_addr_t *, + boolean_t, boolean_t); +static boolean_t ip_bind_get_ire_v6(mblk_t **, ire_t *, const in6_addr_t *, iulp_t *, ip_stack_t *); -static int ip_bind_laddr_v6(conn_t *, mblk_t *, const in6_addr_t *, - uint16_t, boolean_t, boolean_t, boolean_t); +static void ip_bind_post_handling_v6(conn_t *, mblk_t *, boolean_t, + boolean_t, 
ip_stack_t *); +static int ip_bind_laddr_v6(conn_t *, mblk_t **, uint8_t, + const in6_addr_t *, uint16_t, boolean_t); static void ip_fanout_proto_v6(queue_t *, mblk_t *, ip6_t *, ill_t *, ill_t *, uint8_t, uint_t, uint_t, boolean_t, zoneid_t); static void ip_fanout_tcp_v6(queue_t *, mblk_t *, ip6_t *, ill_t *, @@ -2071,12 +2073,8 @@ ip_bind_v6(queue_t *q, mblk_t *mp, conn_t *connp, ip6_pkt_t *ipp) uint16_t lport; uint16_t fport; uchar_t *ucp; - mblk_t *mp1; - boolean_t ire_requested; - boolean_t ipsec_policy_set; int error = 0; boolean_t local_bind; - boolean_t orig_pkt_isv6 = connp->conn_pkt_isv6; ipa6_conn_x_t *acx6; boolean_t verify_dst; ip_stack_t *ipst = connp->conn_netstack->netstack_ip; @@ -2145,9 +2143,6 @@ ip_bind_v6(queue_t *q, mblk_t *mp, conn_t *connp, ip6_pkt_t *ipp) ip1dbg(("ip_bind_v6: unaligned address\n")); goto bad_addr; } - mp1 = mp->b_cont; /* trailing mp if any */ - ire_requested = (mp1 && mp1->b_datap->db_type == IRE_DB_REQ_TYPE); - ipsec_policy_set = (mp1 && mp1->b_datap->db_type == IPSEC_POLICY_SET); switch (tbr->ADDR_length) { default: @@ -2173,9 +2168,6 @@ ip_bind_v6(queue_t *q, mblk_t *mp, conn_t *connp, ip6_pkt_t *ipp) /* * Verify that both the source and destination addresses * are valid. - * Note that we allow connect to broadcast and multicast - * addresses when ire_requested is set. Thus the ULP - * has to check for IRE_BROADCAST and multicast. */ ac6 = (ipa6_conn_t *)ucp; v6srcp = &ac6->ac6_laddr; @@ -2192,9 +2184,6 @@ ip_bind_v6(queue_t *q, mblk_t *mp, conn_t *connp, ip6_pkt_t *ipp) case sizeof (ipa6_conn_x_t): /* * Verify that the source address is valid. - * Note that we allow connect to broadcast and multicast - * addresses when ire_requested is set. Thus the ULP - * has to check for IRE_BROADCAST and multicast. 
*/ acx6 = (ipa6_conn_x_t *)ucp; ac6 = &acx6->ac6x_conn; @@ -2211,80 +2200,35 @@ ip_bind_v6(queue_t *q, mblk_t *mp, conn_t *connp, ip6_pkt_t *ipp) break; } if (local_bind) { - if (IN6_IS_ADDR_V4MAPPED(v6srcp) && !connp->conn_ipv6_v6only) { - /* Bind to IPv4 address */ - ipaddr_t v4src; - - IN6_V4MAPPED_TO_IPADDR(v6srcp, v4src); - - error = ip_bind_laddr(connp, mp, v4src, lport, - ire_requested, ipsec_policy_set, - tbr->ADDR_length != IPV6_ADDR_LEN); - if (error != 0) - goto bad_addr; - connp->conn_pkt_isv6 = B_FALSE; - } else { - if (IN6_IS_ADDR_V4MAPPED(v6srcp)) { - error = 0; - goto bad_addr; - } - error = ip_bind_laddr_v6(connp, mp, v6srcp, lport, - ire_requested, ipsec_policy_set, - (tbr->ADDR_length != IPV6_ADDR_LEN)); - if (error != 0) - goto bad_addr; - connp->conn_pkt_isv6 = B_TRUE; - } + error = ip_proto_bind_laddr_v6(connp, &mp->b_cont, protocol, + v6srcp, lport, tbr->ADDR_length != IPV6_ADDR_LEN); } else { - /* - * Bind to local and remote address. Local might be - * unspecified in which case it will be extracted from - * ire_src_addr_v6 - */ - if (IN6_IS_ADDR_V4MAPPED(v6dstp) && !connp->conn_ipv6_v6only) { - /* Connect to IPv4 address */ - ipaddr_t v4src; - ipaddr_t v4dst; - - /* Is the source unspecified or mapped? */ - if (!IN6_IS_ADDR_V4MAPPED(v6srcp) && - !IN6_IS_ADDR_UNSPECIFIED(v6srcp)) { - ip1dbg(("ip_bind_v6: " - "dst is mapped, but not the src\n")); - goto bad_addr; - } - IN6_V4MAPPED_TO_IPADDR(v6srcp, v4src); - IN6_V4MAPPED_TO_IPADDR(v6dstp, v4dst); - - /* - * XXX Fix needed. Need to pass ipsec_policy_set - * instead of B_FALSE. - */ + error = ip_proto_bind_connected_v6(connp, &mp->b_cont, protocol, + v6srcp, lport, v6dstp, ipp, fport, B_TRUE, verify_dst); + } - /* Always verify destination reachability. 
*/ - error = ip_bind_connected(connp, mp, &v4src, lport, - v4dst, fport, ire_requested, ipsec_policy_set, - B_TRUE, B_TRUE); - if (error != 0) - goto bad_addr; - IN6_IPADDR_TO_V4MAPPED(v4src, v6srcp); - connp->conn_pkt_isv6 = B_FALSE; - } else if (IN6_IS_ADDR_V4MAPPED(v6srcp)) { - ip1dbg(("ip_bind_v6: " - "src is mapped, but not the dst\n")); - goto bad_addr; - } else { - error = ip_bind_connected_v6(connp, mp, v6srcp, - lport, v6dstp, ipp, fport, ire_requested, - ipsec_policy_set, B_TRUE, verify_dst); - if (error != 0) - goto bad_addr; - connp->conn_pkt_isv6 = B_TRUE; - } + if (error == 0) { + /* Send it home. */ + mp->b_datap->db_type = M_PCPROTO; + tbr->PRIM_type = T_BIND_ACK; + return (mp); } +bad_addr: + ASSERT(error != EINPROGRESS); + if (error > 0) + mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error); + else + mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0); + return (mp); +} + +static void +ip_bind_post_handling_v6(conn_t *connp, mblk_t *mp, + boolean_t version_changed, boolean_t ire_requested, ip_stack_t *ipst) +{ /* Update conn_send and pktversion if v4/v6 changed */ - if (orig_pkt_isv6 != connp->conn_pkt_isv6) { + if (version_changed) { ip_setpktversion(connp, connp->conn_pkt_isv6, B_TRUE, ipst); } /* @@ -2293,27 +2237,12 @@ ip_bind_v6(queue_t *q, mblk_t *mp, conn_t *connp, ip6_pkt_t *ipp) * may not have been inherited at that point in time and hence * conn_out_enforce_policy may not be set. */ - mp1 = mp->b_cont; if (ire_requested && connp->conn_out_enforce_policy && - mp1 != NULL && DB_TYPE(mp1) == IRE_DB_REQ_TYPE) { - ire_t *ire = (ire_t *)mp1->b_rptr; - ASSERT(MBLKL(mp1) >= sizeof (ire_t)); + mp != NULL && DB_TYPE(mp) == IRE_DB_REQ_TYPE) { + ire_t *ire = (ire_t *)mp->b_rptr; + ASSERT(MBLKL(mp) >= sizeof (ire_t)); ire->ire_ipsec_overhead = (conn_ipsec_length(connp)); } - - /* Send it home. 
*/ - mp->b_datap->db_type = M_PCPROTO; - tbr->PRIM_type = T_BIND_ACK; - return (mp); - -bad_addr: - if (error == EINPROGRESS) - return (NULL); - if (error > 0) - mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error); - else - mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0); - return (mp); } /* @@ -2339,20 +2268,27 @@ bad_addr: * When the address is loopback or multicast, there might be many matching IREs * so bind has to look up based on the zone. */ +/* + * Verify the local IP address. Does not change the conn_t except + * conn_fully_bound and conn_policy_cached. + */ static int -ip_bind_laddr_v6(conn_t *connp, mblk_t *mp, const in6_addr_t *v6src, - uint16_t lport, boolean_t ire_requested, boolean_t ipsec_policy_set, - boolean_t fanout_insert) +ip_bind_laddr_v6(conn_t *connp, mblk_t **mpp, uint8_t protocol, + const in6_addr_t *v6src, uint16_t lport, boolean_t fanout_insert) { int error = 0; ire_t *src_ire = NULL; - ipif_t *ipif = NULL; - mblk_t *policy_mp; zoneid_t zoneid; + mblk_t *mp = NULL; + boolean_t ire_requested; + boolean_t ipsec_policy_set; ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - if (ipsec_policy_set) - policy_mp = mp->b_cont; + if (mpp) + mp = *mpp; + + ire_requested = (mp != NULL && DB_TYPE(mp) == IRE_DB_REQ_TYPE); + ipsec_policy_set = (mp != NULL && DB_TYPE(mp) == IPSEC_POLICY_SET); /* * If it was previously connected, conn_fully_bound would have @@ -2372,11 +2308,11 @@ ip_bind_laddr_v6(conn_t *connp, mblk_t *mp, const in6_addr_t *v6src, * readability compared to a condition check. 
*/ ASSERT(src_ire == NULL || !(src_ire->ire_type & IRE_BROADCAST)); + /* LINTED - statement has no consequent */ if (IRE_IS_LOCAL(src_ire)) { /* * (2) Bind to address of local UP interface */ - ipif = src_ire->ire_ipif; } else if (IN6_IS_ADDR_MULTICAST(v6src)) { ipif_t *multi_ipif = NULL; ire_t *save_ire; @@ -2418,28 +2354,12 @@ ip_bind_laddr_v6(conn_t *connp, mblk_t *mp, const in6_addr_t *v6src, if (multi_ipif != NULL) ipif_refrele(multi_ipif); } else { - *mp->b_wptr++ = (char)connp->conn_ulp; - ipif = ipif_lookup_addr_v6(v6src, NULL, zoneid, - CONNP_TO_WQ(connp), mp, ip_wput_nondata, &error, - ipst); - if (ipif == NULL) { - if (error == EINPROGRESS) { - if (src_ire != NULL) - ire_refrele(src_ire); - return (error); - } + if (!ip_addr_exists_v6(v6src, zoneid, ipst)) { /* * Not a valid address for bind */ error = EADDRNOTAVAIL; - } else { - ipif_refrele(ipif); } - /* - * Just to keep it consistent with the processing in - * ip_bind_v6(). - */ - mp->b_wptr--; } if (error != 0) { @@ -2471,17 +2391,18 @@ ip_bind_laddr_v6(conn_t *connp, mblk_t *mp, const in6_addr_t *v6src, connp->conn_remv6 = ipv6_all_zeros; connp->conn_lport = lport; connp->conn_fport = 0; - error = ipcl_bind_insert_v6(connp, *mp->b_wptr, v6src, lport); + error = ipcl_bind_insert_v6(connp, protocol, v6src, lport); } if (error == 0) { if (ire_requested) { - if (!ip_bind_insert_ire_v6(mp, src_ire, v6src, NULL, + if (!ip_bind_get_ire_v6(mpp, src_ire, v6src, NULL, ipst)) { error = -1; goto bad_addr; } + mp = *mpp; } else if (ipsec_policy_set) { - if (!ip_bind_ipsec_policy_set(connp, policy_mp)) { + if (!ip_bind_ipsec_policy_set(connp, mp)) { error = -1; goto bad_addr; } @@ -2501,54 +2422,70 @@ bad_addr: ire_refrele(src_ire); if (ipsec_policy_set) { - ASSERT(policy_mp != NULL); - freeb(policy_mp); + ASSERT(mp != NULL); + freeb(mp); /* * As of now assume that nothing else accompanies * IPSEC_POLICY_SET. 
*/ - mp->b_cont = NULL; + *mpp = NULL; } + return (error); } - -/* ARGSUSED */ -static void -ip_bind_connected_resume_v6(ipsq_t *ipsq, queue_t *q, mblk_t *mp, - void *dummy_arg) +int +ip_proto_bind_laddr_v6(conn_t *connp, mblk_t **mpp, uint8_t protocol, + const in6_addr_t *v6srcp, uint16_t lport, boolean_t fanout_insert) { - conn_t *connp = NULL; - t_scalar_t prim; + int error; + boolean_t ire_requested; + mblk_t *mp = NULL; + boolean_t orig_pkt_isv6 = connp->conn_pkt_isv6; + ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); + /* + * Note that we allow connect to broadcast and multicast + * address when ire_requested is set. Thus the ULP + * has to check for IRE_BROADCAST and multicast. + */ + if (mpp) + mp = *mpp; + ire_requested = (mp && DB_TYPE(mp) == IRE_DB_REQ_TYPE); - if (CONN_Q(q)) - connp = Q_TO_CONN(q); - ASSERT(connp != NULL); + ASSERT(connp->conn_af_isv6); + connp->conn_ulp = protocol; - prim = ((union T_primitives *)mp->b_rptr)->type; - ASSERT(prim == O_T_BIND_REQ || prim == T_BIND_REQ); + if (IN6_IS_ADDR_V4MAPPED(v6srcp) && !connp->conn_ipv6_v6only) { + /* Bind to IPv4 address */ + ipaddr_t v4src; - if (IPCL_IS_TCP(connp)) { - /* Pass sticky_ipp for scope_id and pktinfo */ - mp = ip_bind_v6(q, mp, connp, &connp->conn_tcp->tcp_sticky_ipp); + IN6_V4MAPPED_TO_IPADDR(v6srcp, v4src); + + error = ip_bind_laddr_v4(connp, mpp, protocol, v4src, lport, + fanout_insert); + if (error != 0) + goto bad_addr; + connp->conn_pkt_isv6 = B_FALSE; } else { - /* For UDP and ICMP */ - mp = ip_bind_v6(q, mp, connp, NULL); - } - if (mp != NULL) { - if (IPCL_IS_TCP(connp)) { - CONN_INC_REF(connp); - SQUEUE_ENTER_ONE(connp->conn_sqp, mp, - ip_resume_tcp_bind, connp, SQ_FILL, - SQTAG_TCP_RPUTOTHER); - } else if (IPCL_IS_UDP(connp)) { - udp_resume_bind(connp, mp); - } else { - ASSERT(IPCL_IS_RAWIP(connp)); - rawip_resume_bind(connp, mp); + if (IN6_IS_ADDR_V4MAPPED(v6srcp)) { + error = 0; + goto bad_addr; } + error 
= ip_bind_laddr_v6(connp, mpp, protocol, v6srcp, + lport, fanout_insert); + if (error != 0) + goto bad_addr; + connp->conn_pkt_isv6 = B_TRUE; } + + ip_bind_post_handling_v6(connp, mpp ? *mpp : NULL, + orig_pkt_isv6 != connp->conn_pkt_isv6, ire_requested, ipst); + return (0); + +bad_addr: + if (error < 0) + error = -TBADADDR; + return (error); } /* @@ -2562,42 +2499,43 @@ ip_bind_connected_resume_v6(ipsq_t *ipsq, queue_t *q, mblk_t *mp, * non-TCP cases, it is NULL and for all other tcp cases it is not useful. * */ -static int -ip_bind_connected_v6(conn_t *connp, mblk_t *mp, in6_addr_t *v6src, - uint16_t lport, const in6_addr_t *v6dst, ip6_pkt_t *ipp, uint16_t fport, - boolean_t ire_requested, boolean_t ipsec_policy_set, - boolean_t fanout_insert, boolean_t verify_dst) +int +ip_bind_connected_v6(conn_t *connp, mblk_t **mpp, uint8_t protocol, + in6_addr_t *v6src, uint16_t lport, const in6_addr_t *v6dst, + ip6_pkt_t *ipp, uint16_t fport, boolean_t fanout_insert, + boolean_t verify_dst) { ire_t *src_ire; ire_t *dst_ire; int error = 0; - int protocol; - mblk_t *policy_mp; ire_t *sire = NULL; ire_t *md_dst_ire = NULL; ill_t *md_ill = NULL; ill_t *dst_ill = NULL; ipif_t *src_ipif = NULL; zoneid_t zoneid; - boolean_t ill_held = B_FALSE; + boolean_t ill_held = B_FALSE; + mblk_t *mp = NULL; + boolean_t ire_requested = B_FALSE; + boolean_t ipsec_policy_set = B_FALSE; ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + ts_label_t *tsl = NULL; - src_ire = dst_ire = NULL; - /* - * NOTE: The protocol is beyond the wptr because that's how - * the undocumented transport<-->IP T_BIND_REQ behavior works. - */ - protocol = *mp->b_wptr & 0xFF; + if (mpp) + mp = *mpp; + + if (mp != NULL) { + ire_requested = (DB_TYPE(mp) == IRE_DB_REQ_TYPE); + ipsec_policy_set = (DB_TYPE(mp) == IPSEC_POLICY_SET); + tsl = MBLK_GETLABEL(mp); + } + src_ire = dst_ire = NULL; /* * If we never got a disconnect before, clear it now. 
*/ connp->conn_fully_bound = B_FALSE; - if (ipsec_policy_set) { - policy_mp = mp->b_cont; - } - zoneid = connp->conn_zoneid; if (IN6_IS_ADDR_MULTICAST(v6dst)) { @@ -2620,7 +2558,7 @@ ip_bind_connected_v6(conn_t *connp, mblk_t *mp, in6_addr_t *v6src, ipif = ipif_lookup_group_v6(v6dst, zoneid, ipst); } mutex_exit(&connp->conn_lock); - if (ipif == NULL || !ire_requested || + if (ipif == NULL || ire_requested || (dst_ire = ipif_to_ire_v6(ipif)) == NULL) { if (ipif != NULL) ipif_refrele(ipif); @@ -2637,7 +2575,7 @@ ip_bind_connected_v6(conn_t *connp, mblk_t *mp, in6_addr_t *v6src, ipif_refrele(ipif); } else { dst_ire = ire_route_lookup_v6(v6dst, NULL, NULL, 0, - NULL, &sire, zoneid, MBLK_GETLABEL(mp), + NULL, &sire, zoneid, tsl, MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_PARENT | MATCH_IRE_RJ_BHOLE | MATCH_IRE_SECATTR, ipst); @@ -2693,8 +2631,8 @@ ip_bind_connected_v6(conn_t *connp, mblk_t *mp, in6_addr_t *v6src, */ if (dst_ire != NULL && is_system_labeled() && !IPCL_IS_TCP(connp) && - tsol_compute_label_v6(DB_CREDDEF(mp, connp->conn_cred), v6dst, NULL, - connp->conn_mac_exempt, ipst) != 0) { + tsol_compute_label_v6(DB_CREDDEF(mp, connp->conn_cred), + v6dst, NULL, connp->conn_mac_exempt, ipst) != 0) { error = EHOSTUNREACH; if (ip_debug > 2) { pr_addr_dbg("ip_bind_connected: no label for dst %s\n", @@ -2831,25 +2769,24 @@ ip_bind_connected_v6(conn_t *connp, mblk_t *mp, in6_addr_t *v6src, /* No need to hold ill here */ dst_ill = dst_ire->ire_ipif->ipif_ill; } - if (!ip6_asp_can_lookup(ipst)) { - *mp->b_wptr++ = (char)protocol; - ip6_asp_pending_op(CONNP_TO_WQ(connp), mp, - ip_bind_connected_resume_v6); - error = EINPROGRESS; - goto refrele_and_quit; - } - src_ipif = ipif_select_source_v6(dst_ill, v6dst, - RESTRICT_TO_NONE, connp->conn_src_preferences, - zoneid); - ip6_asp_table_refrele(ipst); - if (src_ipif == NULL) { - pr_addr_dbg("ip_bind_connected_v6: " - "no usable source address for " - "connection to %s\n", AF_INET6, v6dst); + if (ip6_asp_can_lookup(ipst)) 
{ + src_ipif = ipif_select_source_v6(dst_ill, + v6dst, RESTRICT_TO_NONE, + connp->conn_src_preferences, zoneid); + ip6_asp_table_refrele(ipst); + if (src_ipif == NULL) { + pr_addr_dbg("ip_bind_connected_v6: " + "no usable source address for " + "connection to %s\n", + AF_INET6, v6dst); + error = EADDRNOTAVAIL; + goto bad_addr; + } + *v6src = src_ipif->ipif_v6lcl_addr; + } else { error = EADDRNOTAVAIL; goto bad_addr; } - *v6src = src_ipif->ipif_v6lcl_addr; } } @@ -2922,13 +2859,13 @@ ip_bind_connected_v6(conn_t *connp, mblk_t *mp, in6_addr_t *v6src, if (sire != NULL) ulp_info = &(sire->ire_uinfo); - if (!ip_bind_insert_ire_v6(mp, dst_ire, v6dst, ulp_info, + if (!ip_bind_get_ire_v6(mpp, dst_ire, v6dst, ulp_info, ipst)) { error = -1; goto bad_addr; } } else if (ipsec_policy_set) { - if (!ip_bind_ipsec_policy_set(connp, policy_mp)) { + if (!ip_bind_ipsec_policy_set(connp, mp)) { error = -1; goto bad_addr; } @@ -2982,19 +2919,24 @@ ip_bind_connected_v6(conn_t *connp, mblk_t *mp, in6_addr_t *v6src, ASSERT(md_ill != NULL); ASSERT(md_ill->ill_mdt_capab != NULL); if ((mdinfo_mp = ip_mdinfo_return(md_dst_ire, connp, - md_ill->ill_name, md_ill->ill_mdt_capab)) != NULL) - linkb(mp, mdinfo_mp); + md_ill->ill_name, md_ill->ill_mdt_capab)) != NULL) { + if (mp == NULL) { + *mpp = mdinfo_mp; + } else { + linkb(mp, mdinfo_mp); + } + } } } bad_addr: if (ipsec_policy_set) { - ASSERT(policy_mp != NULL); - freeb(policy_mp); + ASSERT(mp != NULL); + freeb(mp); /* * As of now assume that nothing else accompanies * IPSEC_POLICY_SET. 
*/ - mp->b_cont = NULL; + *mpp = NULL; } refrele_and_quit: if (src_ire != NULL) @@ -3012,34 +2954,110 @@ refrele_and_quit: return (error); } +/* ARGSUSED */ +int +ip_proto_bind_connected_v6(conn_t *connp, mblk_t **mpp, uint8_t protocol, + in6_addr_t *v6srcp, uint16_t lport, const in6_addr_t *v6dstp, + ip6_pkt_t *ipp, uint16_t fport, boolean_t fanout_insert, + boolean_t verify_dst) +{ + int error = 0; + boolean_t orig_pkt_isv6 = connp->conn_pkt_isv6; + boolean_t ire_requested; + ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + + /* + * Note that we allow connect to broadcast and multicast + * address when ire_requested is set. Thus the ULP + * has to check for IRE_BROADCAST and multicast. + */ + ASSERT(mpp != NULL); + ire_requested = (*mpp != NULL && DB_TYPE(*mpp) == IRE_DB_REQ_TYPE); + + ASSERT(connp->conn_af_isv6); + connp->conn_ulp = protocol; + + /* For raw socket, the local port is not set. */ + lport = lport != 0 ? lport : connp->conn_lport; + + /* + * Bind to local and remote address. Local might be + * unspecified in which case it will be extracted from + * ire_src_addr_v6 + */ + if (IN6_IS_ADDR_V4MAPPED(v6dstp) && !connp->conn_ipv6_v6only) { + /* Connect to IPv4 address */ + ipaddr_t v4src; + ipaddr_t v4dst; + + /* Is the source unspecified or mapped? */ + if (!IN6_IS_ADDR_V4MAPPED(v6srcp) && + !IN6_IS_ADDR_UNSPECIFIED(v6srcp)) { + ip1dbg(("ip_proto_bind_connected_v6: " + "dst is mapped, but not the src\n")); + goto bad_addr; + } + IN6_V4MAPPED_TO_IPADDR(v6srcp, v4src); + IN6_V4MAPPED_TO_IPADDR(v6dstp, v4dst); + + /* Always verify destination reachability. 
*/ + error = ip_bind_connected_v4(connp, mpp, protocol, &v4src, + lport, v4dst, fport, B_TRUE, B_TRUE); + if (error != 0) + goto bad_addr; + IN6_IPADDR_TO_V4MAPPED(v4src, v6srcp); + connp->conn_pkt_isv6 = B_FALSE; + } else if (IN6_IS_ADDR_V4MAPPED(v6srcp)) { + ip1dbg(("ip_proto_bind_connected_v6: " + "src is mapped, but not the dst\n")); + goto bad_addr; + } else { + error = ip_bind_connected_v6(connp, mpp, protocol, v6srcp, + lport, v6dstp, ipp, fport, B_TRUE, verify_dst); + if (error != 0) + goto bad_addr; + connp->conn_pkt_isv6 = B_TRUE; + } + + ip_bind_post_handling_v6(connp, mpp ? *mpp : NULL, + orig_pkt_isv6 != connp->conn_pkt_isv6, ire_requested, ipst); + + /* Send it home. */ + return (0); + +bad_addr: + if (error == 0) + error = -TBADADDR; + return (error); +} + /* - * Insert the ire in b_cont. Returns false if it fails (due to lack of space). + * Get the ire in *mpp. Returns false if it fails (due to lack of space). * Makes the IRE be IRE_BROADCAST if dst is a multicast address. */ /* ARGSUSED4 */ static boolean_t -ip_bind_insert_ire_v6(mblk_t *mp, ire_t *ire, const in6_addr_t *dst, +ip_bind_get_ire_v6(mblk_t **mpp, ire_t *ire, const in6_addr_t *dst, iulp_t *ulp_info, ip_stack_t *ipst) { - mblk_t *mp1; + mblk_t *mp = *mpp; ire_t *ret_ire; - mp1 = mp->b_cont; - ASSERT(mp1 != NULL); + ASSERT(mp != NULL); if (ire != NULL) { /* - * mp1 initialized above to IRE_DB_REQ_TYPE + * mp initialized above to IRE_DB_REQ_TYPE * appended mblk. Its <upper protocol>'s * job to make sure there is room. 
*/ - if ((mp1->b_datap->db_lim - mp1->b_rptr) < sizeof (ire_t)) + if ((mp->b_datap->db_lim - mp->b_rptr) < sizeof (ire_t)) return (B_FALSE); - mp1->b_datap->db_type = IRE_DB_TYPE; - mp1->b_wptr = mp1->b_rptr + sizeof (ire_t); - bcopy(ire, mp1->b_rptr, sizeof (ire_t)); - ret_ire = (ire_t *)mp1->b_rptr; + mp->b_datap->db_type = IRE_DB_TYPE; + mp->b_wptr = mp->b_rptr + sizeof (ire_t); + bcopy(ire, mp->b_rptr, sizeof (ire_t)); + ret_ire = (ire_t *)mp->b_rptr; if (IN6_IS_ADDR_MULTICAST(dst) || IN6_IS_ADDR_V4MAPPED_CLASSD(dst)) { ret_ire->ire_type = IRE_BROADCAST; @@ -3049,13 +3067,13 @@ ip_bind_insert_ire_v6(mblk_t *mp, ire_t *ire, const in6_addr_t *dst, bcopy(ulp_info, &(ret_ire->ire_uinfo), sizeof (iulp_t)); } - ret_ire->ire_mp = mp1; + ret_ire->ire_mp = mp; } else { /* * No IRE was found. Remove IRE mblk. */ - mp->b_cont = mp1->b_cont; - freeb(mp1); + *mpp = mp->b_cont; + freeb(mp); } return (B_TRUE); } @@ -3168,7 +3186,7 @@ ip_fanout_proto_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ill_t *ill, break; } - if (connp == NULL || connp->conn_upq == NULL) { + if (connp == NULL) { /* * No one bound to this port. Is * there a client that wants all @@ -3184,6 +3202,8 @@ ip_fanout_proto_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ill_t *ill, return; } + ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL); + CONN_INC_REF(connp); first_connp = connp; @@ -3217,7 +3237,7 @@ ip_fanout_proto_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ill_t *ill, * needed just for verifying policy and it is never * sent up. */ - if (connp == NULL || connp->conn_upq == NULL || + if (connp == NULL || (((first_mp1 = dupmsg(first_mp)) == NULL) && ((first_mp1 = ip_copymsg(first_mp)) == NULL))) { /* @@ -3227,6 +3247,7 @@ ip_fanout_proto_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ill_t *ill, connp = first_connp; break; } + ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL); mp1 = mctl_present ? 
first_mp1->b_cont : first_mp1; CONN_INC_REF(connp); mutex_exit(&connfp->connf_lock); @@ -3243,7 +3264,9 @@ ip_fanout_proto_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ill_t *ill, } if (mp1 == NULL) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - } else if (!canputnext(rq)) { + } else if ( + (IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) || + (!IPCL_IS_NONSTR(connp) && !canputnext(rq))) { if (flags & IP_FF_RAWIP) { BUMP_MIB(ill->ill_ip_mib, rawipIfStatsInOverflows); @@ -3320,7 +3343,9 @@ ip_fanout_proto_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ill_t *ill, } rq = connp->conn_rq; - if (!canputnext(rq)) { + if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) || + (!IPCL_IS_NONSTR(connp) && !canputnext(rq))) { + if (flags & IP_FF_RAWIP) { BUMP_MIB(ill->ill_ip_mib, rawipIfStatsInOverflows); } else { @@ -3740,7 +3765,8 @@ ip_fanout_udp_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, uint32_t ports, CONN_INC_REF(connp); mutex_exit(&connfp->connf_lock); - if (CONN_UDP_FLOWCTLD(connp)) { + if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) || + (!IPCL_IS_NONSTR(connp) && CONN_UDP_FLOWCTLD(connp))) { freemsg(first_mp); CONN_DEC_REF(connp); return; @@ -3870,7 +3896,8 @@ ip_fanout_udp_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, uint32_t ports, BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); goto next_one; } - if (CONN_UDP_FLOWCTLD(connp)) { + if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) || + (!IPCL_IS_NONSTR(connp) && CONN_UDP_FLOWCTLD(connp))) { BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows); freemsg(first_mp1); goto next_one; @@ -3938,7 +3965,8 @@ next_one: first_mp = mp; } } - if (CONN_UDP_FLOWCTLD(connp)) { + if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) || + (!IPCL_IS_NONSTR(connp) && CONN_UDP_FLOWCTLD(connp))) { BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows); freemsg(mp); } else { @@ -8397,7 +8425,8 @@ udp_fanout: return; } - if (CONN_UDP_FLOWCTLD(connp)) { + if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) || + (!IPCL_IS_NONSTR(connp) && 
CONN_UDP_FLOWCTLD(connp))) { freemsg(first_mp); BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows); CONN_DEC_REF(connp); @@ -9069,7 +9098,7 @@ done: * * case 1 : Routing header was processed by this node and * ip_process_rthdr replaced ip6_dst with the next hop - * and we are forwarding the packet to the next hop. + * and we are forwarding the packet to the next hop. * * case 2 : Routing header was not processed by this node and we * are just forwarding the packet. diff --git a/usr/src/uts/common/inet/ip/ip6_if.c b/usr/src/uts/common/inet/ip/ip6_if.c index dc703f40c3..81447c2e30 100644 --- a/usr/src/uts/common/inet/ip/ip6_if.c +++ b/usr/src/uts/common/inet/ip/ip6_if.c @@ -284,6 +284,44 @@ repeat: goto repeat; } +boolean_t +ip_addr_exists_v6(const in6_addr_t *addr, zoneid_t zoneid, + ip_stack_t *ipst) +{ + ipif_t *ipif; + ill_t *ill; + ill_walk_context_t ctx; + + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + + ill = ILL_START_WALK_V6(&ctx, ipst); + for (; ill != NULL; ill = ill_next(&ctx, ill)) { + mutex_enter(&ill->ill_lock); + for (ipif = ill->ill_ipif; ipif != NULL; + ipif = ipif->ipif_next) { + if (zoneid != ALL_ZONES && + ipif->ipif_zoneid != zoneid && + ipif->ipif_zoneid != ALL_ZONES) + continue; + /* Allow the ipif to be down */ + if (((IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, + addr) && + (ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || + ((ipif->ipif_flags & IPIF_POINTOPOINT) && + IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr, + addr))) { + mutex_exit(&ill->ill_lock); + rw_exit(&ipst->ips_ill_g_lock); + return (B_TRUE); + } + } + mutex_exit(&ill->ill_lock); + } + + rw_exit(&ipst->ips_ill_g_lock); + return (B_FALSE); +} + /* * Look for an ipif with the specified address. 
For point-point links * we look for matches on either the destination address and the local @@ -2237,7 +2275,6 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst, dstinfo.dst_scope = ip_addr_scope_v6(dst); dstinfo.dst_label = ip6_asp_lookup(dst, NULL, ipst); dstinfo.dst_prefer_src_tmp = ((src_prefs & IPV6_PREFER_SRC_TMP) != 0); - rw_enter(&ipst->ips_ill_g_lock, RW_READER); /* * Section three of the I-D states that for multicast and diff --git a/usr/src/uts/common/inet/ip/ip_helper_stream.c b/usr/src/uts/common/inet/ip/ip_helper_stream.c new file mode 100644 index 0000000000..7da64667d1 --- /dev/null +++ b/usr/src/uts/common/inet/ip/ip_helper_stream.c @@ -0,0 +1,482 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include <sys/types.h> +#include <inet/ip.h> +#include <inet/ip_impl.h> +#include <inet/ipclassifier.h> +#include <inet/proto_set.h> +#include <sys/stream.h> +#include <sys/strsubr.h> +#include <sys/strsun.h> +#include <sys/cmn_err.h> +#include <sys/t_kuser.h> +#include <sys/tihdr.h> +#include <sys/pathname.h> +#include <sys/sockio.h> +#include <sys/vmem.h> +#include <sys/disp.h> + +void ip_helper_wput(queue_t *q, mblk_t *mp); + +static int ip_helper_stream_close(queue_t *, int); + +static struct module_info ip_helper_stream_info = { + 0, "iphelper", IP_MOD_MINPSZ, IP_MOD_MAXPSZ, IP_MOD_HIWAT, IP_MOD_LOWAT +}; + +static struct qinit ip_helper_stream_rinit = { + NULL, NULL, NULL, ip_helper_stream_close, NULL, + &ip_helper_stream_info, NULL +}; + +static struct qinit ip_helper_stream_winit = { + (pfi_t)ip_helper_wput, (pfi_t)ip_wsrv, NULL, NULL, NULL, + &ip_helper_stream_info, NULL, NULL, NULL, STRUIOT_NONE +}; + +#define IP_USE_HELPER_CACHE (ip_helper_stream_cache != NULL) + +/* + * set the q_ptr of the 'q' to the conn_t pointer passed in + */ +static void +ip_helper_share_conn(queue_t *q, mblk_t *mp) +{ + if (IP_USE_HELPER_CACHE) { + ip_helper_stream_info_t *ip_helper_info; + + ip_helper_info = *((ip_helper_stream_info_t **) + mp->b_cont->b_rptr); + ip_helper_info->ip_helper_stream_minfo = q->q_ptr; + ip_helper_info->ip_helper_stream_rq = RD(q); + ip_helper_info->ip_helper_stream_wq = WR(q); + } else { + conn_t *connp = *((conn_t **)mp->b_cont->b_rptr); + + connp->conn_helper_info->ip_helper_stream_minfo = q->q_ptr; + connp->conn_helper_info->ip_helper_stream_rq = RD(q); + connp->conn_helper_info->ip_helper_stream_wq = WR(q); + WR(q)->q_ptr = RD(q)->q_ptr = (void *)connp; + connp->conn_rq = RD(q); + connp->conn_wq = WR(q); + } + miocack(q, mp, 0, 0); +} + +void +ip_helper_wput(queue_t *q, mblk_t *mp) +{ + struct iocblk *iocp = (struct iocblk *)mp->b_rptr; + if (DB_TYPE(mp) == M_IOCTL && + iocp->ioc_cmd == SIOCSQPTR) { + ip_helper_share_conn(q, mp); + } else 
{ + conn_t *connp = (conn_t *)q->q_ptr; + + if (connp->conn_af_isv6) { + ip_wput_v6(q, mp); + } else { + ip_wput(q, mp); + } + } +} + +/* ARGSUSED */ +int +ip_helper_stream_setup(queue_t *q, dev_t *devp, int flag, int sflag, + cred_t *credp, boolean_t isv6) +{ + major_t maj; + ip_helper_minfo_t *ip_minfop; + + ASSERT((flag & ~(FKLYR)) == IP_HELPER_STR); + + ASSERT(RD(q) == q); + + ip_minfop = kmem_alloc(sizeof (ip_helper_minfo_t), KM_NOSLEEP); + if (ip_minfop == NULL) { + return (ENOMEM); + } + + ip_minfop->ip_minfo_dev = 0; + ip_minfop->ip_minfo_arena = NULL; + + /* + * Clone the device, allocate minor device number + */ + if (ip_minor_arena_la != NULL) + ip_minfop->ip_minfo_dev = inet_minor_alloc(ip_minor_arena_la); + + if (ip_minfop->ip_minfo_dev == 0) { + /* + * numbers in the large arena are exhausted + * Try small arena. + * Or this is a 32 bit system, 32 bit systems do not have + * ip_minor_arena_la + */ + ip_minfop->ip_minfo_dev = inet_minor_alloc(ip_minor_arena_sa); + if (ip_minfop->ip_minfo_dev == 0) { + return (EBUSY); + } + ip_minfop->ip_minfo_arena = ip_minor_arena_sa; + } else { + ip_minfop->ip_minfo_arena = ip_minor_arena_la; + } + + + ASSERT(ip_minfop->ip_minfo_dev != 0); + ASSERT(ip_minfop->ip_minfo_arena != NULL); + + RD(q)->q_ptr = WR(q)->q_ptr = ip_minfop; + + maj = getemajor(*devp); + *devp = makedevice(maj, (ulong_t)(ip_minfop->ip_minfo_dev)); + + q->q_qinfo = &ip_helper_stream_rinit; + WR(q)->q_qinfo = &ip_helper_stream_winit; + qprocson(q); + return (0); +} + +/* ARGSUSED */ +static int +ip_helper_stream_close(queue_t *q, int flag) +{ + ip_helper_minfo_t *ip_minfop; + + qprocsoff(q); + ip_minfop = (q)->q_ptr; + inet_minor_free(ip_minfop->ip_minfo_arena, + ip_minfop->ip_minfo_dev); + kmem_free(ip_minfop, sizeof (ip_helper_minfo_t)); + RD(q)->q_ptr = NULL; + WR(q)->q_ptr = NULL; + return (0); +} + +/* + * Public interface for creating an IP stream with shared conn_t + */ +/* ARGSUSED */ +int +ip_create_helper_stream(conn_t *connp, ldi_ident_t 
li) +{ + int error; + int ret; + + ASSERT(!servicing_interrupt()); + + error = 0; + if (IP_USE_HELPER_CACHE) { + connp->conn_helper_info = (ip_helper_stream_info_t *) + kmem_cache_alloc(ip_helper_stream_cache, KM_SLEEP); + ASSERT(connp->conn_helper_info != NULL); + connp->conn_rq = connp->conn_helper_info->ip_helper_stream_rq; + connp->conn_wq = connp->conn_helper_info->ip_helper_stream_wq; + connp->conn_helper_info->ip_helper_stream_rq->q_ptr = + (void *)connp; + connp->conn_helper_info->ip_helper_stream_wq->q_ptr = + (void *)connp; + } else { + ASSERT(connp->conn_helper_info == NULL); + connp->conn_helper_info = (ip_helper_stream_info_t *) + kmem_alloc(sizeof (ip_helper_stream_info_t), KM_SLEEP); + /* + * open ip device via the layered interface. + * pass in kcred as some threads do not have the + * priviledge to open /dev/ip and the check in + * secpolicy_spec_open() will fail the open + */ + error = ldi_open_by_name(connp->conn_af_isv6 ? + DEV_IP6 : DEV_IP, IP_HELPER_STR, + kcred, &connp->conn_helper_info->ip_helper_stream_handle, + li); + + if (error != 0) { + kmem_free(connp->conn_helper_info, + (sizeof (ip_helper_stream_info_t))); + connp->conn_helper_info = NULL; + return (error); + } + /* + * Share connp with the helper stream + */ + error = ldi_ioctl( + connp->conn_helper_info->ip_helper_stream_handle, + SIOCSQPTR, (intptr_t)connp, FKIOCTL, kcred, &ret); + + if (error != 0) { + /* + * Passing in a zero flag indicates that an error + * occured and stream was not shared + */ + (void) ldi_close( + connp->conn_helper_info->ip_helper_stream_handle, + 0, kcred); + kmem_free(connp->conn_helper_info, + (sizeof (ip_helper_stream_info_t))); + connp->conn_helper_info = NULL; + } + } + return (error); +} + +/* + * Public interface for closing the shared IP stream + */ +/* ARGSUSED */ +void +ip_close_helper_stream(conn_t *connp) +{ + ASSERT(!servicing_interrupt()); + if (IP_USE_HELPER_CACHE) { + ASSERT(connp->conn_helper_info->ip_helper_stream_rq != NULL); + 
ASSERT(connp->conn_helper_info->ip_helper_stream_wq != NULL); + + /* Prevent service procedures from being called */ + disable_svc(connp->conn_helper_info->ip_helper_stream_rq); + + /* Wait until service procedure of each queue is run */ + wait_svc(connp->conn_helper_info->ip_helper_stream_rq); + + /* Cleanup any pending ioctls */ + conn_ioctl_cleanup(connp); + + /* Allow service procedures to be called again */ + enable_svc(connp->conn_helper_info->ip_helper_stream_rq); + + /* Flush the queues */ + flushq(connp->conn_helper_info->ip_helper_stream_rq, FLUSHALL); + flushq(connp->conn_helper_info->ip_helper_stream_wq, FLUSHALL); + + connp->conn_helper_info->ip_helper_stream_rq->q_ptr = NULL; + connp->conn_helper_info->ip_helper_stream_wq->q_ptr = NULL; + + kmem_cache_free(ip_helper_stream_cache, + connp->conn_helper_info); + } else { + ASSERT( + connp->conn_helper_info->ip_helper_stream_handle != NULL); + + connp->conn_helper_info->ip_helper_stream_rq->q_ptr = + connp->conn_helper_info->ip_helper_stream_wq->q_ptr = + connp->conn_helper_info->ip_helper_stream_minfo; + (void) ldi_close( + connp->conn_helper_info->ip_helper_stream_handle, + IP_HELPER_STR, kcred); + kmem_free(connp->conn_helper_info, + sizeof (ip_helper_stream_info_t)); + } + connp->conn_helper_info = NULL; +} + +/* + * create a T_SVR4_OPTMGMT_REQ TPI message and send down the IP stream + */ +static int +ip_send_option_request(conn_t *connp, uint_t optset_context, int level, + int option_name, const void *optval, t_uscalar_t optlen, cred_t *cr) +{ + struct T_optmgmt_req *optmgmt_reqp; + struct opthdr *ohp; + ssize_t size; + mblk_t *mp; + int error; + + size = sizeof (struct T_optmgmt_req) + sizeof (struct opthdr) + optlen; + mp = allocb_cred(size, cr); + if (mp == NULL) + return (ENOMEM); + + mp->b_datap->db_type = M_PROTO; + optmgmt_reqp = (struct T_optmgmt_req *)mp->b_wptr; + + optmgmt_reqp->PRIM_type = T_SVR4_OPTMGMT_REQ; + optmgmt_reqp->MGMT_flags = optset_context; + optmgmt_reqp->OPT_length = 
(t_scalar_t)sizeof (struct opthdr) + optlen; + optmgmt_reqp->OPT_offset = (t_scalar_t)sizeof (struct T_optmgmt_req); + + mp->b_wptr += sizeof (struct T_optmgmt_req); + + ohp = (struct opthdr *)mp->b_wptr; + + ohp->level = level; + ohp->name = option_name; + ohp->len = optlen; + + mp->b_wptr += sizeof (struct opthdr); + + if (optval != NULL) { + bcopy(optval, mp->b_wptr, optlen); + } else { + bzero(mp->b_wptr, optlen); + } + mp->b_wptr += optlen; + + /* + * Send down the primitive + */ + error = ldi_putmsg(connp->conn_helper_info->ip_helper_stream_handle, + mp); + return (error); +} + +/* + * wait/process the response to T_SVR4_OPTMGMT_REQ TPI message + */ +static int +ip_get_option_response(conn_t *connp, uint_t optset_context, void *optval, + t_uscalar_t *optlenp) +{ + union T_primitives *tpr; + int error; + mblk_t *mp; + + mp = NULL; + + ASSERT(optset_context == T_CHECK || optset_context == T_NEGOTIATE); + error = ldi_getmsg(connp->conn_helper_info->ip_helper_stream_handle, + &mp, NULL); + if (error != 0) { + return (error); + } + + if (DB_TYPE(mp) != M_PCPROTO || MBLKL(mp) < sizeof (tpr->type)) { + error = EPROTO; + goto done; + } + + tpr = (union T_primitives *)mp->b_rptr; + + switch (tpr->type) { + case T_OPTMGMT_ACK: + if (MBLKL(mp) < TOPTMGMTACKSZ) + error = EPROTO; + break; + case T_ERROR_ACK: + if (MBLKL(mp) < TERRORACKSZ) { + error = EPROTO; + break; + } + + if (tpr->error_ack.TLI_error == TSYSERR) + error = tpr->error_ack.UNIX_error; + else + error = proto_tlitosyserr(tpr->error_ack.TLI_error); + break; + default: + error = EPROTO; + break; + } + + if ((optset_context == T_CHECK) && (error == 0)) { + struct opthdr *opt_res; + t_uscalar_t len; + t_uscalar_t size; + t_uscalar_t maxlen = *optlenp; + void *option; + struct T_optmgmt_ack *optmgmt_ack; + + optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr; + opt_res = (struct opthdr *) + ((uintptr_t)mp->b_rptr + optmgmt_ack->OPT_offset); + /* + * Check mblk boundary + */ + if (!MBLKIN(mp, 
optmgmt_ack->OPT_offset, + optmgmt_ack->OPT_length)) { + error = EPROTO; + goto done; + } + + /* + * Check alignment + */ + if ((((uintptr_t)opt_res) & (__TPI_ALIGN_SIZE - 1)) != 0) { + error = EPROTO; + goto done; + } + + option = &opt_res[1]; + + /* check to ensure that the option is within bounds */ + if ((((uintptr_t)option + opt_res->len) < (uintptr_t)option) || + !MBLKIN(mp, sizeof (struct opthdr), opt_res->len)) { + error = EPROTO; + goto done; + } + + len = opt_res->len; + size = MIN(len, maxlen); + + /* + * Copy data + */ + bcopy(option, optval, size); + bcopy(&size, optlenp, sizeof (size)); + } + +done: + freemsg(mp); + return (error); +} + +/* + * Public interface to get socketoptions via the ip helper stream. + */ +int +ip_get_options(conn_t *connp, int level, int option_name, void *optval, + t_uscalar_t *optlenp, cred_t *cr) +{ + int error; + + error = ip_send_option_request(connp, T_CHECK, level, option_name, NULL, + *optlenp, cr); + if (error) + return (error); + + return (ip_get_option_response(connp, T_CHECK, optval, optlenp)); +} + +/* + * Public interface to set socket options via the ip helper stream. + */ +int +ip_set_options(conn_t *connp, int level, int option_name, const void *optval, + t_uscalar_t optlen, cred_t *cr) +{ + + int error; + + error = ip_send_option_request(connp, T_NEGOTIATE, level, option_name, + optval, optlen, cr); + if (error) + return (error); + + return (ip_get_option_response(connp, T_NEGOTIATE, (void *)optval, + &optlen)); +} diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c index d767b25a76..0597245499 100644 --- a/usr/src/uts/common/inet/ip/ip_if.c +++ b/usr/src/uts/common/inet/ip/ip_if.c @@ -5845,6 +5845,55 @@ repeat: } /* + * Check if the address exists in the system. + * We don't hold the conn_lock as we will not perform defered ipsqueue + * operation. 
+ */ +boolean_t +ip_addr_exists(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst) +{ + ipif_t *ipif; + ill_t *ill; + ill_walk_context_t ctx; + + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + + ill = ILL_START_WALK_V4(&ctx, ipst); + for (; ill != NULL; ill = ill_next(&ctx, ill)) { + mutex_enter(&ill->ill_lock); + for (ipif = ill->ill_ipif; ipif != NULL; + ipif = ipif->ipif_next) { + if (zoneid != ALL_ZONES && + zoneid != ipif->ipif_zoneid && + ipif->ipif_zoneid != ALL_ZONES) + continue; + /* Allow the ipif to be down */ + /* + * XXX Different from ipif_lookup_addr(), we don't do + * twice lookups. As from bind()'s point of view, we + * may return once we find a match. + */ + if (((ipif->ipif_lcl_addr == addr) && + ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || + ((ipif->ipif_flags & IPIF_POINTOPOINT) && + (ipif->ipif_pp_dst_addr == addr))) { + /* + * Allow bind() to be successful even if the + * ipif is with IPIF_CHANGING bit set. + */ + mutex_exit(&ill->ill_lock); + rw_exit(&ipst->ips_ill_g_lock); + return (B_TRUE); + } + } + mutex_exit(&ill->ill_lock); + } + + rw_exit(&ipst->ips_ill_g_lock); + return (B_FALSE); +} + +/* * Look for an ipif with the specified address. For point-point links * we look for matches on either the destination address and the local * address, but we ignore the check on the local address if IPIF_UNNUMBERED @@ -22145,7 +22194,6 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, usesrc_ill = ill_lookup_on_ifindex(ifindex, isv6, q, mp, ip_process_ioctl, &err, ipst); - if (usesrc_ill == NULL) { return (err); } diff --git a/usr/src/uts/common/inet/ip/ip_opt_data.c b/usr/src/uts/common/inet/ip/ip_opt_data.c index 3df66ece60..bb6e98a99e 100644 --- a/usr/src/uts/common/inet/ip/ip_opt_data.c +++ b/usr/src/uts/common/inet/ip/ip_opt_data.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 
* Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/stream.h> #define _SUN_TPI_VERSION 2 @@ -45,7 +43,7 @@ extern int ip_opt_default(queue_t *q, int level, int name, uchar_t *ptr); extern int ip_opt_get(queue_t *q, int level, int name, uchar_t *ptr); extern int ip_opt_set(queue_t *q, uint_t optset_context, int level, int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, - void *, cred_t *cr, mblk_t *); + void *dummy, cred_t *cr, mblk_t *first_mp); /* * Table of all known options handled on a IP protocol stack. @@ -71,9 +69,11 @@ opdes_t ip_opt_arr[] = { { IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, - (OP_VARLEN|OP_NODEFAULT), 40, -1 /* not initialized */ }, + (OP_VARLEN|OP_NODEFAULT), + IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, { T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, - (OP_VARLEN|OP_NODEFAULT), 40, -1 /* not initialized */ }, + (OP_VARLEN|OP_NODEFAULT), + IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, { IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, diff --git a/usr/src/uts/common/inet/ip/ip_rts.c b/usr/src/uts/common/inet/ip/ip_rts.c index e232f6c04e..3324d1d833 100644 --- a/usr/src/uts/common/inet/ip/ip_rts.c +++ b/usr/src/uts/common/inet/ip/ip_rts.c @@ -1,5 +1,5 @@ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -38,8 +38,6 @@ * @(#)rtsock.c 8.6 (Berkeley) 2/11/95 */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * This file contains routines that processes routing socket requests. 
*/ @@ -104,10 +102,9 @@ static void ip_rts_request_retry(ipsq_t *, queue_t *q, mblk_t *mp, void *); * */ void -rts_queue_input(mblk_t *mp, queue_t *q, sa_family_t af, ip_stack_t *ipst) +rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, ip_stack_t *ipst) { mblk_t *mp1; - int checkqfull; conn_t *connp, *next_connp; mutex_enter(&ipst->ips_rts_clients->connf_lock); @@ -130,24 +127,16 @@ rts_queue_input(mblk_t *mp, queue_t *q, sa_family_t af, ip_stack_t *ipst) * socket, we check if there is room upstream for a copy of the * message. */ - if ((q != NULL) && (CONNP_TO_RQ(connp) == RD(q))) { - if (connp->conn_loopback == 0) { + if ((o_connp == connp) && connp->conn_loopback == 0) { connp = connp->conn_next; continue; - } - /* - * Just because it is the same queue doesn't mean it - * will promptly read its acks. Have to avoid using - * all of kernel memory. - */ - checkqfull = B_TRUE; - } else { - checkqfull = B_TRUE; } CONN_INC_REF(connp); mutex_exit(&ipst->ips_rts_clients->connf_lock); /* Pass to rts_input */ - if (!checkqfull || canputnext(CONNP_TO_RQ(connp))) { + if ((IPCL_IS_NONSTR(connp) && !PROTO_FLOW_CNTRLD(connp))|| + (!IPCL_IS_NONSTR(connp) && + canputnext(CONNP_TO_RQ(connp)))) { mp1 = dupmsg(mp); if (mp1 == NULL) mp1 = copymsg(mp); @@ -273,7 +262,7 @@ ip_rts_unregister(conn_t *connp) * conn close occurs in conn_ioctl_cleanup. 
*/ int -ip_rts_request(queue_t *q, mblk_t *mp, cred_t *ioc_cr) +ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *connp, cred_t *ioc_cr) { rt_msghdr_t *rtm = NULL; in6_addr_t dst_addr_v6; @@ -298,7 +287,6 @@ ip_rts_request(queue_t *q, mblk_t *mp, cred_t *ioc_cr) ipif_t *ipif = NULL; ipif_t *tmp_ipif = NULL; IOCP iocp = (IOCP)mp->b_rptr; - conn_t *connp; boolean_t gcgrp_xtraref = B_FALSE; tsol_gcgrp_addr_t ga; tsol_rtsecattr_t rtsecattr; @@ -311,8 +299,6 @@ ip_rts_request(queue_t *q, mblk_t *mp, cred_t *ioc_cr) ip1dbg(("ip_rts_request: mp is %x\n", DB_TYPE(mp))); - ASSERT(CONN_Q(q)); - connp = Q_TO_CONN(q); zoneid = connp->conn_zoneid; ipst = connp->conn_netstack->netstack_ip; @@ -564,7 +550,7 @@ ip_rts_request(queue_t *q, mblk_t *mp, cred_t *ioc_cr) error = ip_rt_add(dst_addr, net_mask, gw_addr, src_addr, rtm->rtm_flags, ipif, &ire, B_FALSE, - CONNP_TO_WQ(connp), ioc_mp, ip_rts_request_retry, + WR(q), ioc_mp, ip_rts_request_retry, rtsap, ipst); if (ipif != NULL) ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); @@ -602,7 +588,7 @@ ip_rts_request(queue_t *q, mblk_t *mp, cred_t *ioc_cr) error = ip_rt_add_v6(&dst_addr_v6, &net_mask_v6, &gw_addr_v6, &src_addr_v6, rtm->rtm_flags, - ipif, &ire, CONNP_TO_WQ(connp), ioc_mp, + ipif, &ire, WR(q), ioc_mp, ip_rts_request_retry, rtsap, ipst); break; } @@ -616,7 +602,7 @@ ip_rts_request(queue_t *q, mblk_t *mp, cred_t *ioc_cr) } error = ip_rt_add_v6(&dst_addr_v6, &net_mask_v6, &gw_addr_v6, NULL, rtm->rtm_flags, - ipif, &ire, CONNP_TO_WQ(connp), ioc_mp, + ipif, &ire, WR(q), ioc_mp, ip_rts_request_retry, rtsap, ipst); if (ipif != NULL) ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); @@ -646,14 +632,12 @@ ip_rts_request(queue_t *q, mblk_t *mp, cred_t *ioc_cr) case AF_INET: error = ip_rt_delete(dst_addr, net_mask, gw_addr, found_addrs, rtm->rtm_flags, ipif, B_FALSE, - CONNP_TO_WQ(connp), ioc_mp, ip_rts_request_retry, - ipst); + WR(q), ioc_mp, ip_rts_request_retry, ipst); break; case AF_INET6: error = ip_rt_delete_v6(&dst_addr_v6, 
&net_mask_v6, &gw_addr_v6, found_addrs, rtm->rtm_flags, ipif, - CONNP_TO_WQ(connp), ioc_mp, ip_rts_request_retry, - ipst); + WR(q), ioc_mp, ip_rts_request_retry, ipst); break; } break; @@ -867,7 +851,7 @@ ip_rts_request(queue_t *q, mblk_t *mp, cred_t *ioc_cr) */ tmp_ipif = ipif_lookup_addr( src_addr, NULL, ALL_ZONES, - CONNP_TO_WQ(connp), ioc_mp, + WR(q), ioc_mp, ip_rts_request_retry, &error, ipst); if (tmp_ipif == NULL) { @@ -1053,19 +1037,27 @@ done: /* OK ACK already set up by caller except this */ ip2dbg(("ip_rts_request: OK ACK\n")); } - rts_queue_input(mp, q, af, ipst); + rts_queue_input(mp, connp, af, ipst); } + iocp->ioc_error = error; ioc_mp->b_datap->db_type = M_IOCACK; if (iocp->ioc_error != 0) iocp->ioc_count = 0; (connp->conn_recv)(connp, ioc_mp, NULL); + /* conn was refheld in ip_wput_ioctl. */ CONN_OPER_PENDING_DONE(connp); return (error); } +int +ip_rts_request(queue_t *q, mblk_t *mp, cred_t *ioc_cr) +{ + return (ip_rts_request_common(q, mp, Q_TO_CONN(q), ioc_cr)); +} + /* * Build a reply to the RTM_GET request contained in the given message block * using the retrieved IRE of the destination address, the parent IRE (if it diff --git a/usr/src/uts/common/inet/ip/ipclassifier.c b/usr/src/uts/common/inet/ip/ipclassifier.c index a19e729b41..50bd38c981 100644 --- a/usr/src/uts/common/inet/ip/ipclassifier.c +++ b/usr/src/uts/common/inet/ip/ipclassifier.c @@ -261,8 +261,8 @@ #include <inet/ip.h> #include <inet/ip6.h> -#include <inet/tcp.h> #include <inet/ip_ndp.h> +#include <inet/ip_impl.h> #include <inet/udp_impl.h> #include <inet/sctp_ip.h> #include <inet/sctp/sctp_impl.h> @@ -272,9 +272,11 @@ #include <sys/cpuvar.h> #include <inet/ipclassifier.h> +#include <inet/tcp.h> #include <inet/ipsec_impl.h> #include <sys/tsol/tnet.h> +#include <sys/sockio.h> #ifdef DEBUG #define IPCL_DEBUG @@ -325,6 +327,7 @@ typedef union itc_s { struct kmem_cache *tcp_conn_cache; struct kmem_cache *ip_conn_cache; +struct kmem_cache *ip_helper_stream_cache; extern struct 
kmem_cache *sctp_conn_cache; extern struct kmem_cache *tcp_sack_info_cache; extern struct kmem_cache *tcp_iphc_cache; @@ -350,6 +353,11 @@ static void rawip_conn_destructor(void *, void *); static int rts_conn_constructor(void *, void *, int); static void rts_conn_destructor(void *, void *); +static int ip_helper_stream_constructor(void *, void *, int); +static void ip_helper_stream_destructor(void *, void *); + +boolean_t ip_use_helper_cache = B_TRUE; + #ifdef IPCL_DEBUG #define INET_NTOA_BUFSIZE 18 @@ -394,6 +402,15 @@ ipcl_g_init(void) sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE, rts_conn_constructor, rts_conn_destructor, NULL, NULL, NULL, 0); + + if (ip_use_helper_cache) { + ip_helper_stream_cache = kmem_cache_create + ("ip_helper_stream_cache", sizeof (ip_helper_stream_info_t), + CACHE_ALIGN_SIZE, ip_helper_stream_constructor, + ip_helper_stream_destructor, NULL, NULL, NULL, 0); + } else { + ip_helper_stream_cache = NULL; + } } /* @@ -749,6 +766,7 @@ ipcl_conn_destroy(conn_t *connp) connp->conn_netstack = NULL; netstack_rele(ns); } + ipcl_conn_cleanup(connp); /* leave conn_priv aka conn_udp, conn_icmp, etc in place. 
*/ @@ -756,6 +774,7 @@ ipcl_conn_destroy(conn_t *connp) connp->conn_flags = IPCL_UDPCONN; kmem_cache_free(udp_conn_cache, connp); } else if (connp->conn_flags & IPCL_RAWIPCONN) { + connp->conn_flags = IPCL_RAWIPCONN; connp->conn_ulp = IPPROTO_ICMP; kmem_cache_free(rawip_conn_cache, connp); @@ -2025,6 +2044,7 @@ tcp_conn_constructor(void *buf, void *cdrarg, int kmflags) mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); + cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL); tcp->tcp_timercache = tcp_timermp_alloc(KM_NOSLEEP); connp->conn_tcp = tcp; connp->conn_flags = IPCL_TCPCONN; @@ -2047,6 +2067,7 @@ tcp_conn_destructor(void *buf, void *cdrarg) tcp_timermp_free(tcp); mutex_destroy(&connp->conn_lock); cv_destroy(&connp->conn_cv); + cv_destroy(&connp->conn_sq_cv); } /* ARGSUSED */ @@ -2181,15 +2202,56 @@ rts_conn_destructor(void *buf, void *cdrarg) cv_destroy(&connp->conn_cv); } +/* ARGSUSED */ +int +ip_helper_stream_constructor(void *buf, void *cdrarg, int kmflags) +{ + int error; + netstack_t *ns; + int ret; + tcp_stack_t *tcps; + ip_helper_stream_info_t *ip_helper_str; + ip_stack_t *ipst; + + ns = netstack_find_by_cred(kcred); + ASSERT(ns != NULL); + tcps = ns->netstack_tcp; + ipst = ns->netstack_ip; + ASSERT(tcps != NULL); + ip_helper_str = (ip_helper_stream_info_t *)buf; + + error = ldi_open_by_name(DEV_IP, IP_HELPER_STR, kcred, + &ip_helper_str->ip_helper_stream_handle, ipst->ips_ldi_ident); + if (error != 0) { + goto done; + } + error = ldi_ioctl(ip_helper_str->ip_helper_stream_handle, + SIOCSQPTR, (intptr_t)buf, FKIOCTL, kcred, &ret); + if (error != 0) { + (void) ldi_close(ip_helper_str->ip_helper_stream_handle, 0, + kcred); + } +done: + netstack_rele(ipst->ips_netstack); + return (error); +} + +/* ARGSUSED */ +static void +ip_helper_stream_destructor(void *buf, void *cdrarg) +{ + ip_helper_stream_info_t *ip_helper_str = (ip_helper_stream_info_t *)buf; + + ip_helper_str->ip_helper_stream_rq->q_ptr = + 
ip_helper_str->ip_helper_stream_wq->q_ptr = + ip_helper_str->ip_helper_stream_minfo; + (void) ldi_close(ip_helper_str->ip_helper_stream_handle, 0, kcred); +} + + /* * Called as part of ipcl_conn_destroy to assert and clear any pointers * in the conn_t. - * - * Below we list all the pointers in the conn_t as a documentation aid. - * The ones that we can not ASSERT to be NULL are #ifdef'ed out. - * If you add any pointers to the conn_t please add an ASSERT here - * and #ifdef it out if it can't be actually asserted to be NULL. - * In any case, we bzero most of the conn_t at the end of the function. */ void ipcl_conn_cleanup(conn_t *connp) @@ -2197,7 +2259,6 @@ ipcl_conn_cleanup(conn_t *connp) ASSERT(connp->conn_ire_cache == NULL); ASSERT(connp->conn_latch == NULL); #ifdef notdef - /* These are not cleared */ ASSERT(connp->conn_rq == NULL); ASSERT(connp->conn_wq == NULL); #endif @@ -2236,11 +2297,11 @@ ipcl_conn_cleanup(conn_t *connp) ASSERT(connp->conn_peercred == NULL); ASSERT(connp->conn_netstack == NULL); + ASSERT(connp->conn_helper_info == NULL); /* Clear out the conn_t fields that are not preserved */ bzero(&connp->conn_start_clr, sizeof (conn_t) - ((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp)); - } /* diff --git a/usr/src/uts/common/inet/ip/keysock.c b/usr/src/uts/common/inet/ip/keysock.c index c982fb4c45..af0fd73d63 100644 --- a/usr/src/uts/common/inet/ip/keysock.c +++ b/usr/src/uts/common/inet/ip/keysock.c @@ -59,7 +59,7 @@ #include <inet/common.h> #include <netinet/ip6.h> #include <inet/ip.h> -#include <inet/mi.h> +#include <inet/proto_set.h> #include <inet/nd.h> #include <inet/optcom.h> #include <inet/ipsec_info.h> @@ -707,7 +707,8 @@ keysock_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) mutex_exit(&keystack->keystack_list_lock); qprocson(q); - (void) mi_set_sth_hiwat(q, keystack->keystack_recv_hiwat); + (void) proto_set_rx_hiwat(q, NULL, + keystack->keystack_recv_hiwat); /* * Wait outside the keysock module perimeter for IPsec 
* plumbing to be completed. If it fails, keysock_close() @@ -875,7 +876,7 @@ keysock_opt_set(queue_t *q, uint_t mgmt_flags, int level, if (*i1 > keystack->keystack_max_buf) return (ENOBUFS); RD(q)->q_hiwat = *i1; - (void) mi_set_sth_hiwat(RD(q), *i1); + (void) proto_set_rx_hiwat(RD(q), NULL, *i1); break; } mutex_exit(&ks->keysock_lock); diff --git a/usr/src/uts/common/inet/ip/rts.c b/usr/src/uts/common/inet/ip/rts.c index 350a5fa887..7965d37483 100644 --- a/usr/src/uts/common/inet/ip/rts.c +++ b/usr/src/uts/common/inet/ip/rts.c @@ -23,8 +23,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/stream.h> #include <sys/strsubr.h> @@ -41,15 +39,17 @@ #include <sys/suntpi.h> #include <sys/policy.h> #include <sys/zone.h> +#include <sys/disp.h> #include <sys/socket.h> +#include <sys/socketvar.h> #include <netinet/in.h> #include <inet/common.h> #include <netinet/ip6.h> #include <inet/ip.h> #include <inet/ipclassifier.h> -#include <inet/mi.h> +#include <inet/proto_set.h> #include <inet/nd.h> #include <inet/optcom.h> #include <netinet/ip_mroute.h> @@ -111,20 +111,10 @@ static rtsparam_t lcl_param_arr[] = { #define rtss_recv_hiwat rtss_params[2].rts_param_value #define rtss_max_buf rtss_params[3].rts_param_value -static int rts_close(queue_t *q); static void rts_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error); static void rts_input(void *, mblk_t *, void *); static mblk_t *rts_ioctl_alloc(mblk_t *data, cred_t *cr); -static int rts_open(queue_t *q, dev_t *devp, int flag, int sflag, - cred_t *credp); -int rts_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, - uchar_t *ptr); -int rts_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, - uchar_t *ptr); -int rts_opt_set(queue_t *q, uint_t optset_context, int level, - int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, - uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk); static int rts_param_get(queue_t *q, mblk_t 
*mp, caddr_t cp, cred_t *cr); static boolean_t rts_param_register(IDP *ndp, rtsparam_t *rtspa, int cnt); static int rts_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, @@ -137,12 +127,21 @@ static void rts_wput_iocdata(queue_t *q, mblk_t *mp); static void rts_wput_other(queue_t *q, mblk_t *mp); static int rts_wrw(queue_t *q, struiod_t *dp); +static int rts_stream_open(queue_t *q, dev_t *devp, int flag, int sflag, + cred_t *credp); +static conn_t *rts_open(int flag, cred_t *credp); + +static int rts_stream_close(queue_t *q); +static int rts_close(sock_lower_handle_t proto_handle, int flags, + cred_t *cr); + static struct module_info rts_mod_info = { 129, "rts", 1, INFPSZ, 512, 128 }; static struct qinit rtsrinit = { - NULL, (pfi_t)rts_rsrv, rts_open, rts_close, NULL, &rts_mod_info + NULL, (pfi_t)rts_rsrv, rts_stream_open, rts_stream_close, NULL, + &rts_mod_info }; static struct qinit rtswinit = { @@ -201,9 +200,8 @@ rts_ioctl_alloc(mblk_t *data, cred_t *cr) * internal datastructure. */ static int -rts_close(queue_t *q) +rts_common_close(queue_t *q, conn_t *connp) { - conn_t *connp = Q_TO_CONN(q); ASSERT(connp != NULL && IPCL_IS_RTS(connp)); @@ -211,25 +209,39 @@ rts_close(queue_t *q) ip_quiesce_conn(connp); - qprocsoff(q); + if (!IPCL_IS_NONSTR(connp)) { + qprocsoff(q); - /* - * Now we are truly single threaded on this stream, and can - * delete the things hanging off the connp, and finally the connp. - * We removed this connp from the fanout list, it cannot be - * accessed thru the fanouts, and we already waited for the - * conn_ref to drop to 0. We are already in close, so - * there cannot be any other thread from the top. qprocsoff - * has completed, and service has completed or won't run in - * future. - */ + /* + * Now we are truly single threaded on this stream, and can + * delete the things hanging off the connp, and finally the + * connp. 
+ * We removed this connp from the fanout list, it cannot be + * accessed thru the fanouts, and we already waited for the + * conn_ref to drop to 0. We are already in close, so + * there cannot be any other thread from the top. qprocsoff + * has completed, and service has completed or won't run in + * future. + */ + inet_minor_free(connp->conn_minor_arena, connp->conn_dev); + } else { + ip_close_helper_stream(connp); + } ASSERT(connp->conn_ref == 1); - inet_minor_free(connp->conn_minor_arena, connp->conn_dev); connp->conn_ref--; ipcl_conn_destroy(connp); + return (0); +} + +static int +rts_stream_close(queue_t *q) +{ + conn_t *connp = Q_TO_CONN(q); + + (void) rts_common_close(q, connp); q->q_ptr = WR(q)->q_ptr = NULL; return (0); } @@ -240,14 +252,12 @@ rts_close(queue_t *q) */ /* ARGSUSED */ static int -rts_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) +rts_stream_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) { - rts_t *rts; conn_t *connp; dev_t conn_dev; - zoneid_t zoneid; - netstack_t *ns; rts_stack_t *rtss; + rts_t *rts; /* If the stream is already open, return immediately. */ if (q->q_ptr != NULL) @@ -256,40 +266,26 @@ rts_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) if (sflag == MODOPEN) return (EINVAL); - ns = netstack_find_by_cred(credp); - ASSERT(ns != NULL); - rtss = ns->netstack_rts; - ASSERT(rtss != NULL); - - /* - * For exclusive stacks we set the zoneid to zero - * to make RTS operate as if in the global zone. - */ - if (ns->netstack_stackid != GLOBAL_NETSTACKID) - zoneid = GLOBAL_ZONEID; - else - zoneid = crgetzoneid(credp); /* * Since RTS is not used so heavily, allocating from the small * arena should be sufficient. 
*/ if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) { - netstack_rele(ns); return (EBUSY); } + + connp = rts_open(flag, credp); + ASSERT(connp != NULL); + + *devp = makedevice(getemajor(*devp), (minor_t)conn_dev); - connp = ipcl_conn_create(IPCL_RTSCONN, KM_SLEEP, ns); - connp->conn_dev = conn_dev; - connp->conn_minor_arena = ip_minor_arena_sa; rts = connp->conn_rts; - /* - * ipcl_conn_create did a netstack_hold. Undo the hold that was - * done by netstack_find_by_cred() - */ - netstack_rele(ns); + rw_enter(&rts->rts_rwlock, RW_WRITER); + connp->conn_dev = conn_dev; + connp->conn_minor_arena = ip_minor_arena_sa; /* * Initialize the rts_t structure for this stream. @@ -299,25 +295,12 @@ rts_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) connp->conn_rq = q; connp->conn_wq = WR(q); - rw_enter(&rts->rts_rwlock, RW_WRITER); - ASSERT(connp->conn_rts == rts); - ASSERT(rts->rts_connp == connp); - - /* Set the initial state of the stream and the privilege status. */ - rts->rts_state = TS_UNBND; - connp->conn_zoneid = zoneid; - - connp->conn_ulp_labeled = is_system_labeled(); - - rts->rts_rtss = rtss; - + rtss = rts->rts_rtss; q->q_hiwat = rtss->rtss_recv_hiwat; WR(q)->q_hiwat = rtss->rtss_xmit_hiwat; WR(q)->q_lowat = rtss->rtss_xmit_lowat; - connp->conn_recv = rts_input; - crhold(credp); - connp->conn_cred = credp; + mutex_enter(&connp->conn_lock); connp->conn_state_flags &= ~CONN_INCIPIENT; @@ -325,7 +308,6 @@ rts_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) qprocson(q); rw_exit(&rts->rts_rwlock); - /* * Indicate the down IP module that this is a routing socket * client by sending an RTS IOCTL without any user data. 
Although @@ -335,7 +317,67 @@ rts_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) ip_rts_register(connp); return (0); +} + +/* ARGSUSED */ +static conn_t * +rts_open(int flag, cred_t *credp) +{ + netstack_t *ns; + rts_stack_t *rtss; + rts_t *rts; + conn_t *connp; + zoneid_t zoneid; + + ns = netstack_find_by_cred(credp); + ASSERT(ns != NULL); + rtss = ns->netstack_rts; + ASSERT(rtss != NULL); + + /* + * For exclusive stacks we set the zoneid to zero + * to make RTS operate as if in the global zone. + */ + if (ns->netstack_stackid != GLOBAL_NETSTACKID) + zoneid = GLOBAL_ZONEID; + else + zoneid = crgetzoneid(credp); + + connp = ipcl_conn_create(IPCL_RTSCONN, KM_SLEEP, ns); + rts = connp->conn_rts; + + /* + * ipcl_conn_create did a netstack_hold. Undo the hold that was + * done by netstack_find_by_cred() + */ + netstack_rele(ns); + + + rw_enter(&rts->rts_rwlock, RW_WRITER); + ASSERT(connp->conn_rts == rts); + ASSERT(rts->rts_connp == connp); + + connp->conn_zoneid = zoneid; + connp->conn_flow_cntrld = B_FALSE; + connp->conn_ulp_labeled = is_system_labeled(); + + rts->rts_rtss = rtss; + rts->rts_xmit_hiwat = rtss->rtss_xmit_hiwat; + + connp->conn_recv = rts_input; + crhold(credp); + connp->conn_cred = credp; + + /* + * rts sockets start out as bound and connected + * For streams based sockets, socket state is set to + * SS_ISBOUND | SS_ISCONNECTED in so_strinit. + */ + rts->rts_state = TS_DATA_XFER; + rw_exit(&rts->rts_rwlock); + + return (connp); } /* @@ -362,7 +404,7 @@ rts_ok_ack(queue_t *q, mblk_t *mp) * This routine is called by rts_wput to handle T_UNBIND_REQ messages. */ static void -rts_unbind(queue_t *q, mblk_t *mp) +rts_tpi_unbind(queue_t *q, mblk_t *mp) { conn_t *connp = Q_TO_CONN(q); rts_t *rts = connp->conn_rts; @@ -383,7 +425,7 @@ rts_unbind(queue_t *q, mblk_t *mp) * O_T_BIND_REQ and T_BIND_REQ semantics. 
*/ static void -rts_bind(queue_t *q, mblk_t *mp) +rts_tpi_bind(queue_t *q, mblk_t *mp) { conn_t *connp = Q_TO_CONN(q); rts_t *rts = connp->conn_rts; @@ -392,13 +434,13 @@ rts_bind(queue_t *q, mblk_t *mp) if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, - "rts_bind: bad data, %d", rts->rts_state); + "rts_tpi_bind: bad data, %d", rts->rts_state); rts_err_ack(q, mp, TBADADDR, 0); return; } if (rts->rts_state != TS_UNBND) { (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, - "rts_bind: bad state, %d", rts->rts_state); + "rts_tpi_bind: bad state, %d", rts->rts_state); rts_err_ack(q, mp, TOUTSTATE, 0); return; } @@ -415,7 +457,7 @@ rts_bind(queue_t *q, mblk_t *mp) tbr = (struct T_bind_req *)mp->b_rptr; if (tbr->ADDR_length != 0) { (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, - "rts_bind: bad ADDR_length %d", tbr->ADDR_length); + "rts_tpi_bind: bad ADDR_length %d", tbr->ADDR_length); rts_err_ack(q, mp, TBADADDR, 0); return; } @@ -498,16 +540,14 @@ rts_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) return (-1); } -/* - * This routine retrieves the current status of socket options. - * It returns the size of the option retrieved. - */ -int -rts_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) + +static int +rts_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) { - int *i1 = (int *)ptr; - conn_t *connp = Q_TO_CONN(q); rts_t *rts = connp->conn_rts; + int *i1 = (int *)ptr; + + ASSERT(RW_READ_HELD(&rts->rts_rwlock)); switch (level) { case SOL_SOCKET: @@ -543,12 +583,12 @@ rts_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) * but changing them should do nothing. 
*/ case SO_SNDBUF: - ASSERT(q->q_hiwat <= INT_MAX); - *i1 = (int)(q->q_hiwat); + ASSERT(rts->rts_xmit_hiwat <= INT_MAX); + *i1 = (int)(rts->rts_xmit_hiwat); break; case SO_RCVBUF: - ASSERT(q->q_hiwat <= INT_MAX); - *i1 = (int)(RD(q)->q_hiwat); + ASSERT(rts->rts_recv_hiwat <= INT_MAX); + *i1 = (int)(rts->rts_recv_hiwat); break; case SO_DOMAIN: *i1 = PF_ROUTE; @@ -563,60 +603,17 @@ rts_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) return ((int)sizeof (int)); } - -/* - * This routine sets socket options. - */ -/*ARGSUSED*/ -int -rts_opt_set(queue_t *q, uint_t optset_context, int level, - int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, - uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk) +/* ARGSUSED */ +static int +rts_do_opt_set(conn_t *connp, int level, int name, uint_t inlen, + uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, cred_t *cr, + void *thisdg_attrs, boolean_t checkonly) { int *i1 = (int *)invalp; - conn_t *connp = Q_TO_CONN(q); rts_t *rts = connp->conn_rts; - boolean_t checkonly; rts_stack_t *rtss = rts->rts_rtss; - switch (optset_context) { - case SETFN_OPTCOM_CHECKONLY: - checkonly = B_TRUE; - /* - * Note: Implies T_CHECK semantics for T_OPTCOM_REQ - * inlen != 0 implies value supplied and - * we have to "pretend" to set it. - * inlen == 0 implies that there is no - * value part in T_CHECK request and just validation - * done elsewhere should be enough, we just return here. - */ - if (inlen == 0) { - *outlenp = 0; - return (0); - } - break; - case SETFN_OPTCOM_NEGOTIATE: - checkonly = B_FALSE; - break; - case SETFN_UD_NEGOTIATE: - case SETFN_CONN_NEGOTIATE: - checkonly = B_FALSE; - /* - * Negotiating local and "association-related" options - * through T_UNITDATA_REQ or T_CONN_{REQ,CON} - * Not allowed in this module. 
- */ - return (EINVAL); - default: - /* - * We should never get here - */ - *outlenp = 0; - return (EINVAL); - } - - ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || - (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); + ASSERT(RW_WRITE_HELD(&rts->rts_rwlock)); /* * For rts, we should have no ancillary data sent down @@ -680,7 +677,9 @@ rts_opt_set(queue_t *q, uint_t optset_context, int level, return (ENOBUFS); } if (!checkonly) { - q->q_hiwat = *i1; + rts->rts_xmit_hiwat = *i1; + if (!IPCL_IS_NONSTR(connp)) + connp->conn_wq->q_hiwat = *i1; } break; /* goto sizeof (int) option return */ case SO_RCVBUF: @@ -689,9 +688,13 @@ rts_opt_set(queue_t *q, uint_t optset_context, int level, return (ENOBUFS); } if (!checkonly) { - RD(q)->q_hiwat = *i1; - (void) mi_set_sth_hiwat(RD(q), *i1); + rts->rts_recv_hiwat = *i1; + rw_exit(&rts->rts_rwlock); + (void) proto_set_rx_hiwat(connp->conn_rq, connp, + *i1); + rw_enter(&rts->rts_rwlock, RW_WRITER); } + break; /* goto sizeof (int) option return */ default: *outlenp = 0; @@ -705,11 +708,105 @@ rts_opt_set(queue_t *q, uint_t optset_context, int level, /* * Common case of return from an option that is sizeof (int) */ - *(int *)outvalp = *i1; + if (invalp != outvalp) { + /* don't trust bcopy for identical src/dst */ + (void) bcopy(invalp, outvalp, inlen); + } *outlenp = (t_uscalar_t)sizeof (int); return (0); } +static int +rts_opt_set(conn_t *connp, uint_t optset_context, int level, int name, + uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, + void *thisdg_attrs, cred_t *cr) +{ + boolean_t checkonly = B_FALSE; + + if (optset_context) { + switch (optset_context) { + case SETFN_OPTCOM_CHECKONLY: + checkonly = B_TRUE; + /* + * Note: Implies T_CHECK semantics for T_OPTCOM_REQ + * inlen != 0 implies value supplied and + * we have to "pretend" to set it. + * inlen == 0 implies that there is no value part + * in T_CHECK request and just validation + * done elsewhere should be enough, we just return here. 
+ */ + if (inlen == 0) { + *outlenp = 0; + return (0); + } + break; + case SETFN_OPTCOM_NEGOTIATE: + checkonly = B_FALSE; + break; + case SETFN_UD_NEGOTIATE: + case SETFN_CONN_NEGOTIATE: + checkonly = B_FALSE; + /* + * Negotiating local and "association-related" options + * through T_UNITDATA_REQ or T_CONN_{REQ,CON} + * Not allowed in this module. + */ + return (EINVAL); + default: + /* + * We should never get here + */ + *outlenp = 0; + return (EINVAL); + } + + ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || + (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); + + } + return (rts_do_opt_set(connp, level, name, inlen, invalp, outlenp, + outvalp, cr, thisdg_attrs, checkonly)); + +} + +/* + * This routine retrieves the current status of socket options. + * It returns the size of the option retrieved. + */ +int +rts_tpi_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) +{ + rts_t *rts; + int err; + + rts = Q_TO_RTS(q); + rw_enter(&rts->rts_rwlock, RW_READER); + err = rts_opt_get(Q_TO_CONN(q), level, name, ptr); + rw_exit(&rts->rts_rwlock); + return (err); +} + +/* + * This routine sets socket options. + */ +/*ARGSUSED*/ +int +rts_tpi_opt_set(queue_t *q, uint_t optset_context, int level, + int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, + uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk) +{ + conn_t *connp = Q_TO_CONN(q); + int error; + rts_t *rts = connp->conn_rts; + + + rw_enter(&rts->rts_rwlock, RW_WRITER); + error = rts_opt_set(connp, optset_context, level, name, inlen, invalp, + outlenp, outvalp, thisdg_attrs, cr); + rw_exit(&rts->rts_rwlock); + return (error); +} + /* * This routine retrieves the value of an ND variable in a rtsparam_t * structure. 
It is called through nd_getset when a user reads the @@ -803,7 +900,7 @@ rts_wrw(queue_t *q, struiod_t *dp) rts->rts_error = EINTR; goto err_ret; } - } + } rts->rts_flag |= RTS_WRW_PENDING; if (isuioq(q) && (error = struioget(q, mp, dp, 0))) { @@ -954,10 +1051,10 @@ rts_wput_other(queue_t *q, mblk_t *mp) switch (((union T_primitives *)rptr)->type) { case T_BIND_REQ: case O_T_BIND_REQ: - rts_bind(q, mp); + rts_tpi_bind(q, mp); return; case T_UNBIND_REQ: - rts_unbind(q, mp); + rts_tpi_unbind(q, mp); return; case T_CAPABILITY_REQ: rts_capability_req(q, mp); @@ -985,6 +1082,7 @@ rts_wput_other(queue_t *q, mblk_t *mp) freemsg(mp); (void) putnextctl1(RD(q), M_ERROR, EPROTO); return; + default: break; } @@ -1086,21 +1184,33 @@ rts_input(void *arg1, mblk_t *mp, void *arg2) struct iocblk *iocp; mblk_t *mp1; struct T_data_ind *tdi; + int error; switch (mp->b_datap->db_type) { case M_IOCACK: case M_IOCNAK: iocp = (struct iocblk *)mp->b_rptr; - if (rts->rts_flag & (RTS_WPUT_PENDING)) { - rts->rts_flag &= ~RTS_WPUT_PENDING; + if (IPCL_IS_NONSTR(connp)) { + ASSERT(rts->rts_flag & (RTS_REQ_PENDING)); + mutex_enter(&rts->rts_send_mutex); + rts->rts_flag &= ~RTS_REQ_INPROG; rts->rts_error = iocp->ioc_error; - /* - * Tell rts_wvw/qwait that we are done. - * Note: there is no qwait_wakeup() we can use. - */ - qenable(connp->conn_rq); + cv_signal(&rts->rts_io_cv); + mutex_exit(&rts->rts_send_mutex); freemsg(mp); return; + } else { + if (rts->rts_flag & (RTS_WPUT_PENDING)) { + rts->rts_flag &= ~RTS_WPUT_PENDING; + rts->rts_error = iocp->ioc_error; + /* + * Tell rts_wvw/qwait that we are done. + * Note: there is no qwait_wakeup() we can use. 
+ */ + qenable(connp->conn_rq); + freemsg(mp); + return; + } } break; case M_DATA: @@ -1124,12 +1234,33 @@ rts_input(void *arg1, mblk_t *mp, void *arg2) default: break; } - putnext(connp->conn_rq, mp); + + if (IPCL_IS_NONSTR(connp)) { + if ((*connp->conn_upcalls->su_recv) + (connp->conn_upper_handle, mp, msgdsize(mp), 0, + &error, NULL) < 0) { + ASSERT(error == ENOSPC); + /* + * Let's confirm holding the lock that + * we are out of recv space. + */ + mutex_enter(&rts->rts_recv_mutex); + if ((*connp->conn_upcalls->su_recv) + (connp->conn_upper_handle, NULL, 0, 0, + &error, NULL) < 0) { + ASSERT(error == ENOSPC); + connp->conn_flow_cntrld = B_TRUE; + } + mutex_exit(&rts->rts_recv_mutex); + } + } else { + putnext(connp->conn_rq, mp); + } } void -rts_ddi_init(void) +rts_ddi_g_init(void) { rts_max_optsize = optcom_max_optsize(rts_opt_obj.odb_opt_des_arr, rts_opt_obj.odb_opt_arr_cnt); @@ -1143,11 +1274,13 @@ rts_ddi_init(void) } void -rts_ddi_destroy(void) +rts_ddi_g_destroy(void) { netstack_unregister(NS_RTS); } +#define INET_NAME "ip" + /* * Initialize the RTS stack instance. 
*/ @@ -1157,6 +1290,8 @@ rts_stack_init(netstackid_t stackid, netstack_t *ns) { rts_stack_t *rtss; rtsparam_t *pa; + int error = 0; + major_t major; rtss = (rts_stack_t *)kmem_zalloc(sizeof (*rtss), KM_SLEEP); rtss->rtss_netstack = ns; @@ -1167,6 +1302,10 @@ rts_stack_init(netstackid_t stackid, netstack_t *ns) (void) rts_param_register(&rtss->rtss_g_nd, rtss->rtss_params, A_CNT(lcl_param_arr)); + + major = mod_name_to_major(INET_NAME); + error = ldi_ident_from_major(major, &rtss->rtss_ldi_ident); + ASSERT(error == 0); return (rtss); } @@ -1182,5 +1321,411 @@ rts_stack_fini(netstackid_t stackid, void *arg) nd_free(&rtss->rtss_g_nd); kmem_free(rtss->rtss_params, sizeof (lcl_param_arr)); rtss->rtss_params = NULL; + ldi_ident_release(rtss->rtss_ldi_ident); kmem_free(rtss, sizeof (*rtss)); } + +/* ARGSUSED */ +int +rts_accept(sock_lower_handle_t lproto_handle, + sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle, + cred_t *cr) +{ + return (EINVAL); +} + +/* ARGSUSED */ +static int +rts_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa, + socklen_t len, cred_t *cr) +{ + /* + * rebind not allowed + */ + return (EINVAL); +} + +/* ARGSUSED */ +int +rts_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr) +{ + return (EINVAL); +} + +/* ARGSUSED */ +int +rts_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, + socklen_t len, sock_connid_t *id, cred_t *cr) +{ + /* + * rts sockets start out as bound and connected + */ + *id = 0; + return (EISCONN); +} + +/* ARGSUSED */ +int +rts_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr, + socklen_t *addrlen, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + rts_t *rts = connp->conn_rts; + + ASSERT(rts != NULL); + + bzero(addr, sizeof (struct sockaddr)); + addr->sa_family = AF_ROUTE; + *addrlen = sizeof (struct sockaddr); + + return (0); +} + +/* ARGSUSED */ +int +rts_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr, + socklen_t 
*addrlen, cred_t *cr) +{ + return (EOPNOTSUPP); +} + +static int +rts_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, + void *optvalp, socklen_t *optlen, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + rts_t *rts = connp->conn_rts; + int error; + t_uscalar_t max_optbuf_len; + void *optvalp_buf; + int len; + + error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len, + rts_opt_obj.odb_opt_des_arr, + rts_opt_obj.odb_opt_arr_cnt, + rts_opt_obj.odb_topmost_tpiprovider, + B_FALSE, B_TRUE, cr); + if (error != 0) { + if (error < 0) + error = proto_tlitosyserr(-error); + return (error); + } + + optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP); + rw_enter(&rts->rts_rwlock, RW_READER); + len = rts_opt_get(connp, level, option_name, optvalp_buf); + rw_exit(&rts->rts_rwlock); + + if (len < 0) { + /* + * Pass on to IP + */ + error = ip_get_options(connp, level, option_name, + optvalp, optlen, cr); + } else { + /* + * update optlen and copy option value + */ + t_uscalar_t size = MIN(len, *optlen); + bcopy(optvalp_buf, optvalp, size); + bcopy(&size, optlen, sizeof (size)); + error = 0; + } + + kmem_free(optvalp_buf, max_optbuf_len); + return (error); +} + +static int +rts_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, + const void *optvalp, socklen_t optlen, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + rts_t *rts = connp->conn_rts; + int error; + + error = proto_opt_check(level, option_name, optlen, NULL, + rts_opt_obj.odb_opt_des_arr, + rts_opt_obj.odb_opt_arr_cnt, + rts_opt_obj.odb_topmost_tpiprovider, + B_TRUE, B_FALSE, cr); + + if (error != 0) { + if (error < 0) + error = proto_tlitosyserr(-error); + return (error); + } + + rw_enter(&rts->rts_rwlock, RW_WRITER); + error = rts_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name, + optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp, + NULL, cr); + rw_exit(&rts->rts_rwlock); + + ASSERT(error >= 0); + + return (error); 
+} + +/* ARGSUSED */ +static int +rts_send(sock_lower_handle_t proto_handle, mblk_t *mp, + struct nmsghdr *msg, cred_t *cr) +{ + mblk_t *mp1; + conn_t *connp = (conn_t *)proto_handle; + rts_t *rts = connp->conn_rts; + rt_msghdr_t *rtm; + int error; + + ASSERT(DB_TYPE(mp) == M_DATA); + /* + * The semantics of the routing socket is such that the rtm_pid + * field is automatically filled in during requests with the + * current process' pid. We do this here (where we still have + * user context) after checking we have at least a message the + * size of a routing message header. + */ + if ((mp->b_wptr - mp->b_rptr) < sizeof (rt_msghdr_t)) { + if (!pullupmsg(mp, sizeof (rt_msghdr_t))) { + rts->rts_error = EINVAL; + freemsg(mp); + return (rts->rts_error); + } + } + rtm = (rt_msghdr_t *)mp->b_rptr; + rtm->rtm_pid = curproc->p_pid; + + mp1 = rts_ioctl_alloc(mp, DB_CRED(mp)); + if (mp1 == NULL) { + ASSERT(rts != NULL); + freemsg(mp); + return (ENOMEM); + } + + /* + * Allow only one outstanding request(ioctl) at any given time + */ + mutex_enter(&rts->rts_send_mutex); + while (rts->rts_flag & RTS_REQ_PENDING) { + int ret; + + ret = cv_wait_sig(&rts->rts_send_cv, &rts->rts_send_mutex); + if (ret <= 0) { + mutex_exit(&rts->rts_send_mutex); + freemsg(mp); + return (EINTR); + } + } + + rts->rts_flag |= RTS_REQ_PENDING; + + rts->rts_flag |= RTS_REQ_INPROG; + + mutex_exit(&rts->rts_send_mutex); + + CONN_INC_REF(connp); + + error = ip_rts_request_common(rts->rts_connp->conn_wq, mp1, connp, + DB_CREDDEF(mp, connp->conn_cred)); + + mutex_enter(&rts->rts_send_mutex); + if (error == EINPROGRESS) { + ASSERT(rts->rts_flag & RTS_REQ_INPROG); + if (rts->rts_flag & RTS_REQ_INPROG) { + /* + * Once the request has been issued we wait for + * completion + */ + cv_wait(&rts->rts_io_cv, &rts->rts_send_mutex); + error = rts->rts_error; + } + } + + ASSERT((error != 0) || !(rts->rts_flag & RTS_REQ_INPROG)); + ASSERT(MUTEX_HELD(&rts->rts_send_mutex)); + + rts->rts_flag &= ~(RTS_REQ_PENDING | 
RTS_REQ_INPROG); + cv_signal(&rts->rts_send_cv); + mutex_exit(&rts->rts_send_mutex); + return (error); +} + +/* ARGSUSED */ +sock_lower_handle_t +rts_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, + uint_t *smodep, int *errorp, int flags, cred_t *credp) +{ + conn_t *connp; + rts_t *rts; + rts_stack_t *rtss; + + if (family != AF_ROUTE || type != SOCK_RAW || + (proto != 0 && proto != AF_INET && proto != AF_INET6)) { + *errorp = EPROTONOSUPPORT; + return (NULL); + } + + connp = rts_open(flags, credp); + ASSERT(connp != NULL); + connp->conn_flags |= IPCL_NONSTR; + + rts = connp->conn_rts; + rtss = rts->rts_rtss; + + rts->rts_xmit_hiwat = rtss->rtss_xmit_hiwat; + rts->rts_xmit_lowat = rtss->rtss_xmit_lowat; + rts->rts_recv_hiwat = rtss->rtss_recv_hiwat; + rts->rts_recv_lowat = rts_mod_info.mi_lowat; + + ASSERT(rtss->rtss_ldi_ident != NULL); + + *errorp = ip_create_helper_stream(connp, rtss->rtss_ldi_ident); + if (*errorp != 0) { +#ifdef DEBUG + cmn_err(CE_CONT, "rts_create: create of IP helper stream" + " failed\n"); +#endif + (void) rts_close((sock_lower_handle_t)connp, 0, credp); + return (NULL); + } + + mutex_enter(&connp->conn_lock); + connp->conn_state_flags &= ~CONN_INCIPIENT; + mutex_exit(&connp->conn_lock); + + *errorp = 0; + *smodep = SM_ATOMIC; + *sock_downcalls = &sock_rts_downcalls; + return ((sock_lower_handle_t)connp); +} + +/* ARGSUSED */ +void +rts_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle, + sock_upcalls_t *sock_upcalls, int flags, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + rts_t *rts = connp->conn_rts; + rts_stack_t *rtss = rts->rts_rtss; + struct sock_proto_props sopp; + + connp->conn_upcalls = sock_upcalls; + connp->conn_upper_handle = sock_handle; + + sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | + SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ; + sopp.sopp_wroff = 0; + sopp.sopp_rxhiwat = rtss->rtss_recv_hiwat; + sopp.sopp_rxlowat = 
rts_mod_info.mi_lowat; + sopp.sopp_maxblk = INFPSZ; + sopp.sopp_maxpsz = rts_mod_info.mi_maxpsz; + sopp.sopp_minpsz = (rts_mod_info.mi_minpsz == 1) ? 0 : + rts_mod_info.mi_minpsz; + + (*connp->conn_upcalls->su_set_proto_props) + (connp->conn_upper_handle, &sopp); + + /* + * We treat it as already connected for routing socket. + */ + (*connp->conn_upcalls->su_connected) + (connp->conn_upper_handle, 0, NULL, -1); + + /* + * Indicate the down IP module that this is a routing socket + * client by sending an RTS IOCTL without any user data. Although + * this is just a notification message (without any real routing + * request), we pass in any credential for correctness sake. + */ + ip_rts_register(connp); +} + +/* ARGSUSED */ +int +rts_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + + ASSERT(connp != NULL && IPCL_IS_RTS(connp)); + return (rts_common_close(NULL, connp)); +} + +/* ARGSUSED */ +int +rts_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + + /* shut down the send side */ + if (how != SHUT_RD) + (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, + SOCK_OPCTL_SHUT_SEND, 0); + /* shut down the recv side */ + if (how != SHUT_WR) + (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, + SOCK_OPCTL_SHUT_RECV, 0); + return (0); +} + +void +rts_clr_flowctrl(sock_lower_handle_t proto_handle) +{ + conn_t *connp = (conn_t *)proto_handle; + rts_t *rts = connp->conn_rts; + + mutex_enter(&rts->rts_recv_mutex); + connp->conn_flow_cntrld = B_FALSE; + mutex_exit(&rts->rts_recv_mutex); +} + +int +rts_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, + int mode, int32_t *rvalp, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + int error; + + switch (cmd) { + case ND_SET: + case ND_GET: + case TI_GETPEERNAME: + case TI_GETMYNAME: +#ifdef DEUG + cmn_err(CE_CONT, "rts_ioctl cmd 0x%x on non sreams" + " socket", cmd); +#endif + 
error = EINVAL; + break; + default: + /* + * Pass on to IP using helper stream + */ + error = ldi_ioctl( + connp->conn_helper_info->ip_helper_stream_handle, + cmd, arg, mode, cr, rvalp); + break; + } + + return (error); +} + +sock_downcalls_t sock_rts_downcalls = { + rts_activate, + rts_accept, + rts_bind, + rts_listen, + rts_connect, + rts_getpeername, + rts_getsockname, + rts_getsockopt, + rts_setsockopt, + rts_send, + NULL, + NULL, + NULL, + rts_shutdown, + rts_clr_flowctrl, + rts_ioctl, + rts_close +}; diff --git a/usr/src/uts/common/inet/ip/rts_opt_data.c b/usr/src/uts/common/inet/ip/rts_opt_data.c index f815cf086c..bac0eabdc4 100644 --- a/usr/src/uts/common/inet/ip/rts_opt_data.c +++ b/usr/src/uts/common/inet/ip/rts_opt_data.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/stream.h> #define _SUN_TPI_VERSION 2 @@ -40,14 +38,7 @@ #include <netinet/tcp.h> #include <netinet/ip_mroute.h> #include <inet/optcom.h> - -extern int rts_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, - uchar_t *ptr); -extern int rts_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, - uchar_t *ptr); -extern int rts_opt_set(queue_t *q, uint_t optset_context, int level, - int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, - uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk); +#include <inet/rts_impl.h> /* * Table of all known options handled on a RTS protocol stack. 
@@ -102,8 +93,8 @@ uint_t rts_max_optsize; /* initialized in _init() */ optdb_obj_t rts_opt_obj = { rts_opt_default, /* RTS default value function pointer */ - rts_opt_get, /* RTS get function pointer */ - rts_opt_set, /* RTS set function pointer */ + rts_tpi_opt_get, /* RTS get function pointer */ + rts_tpi_opt_set, /* RTS set function pointer */ B_TRUE, /* RTS is tpi provider */ RTS_OPT_ARR_CNT, /* RTS option database count of entries */ rts_opt_arr, /* RTS option database */ diff --git a/usr/src/uts/common/inet/ip/rtsddi.c b/usr/src/uts/common/inet/ip/rtsddi.c index 27704da503..482c53ab5c 100644 --- a/usr/src/uts/common/inet/ip/rtsddi.c +++ b/usr/src/uts/common/inet/ip/rtsddi.c @@ -28,10 +28,22 @@ #include <sys/modctl.h> #include <inet/common.h> #include <inet/ip.h> +#include <inet/rts_impl.h> +#include <sys/strsubr.h> +#include <sys/socketvar.h> + +#include <netinet/in.h> +#include <netinet/ip6.h> + +#include <inet/common.h> +#include <inet/ip.h> + #define INET_NAME "rts" #define INET_DEVSTRTAB rtsinfo #define INET_DEVDESC "PF_ROUTE socket STREAMS driver" +#define INET_SOCKDESC "PF_ROUTE socket module" +#define INET_SOCK_PROTO_CREATE_FUNC (*rts_create) #define INET_DEVMINOR 0 #define INET_DEVMTFLAGS (D_MP|D_MTQPAIR|D_SYNCSTR) diff --git a/usr/src/uts/common/inet/ip/spdsock.c b/usr/src/uts/common/inet/ip/spdsock.c index dc2e113505..749db40ee6 100644 --- a/usr/src/uts/common/inet/ip/spdsock.c +++ b/usr/src/uts/common/inet/ip/spdsock.c @@ -23,8 +23,6 @@ * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/param.h> #include <sys/types.h> #include <sys/stream.h> @@ -55,6 +53,7 @@ #include <inet/ip.h> #include <inet/ip6.h> #include <inet/mi.h> +#include <inet/proto_set.h> #include <inet/nd.h> #include <inet/ip_if.h> #include <inet/tun.h> @@ -3199,7 +3198,7 @@ spdsock_opt_set(queue_t *q, uint_t mgmt_flags, int level, int name, if (*i1 > spds->spds_max_buf) return (ENOBUFS); RD(q)->q_hiwat = *i1; - (void) mi_set_sth_hiwat(RD(q), *i1); + (void) proto_set_rx_hiwat(RD(q), NULL, *i1); break; } break; @@ -3407,7 +3406,7 @@ spdsock_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) oq->q_lowat = spds->spds_xmit_lowat; qprocson(q); - (void) mi_set_sth_hiwat(q, spds->spds_recv_hiwat); + (void) proto_set_rx_hiwat(q, NULL, spds->spds_recv_hiwat); *devp = makedevice(getmajor(*devp), ss->spdsock_minor); return (0); diff --git a/usr/src/uts/common/inet/ip6.h b/usr/src/uts/common/inet/ip6.h index 1dbe8c3dd1..d463c3f6ee 100644 --- a/usr/src/uts/common/inet/ip6.h +++ b/usr/src/uts/common/inet/ip6.h @@ -378,9 +378,9 @@ extern void mld_timeout_handler(void *); extern void pr_addr_dbg(char *, int, const void *); extern int ip_multirt_apply_membership_v6(int (*fn)(conn_t *, boolean_t, - const in6_addr_t *, int, mcast_record_t, const in6_addr_t *, - mblk_t *), ire_t *, conn_t *, boolean_t, const in6_addr_t *, - mcast_record_t, const in6_addr_t *, mblk_t *); + const in6_addr_t *, int, mcast_record_t, const in6_addr_t *, mblk_t *), + ire_t *, conn_t *, boolean_t, const in6_addr_t *, mcast_record_t, + const in6_addr_t *, mblk_t *); extern void ip_newroute_ipif_v6(queue_t *, mblk_t *, ipif_t *, in6_addr_t, int, zoneid_t); extern void ip_newroute_v6(queue_t *, mblk_t *, const in6_addr_t *, @@ -391,6 +391,11 @@ extern size_t ip6_get_src_preferences(conn_t *, uint32_t *); extern int ip6_set_src_preferences(conn_t *, uint32_t); extern int ip6_set_pktinfo(cred_t *, conn_t *, struct in6_pktinfo *, mblk_t *); +extern int 
ip_proto_bind_laddr_v6(conn_t *, mblk_t **, uint8_t, + const in6_addr_t *, uint16_t, boolean_t); +extern int ip_proto_bind_connected_v6(conn_t *, mblk_t **, + uint8_t, in6_addr_t *, uint16_t, const in6_addr_t *, ip6_pkt_t *, + uint16_t, boolean_t, boolean_t); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/inet/ip_if.h b/usr/src/uts/common/inet/ip_if.h index c0a6c51696..c5982de059 100644 --- a/usr/src/uts/common/inet/ip_if.h +++ b/usr/src/uts/common/inet/ip_if.h @@ -234,8 +234,11 @@ extern ipif_t *ipif_getby_indexes(uint_t, uint_t, boolean_t, ip_stack_t *); extern void ipif_init(ip_stack_t *); extern ipif_t *ipif_lookup_addr(ipaddr_t, ill_t *, zoneid_t, queue_t *, mblk_t *, ipsq_func_t, int *, ip_stack_t *); +extern boolean_t ip_addr_exists(ipaddr_t, zoneid_t, ip_stack_t *); extern ipif_t *ipif_lookup_addr_v6(const in6_addr_t *, ill_t *, zoneid_t, queue_t *, mblk_t *, ipsq_func_t, int *, ip_stack_t *); +extern boolean_t ip_addr_exists_v6(const in6_addr_t *, zoneid_t, + ip_stack_t *); extern zoneid_t ipif_lookup_addr_zoneid(ipaddr_t, ill_t *, ip_stack_t *); extern zoneid_t ipif_lookup_addr_zoneid_v6(const in6_addr_t *, ill_t *, ip_stack_t *); diff --git a/usr/src/uts/common/inet/ip_impl.h b/usr/src/uts/common/inet/ip_impl.h index f7a9b8ff58..dae62ab499 100644 --- a/usr/src/uts/common/inet/ip_impl.h +++ b/usr/src/uts/common/inet/ip_impl.h @@ -44,6 +44,8 @@ extern "C" { #define IP_MOD_ID 5701 +#define INET_NAME "ip" + #ifdef _BIG_ENDIAN #define IP_HDR_CSUM_TTL_ADJUST 256 #define IP_TCP_CSUM_COMP IPPROTO_TCP @@ -546,6 +548,22 @@ extern zoneid_t ip_get_zoneid_v4(ipaddr_t, mblk_t *, ip_stack_t *, zoneid_t); extern zoneid_t ip_get_zoneid_v6(in6_addr_t *, mblk_t *, const ill_t *, ip_stack_t *, zoneid_t); +/* + * flag passed in by IP based protocols to get a private ip stream with + * no conn_t. 
Note this flag has the same value as SO_FALLBACK + */ +#define IP_HELPER_STR SO_FALLBACK + +#define IP_MOD_MINPSZ 1 +#define IP_MOD_MAXPSZ INFPSZ +#define IP_MOD_HIWAT 65536 +#define IP_MOD_LOWAT 1024 + +#define DEV_IP "/devices/pseudo/ip@0:ip" +#define DEV_IP6 "/devices/pseudo/ip6@0:ip6" + +extern struct kmem_cache *ip_helper_stream_cache; + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/inet/ip_rts.h b/usr/src/uts/common/inet/ip_rts.h index a8d3971192..70b33e0278 100644 --- a/usr/src/uts/common/inet/ip_rts.h +++ b/usr/src/uts/common/inet/ip_rts.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _INET_IP_RTS_H #define _INET_IP_RTS_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -72,8 +70,9 @@ extern void rts_fill_msg_v6(int, int, const in6_addr_t *, extern size_t rts_header_msg_size(int); -extern void rts_queue_input(mblk_t *, queue_t *, sa_family_t, - ip_stack_t *); +extern void rts_queue_input(mblk_t *, conn_t *, sa_family_t, ip_stack_t *); + +extern int ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *, cred_t *); #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/inet/ip_stack.h b/usr/src/uts/common/inet/ip_stack.h index d0c3953374..3c53e1a3d3 100644 --- a/usr/src/uts/common/inet/ip_stack.h +++ b/usr/src/uts/common/inet/ip_stack.h @@ -425,6 +425,8 @@ struct ip_stack { kmutex_t ips_ipobs_cb_lock; uint_t ips_ipobs_cb_nwalkers; kcondvar_t ips_ipobs_cb_cv; + + struct __ldi_ident *ips_ldi_ident; }; typedef struct ip_stack ip_stack_t; diff --git a/usr/src/uts/common/inet/ipclassifier.h b/usr/src/uts/common/inet/ipclassifier.h index 4665549c69..39cdddb7c4 100644 --- a/usr/src/uts/common/inet/ipclassifier.h +++ b/usr/src/uts/common/inet/ipclassifier.h @@ -37,6 +37,9 @@ extern "C" { #include <inet/ip6.h> #include 
<netinet/in.h> /* for IPPROTO_* constants */ #include <sys/sdt.h> +#include <sys/socket_proto.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> typedef void (*edesc_spf)(void *, mblk_t *, void *, int); typedef void (*edesc_rpf)(void *, mblk_t *, void *); @@ -80,6 +83,8 @@ typedef void (*edesc_rpf)(void *, mblk_t *, void *); #define IPCL_RTSCONN 0x00000020 /* From rts_conn_cache */ #define IPCL_ISV6 0x00000040 /* AF_INET6 */ #define IPCL_IPTUN 0x00000080 /* Has "tun" plumbed above it */ +#define IPCL_NONSTR 0x00001000 /* A non-STREAMS socket */ +#define IPCL_IN_SQUEUE 0x10000000 /* Waiting squeue to finish */ /* Conn Masks */ #define IPCL_TCP (IPCL_TCP4|IPCL_TCP6) @@ -136,6 +141,8 @@ typedef void (*edesc_rpf)(void *, mblk_t *, void *); (connp)->conn_ulp == IPPROTO_IPV6) && \ ((connp)->conn_flags & IPCL_IPTUN)) +#define IPCL_IS_NONSTR(connp) ((connp)->conn_flags & IPCL_NONSTR) + typedef struct connf_s connf_t; typedef struct @@ -145,6 +152,21 @@ typedef struct pc_t ctb_stack[CONN_STACK_DEPTH]; } conn_trace_t; +typedef struct ip_helper_minor_info_s { + dev_t ip_minfo_dev; /* Device */ + vmem_t *ip_minfo_arena; /* Arena */ +} ip_helper_minfo_t; + +/* + * ip helper stream info + */ +typedef struct ip_helper_stream_info_s { + ldi_handle_t ip_helper_stream_handle; + queue_t *ip_helper_stream_rq; + queue_t *ip_helper_stream_wq; + ip_helper_minfo_t *ip_helper_stream_minfo; +} ip_helper_stream_info_t; + /* * The initial fields in the conn_t are setup by the kmem_cache constructor, * and are preserved when it is freed. 
Fields after that are bzero'ed when @@ -236,6 +258,7 @@ struct conn_s { queue_t *conn_wq; /* Write queue */ dev_t conn_dev; /* Minor number */ vmem_t *conn_minor_arena; /* Minor arena */ + ip_helper_stream_info_t *conn_helper_info; cred_t *conn_cred; /* Credentials */ connf_t *conn_g_fanout; /* Global Hash bucket head */ @@ -300,6 +323,11 @@ struct conn_s { #define conn_nexthop_v4 V4_PART_OF_V6(conn_nexthop_v6) cred_t *conn_peercred; /* Peer credentials, if any */ + kcondvar_t conn_sq_cv; /* For non-STREAMS socket IO */ + kthread_t *conn_sq_caller; /* Caller of squeue sync ops */ + sock_upcalls_t *conn_upcalls; /* Upcalls to sockfs */ + sock_upper_handle_t conn_upper_handle; /* Upper handle: sonode * */ + unsigned int conn_ulp_labeled : 1, /* ULP label is synced */ conn_mlp_type : 2, /* mlp_type_t; tsol/tndb.h */ @@ -308,6 +336,8 @@ struct conn_s { conn_anon_port : 1, /* user bound anonymously */ conn_mac_exempt : 1, /* unlabeled with loose MAC */ conn_spare : 26; + + boolean_t conn_flow_cntrld; netstack_t *conn_netstack; /* Corresponds to a netstack_hold */ #ifdef CONN_DEBUG #define CONN_TRACE_MAX 10 @@ -582,6 +612,14 @@ conn_t *ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *, ipha_t *, tcph_t *, ip_stack_t *); conn_t *ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *, ip6_t *, tcph_t *, ip_stack_t *); + +extern int ip_create_helper_stream(conn_t *connp, ldi_ident_t li); +extern void ip_close_helper_stream(conn_t *connp); + +extern int ip_get_options(conn_t *, int, int, void *, t_uscalar_t *, cred_t *); +extern int ip_set_options(conn_t *, int, int, const void *, t_uscalar_t, + cred_t *); + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/inet/mi.c b/usr/src/uts/common/inet/mi.c index a8848a3499..f88fe3709b 100644 --- a/usr/src/uts/common/inet/mi.c +++ b/usr/src/uts/common/inet/mi.c @@ -24,8 +24,6 @@ */ /* Copyright (c) 1990 Mentat Inc. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <inet/common.h> /* for various inet/mi.h and inet/nd.h needs */ #include <sys/stream.h> @@ -46,6 +44,9 @@ #include <sys/cmn_err.h> #include <sys/debug.h> #include <sys/kobj.h> +#include <sys/stropts.h> +#include <sys/strsubr.h> +#include <inet/proto_set.h> #define ISDIGIT(ch) ((ch) >= '0' && (ch) <= '9') #define ISUPPER(ch) ((ch) >= 'A' && (ch) <= 'Z') @@ -64,7 +65,7 @@ * allocation strategy is changed. */ -typedef struct stroptions *STROPTP; +typedef struct stroptions *STROPTP; typedef union T_primitives *TPRIMP; /* Timer block states. */ @@ -903,93 +904,6 @@ mi_offset_paramc(mblk_t *mp, size_t offset, size_t len) return (NULL); } - -boolean_t -mi_set_sth_hiwat(queue_t *q, size_t size) -{ - MBLKP mp; - STROPTP stropt; - - if (!(mp = allocb(sizeof (*stropt), BPRI_LO))) - return (B_FALSE); - mp->b_datap->db_type = M_SETOPTS; - mp->b_wptr += sizeof (*stropt); - stropt = (STROPTP)mp->b_rptr; - stropt->so_flags = SO_HIWAT; - stropt->so_hiwat = size; - putnext(q, mp); - return (B_TRUE); -} - -boolean_t -mi_set_sth_lowat(queue_t *q, size_t size) -{ - MBLKP mp; - STROPTP stropt; - - if (!(mp = allocb(sizeof (*stropt), BPRI_LO))) - return (B_FALSE); - mp->b_datap->db_type = M_SETOPTS; - mp->b_wptr += sizeof (*stropt); - stropt = (STROPTP)mp->b_rptr; - stropt->so_flags = SO_LOWAT; - stropt->so_lowat = size; - putnext(q, mp); - return (B_TRUE); -} - -/* ARGSUSED */ -boolean_t -mi_set_sth_maxblk(queue_t *q, ssize_t size) -{ - MBLKP mp; - STROPTP stropt; - - if (!(mp = allocb(sizeof (*stropt), BPRI_LO))) - return (B_FALSE); - mp->b_datap->db_type = M_SETOPTS; - mp->b_wptr += sizeof (*stropt); - stropt = (STROPTP)mp->b_rptr; - stropt->so_flags = SO_MAXBLK; - stropt->so_maxblk = size; - putnext(q, mp); - return (B_TRUE); -} - -boolean_t -mi_set_sth_copyopt(queue_t *q, int copyopt) -{ - MBLKP mp; - STROPTP stropt; - - if (!(mp = allocb(sizeof (*stropt), BPRI_LO))) - return (B_FALSE); - mp->b_datap->db_type = 
M_SETOPTS; - mp->b_wptr += sizeof (*stropt); - stropt = (STROPTP)mp->b_rptr; - stropt->so_flags = SO_COPYOPT; - stropt->so_copyopt = (ushort_t)copyopt; - putnext(q, mp); - return (B_TRUE); -} - -boolean_t -mi_set_sth_wroff(queue_t *q, size_t size) -{ - MBLKP mp; - STROPTP stropt; - - if (!(mp = allocb(sizeof (*stropt), BPRI_LO))) - return (B_FALSE); - mp->b_datap->db_type = M_SETOPTS; - mp->b_wptr += sizeof (*stropt); - stropt = (STROPTP)mp->b_rptr; - stropt->so_flags = SO_WROFF; - stropt->so_wroff = (ushort_t)size; - putnext(q, mp); - return (B_TRUE); -} - int mi_sprintf(char *buf, char *fmt, ...) { diff --git a/usr/src/uts/common/inet/mi.h b/usr/src/uts/common/inet/mi.h index 6cae6a1acf..53608ca316 100644 --- a/usr/src/uts/common/inet/mi.h +++ b/usr/src/uts/common/inet/mi.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. 
*/ @@ -28,8 +27,6 @@ #ifndef _INET_MI_H #define _INET_MI_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -39,6 +36,7 @@ extern "C" { #include <sys/types.h> #include <sys/vmem.h> #include <sys/varargs.h> +#include <netinet/in.h> #define MI_MIN_DEV INET_MIN_DEV /* minimum minor device number */ #define MI_COPY_IN 1 @@ -137,13 +135,6 @@ extern int mi_open_link(void **mi_head, IDP ptr, dev_t *devp, int flag, extern uint8_t *mi_offset_param(mblk_t *mp, size_t offset, size_t len); extern uint8_t *mi_offset_paramc(mblk_t *mp, size_t offset, size_t len); - -extern boolean_t mi_set_sth_hiwat(queue_t *q, size_t size); -extern boolean_t mi_set_sth_lowat(queue_t *q, size_t size); -extern boolean_t mi_set_sth_maxblk(queue_t *q, ssize_t size); -extern boolean_t mi_set_sth_copyopt(queue_t *q, int copyopt); -extern boolean_t mi_set_sth_wroff(queue_t *q, size_t size); - /*PRINTFLIKE2*/ extern int mi_sprintf(char *buf, char *fmt, ...) __KPRINTFLIKE(2); diff --git a/usr/src/uts/common/inet/optcom.c b/usr/src/uts/common/inet/optcom.c index 3de4044e58..f241599426 100644 --- a/usr/src/uts/common/inet/optcom.c +++ b/usr/src/uts/common/inet/optcom.c @@ -19,13 +19,11 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * This file contains common code for handling Options Management requests. 
*/ @@ -38,6 +36,7 @@ #define _SUN_TPI_VERSION 2 #include <sys/tihdr.h> #include <sys/socket.h> +#include <sys/socketvar.h> #include <sys/ddi.h> #include <sys/debug.h> /* for ASSERT */ #include <sys/policy.h> @@ -52,6 +51,8 @@ #include "optcom.h" #include <inet/optcom.h> +#include <inet/ipclassifier.h> +#include <inet/proto_set.h> /* * Function prototypes @@ -69,7 +70,6 @@ static void do_opt_current(queue_t *, struct T_opthdr *, uchar_t **, static int do_opt_check_or_negotiate(queue_t *q, struct T_opthdr *reqopt, uint_t optset_context, uchar_t **resptrp, t_uscalar_t *worst_statusp, cred_t *, optdb_obj_t *dbobjp, mblk_t *first_mp); -static opdes_t *opt_chk_lookup(t_uscalar_t, t_uscalar_t, opdes_t *, uint_t); static boolean_t opt_level_valid(t_uscalar_t, optlevel_t *, uint_t); static size_t opt_level_allopts_lengths(t_uscalar_t, opdes_t *, uint_t); static boolean_t opt_length_ok(opdes_t *, struct T_opthdr *); @@ -186,6 +186,9 @@ optcom_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error) * the sq framework arranges to restart this operation and passes control to * the restart function ip_restart_optmgmt() which in turn calls * svr4_optcom_req() or tpi_optcom_req() to restart the option processing. + * + * XXX Remove the asynchronous behavior of svr_optcom_req() and + * tpi_optcom_req(). */ int svr4_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, @@ -214,6 +217,7 @@ svr4_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, boolean_t pass_to_next = B_FALSE; struct T_optmgmt_ack *toa; struct T_optmgmt_req *tor; + int error; /* * Allocate M_CTL and prepend to the packet for restarting this @@ -409,85 +413,17 @@ no_mem:; if (opt->name == T_ALLOPT) goto bad_opt; - /* Find the option in the opt_arr. 
*/ - if ((optd = opt_chk_lookup(opt->level, opt->name, - opt_arr, opt_arr_cnt)) == NULL) { - /* - * Not found, that is a bad thing if - * the caller is a tpi provider - */ - if (topmost_tpiprovider) - goto bad_opt; - else - continue; /* skip unmodified */ - } - - /* Additional checks dependent on operation. */ - switch (tor->MGMT_flags) { - case T_NEGOTIATE: - if (!OA_WRITE_OR_EXECUTE(optd, cr)) { - /* can't negotiate option */ - if (!(OA_MATCHED_PRIV(optd, cr)) && - OA_WX_ANYPRIV(optd)) { - /* - * not privileged but privilege - * will help negotiate option. - */ - optcom_err_ack(q, mp, TACCES, 0); - return (0); - } else - goto bad_opt; - } - /* - * Verify size for options - * Note: For retaining compatibility with historical - * behavior, variable lengths options will have their - * length verified in the setfn() processing. - * In order to be compatible with SunOS 4.X we return - * EINVAL errors for bad lengths. - */ - if (!(optd->opdes_props & OP_VARLEN)) { - /* fixed length - size must match */ - if (opt->len != optd->opdes_size) { - optcom_err_ack(q, mp, TSYSERR, EINVAL); - return (0); - } - } - break; - - case T_CHECK: - if (!OA_RWX_ANYPRIV(optd)) - /* any of "rwx" permission but not not none */ - goto bad_opt; - /* - * XXX Since T_CURRENT was not there in TLI and the - * official TLI inspired TPI standard, getsockopt() - * API uses T_CHECK (for T_CURRENT semantics) - * The following fallthru makes sense because of its - * historical use as semantic equivalent to T_CURRENT. - */ - /* FALLTHRU */ - case T_CURRENT: - if (!OA_READ_PERMISSION(optd, cr)) { - /* can't read option value */ - if (!(OA_MATCHED_PRIV(optd, cr)) && - OA_R_ANYPRIV(optd)) { - /* - * not privileged but privilege - * will help in reading option value. 
- */ - optcom_err_ack(q, mp, TACCES, 0); - return (0); - } else - goto bad_opt; - } - break; - - default: - optcom_err_ack(q, mp, TBADFLAG, 0); + error = proto_opt_check(opt->level, opt->name, opt->len, NULL, + opt_arr, opt_arr_cnt, topmost_tpiprovider, + tor->MGMT_flags == T_NEGOTIATE, tor->MGMT_flags == T_CHECK, + cr); + if (error < 0) { + optcom_err_ack(q, mp, -error, 0); + return (0); + } else if (error > 0) { + optcom_err_ack(q, mp, TSYSERR, error); return (0); } - /* We liked it. Keep going. */ } /* end for loop scanning option buffer */ /* Now complete the operation as required. */ @@ -609,7 +545,7 @@ restart: * non-fatal by svr4_optcom_req() and are * returned by setfn() when it is passed an * option it does not handle. Since the option - * passed opt_chk_lookup(), it is implied that + * passed proto_opt_lookup(), it is implied that * it is valid but was either handled upstream * or will be handled downstream. */ @@ -892,7 +828,7 @@ process_topthdrs_first_pass(mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, /* Find the option in the opt_arr. */ if (opt->name != T_ALLOPT) { - optd = opt_chk_lookup(opt->level, opt->name, + optd = proto_opt_lookup(opt->level, opt->name, opt_arr, opt_arr_cnt); if (optd == NULL) { /* @@ -972,7 +908,7 @@ process_topthdrs_first_pass(mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, case T_CURRENT: /* - * The opt_chk_lookup() routine call above approved of + * The proto_opt_lookup() routine call above approved of * this option so we can work on the status for it * based on the permissions for the operation. (This * can override any status for it set at higher levels) @@ -1044,7 +980,7 @@ process_topthdrs_first_pass(mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, } } /* - * The opt_chk_lookup() routine above() approved of + * The proto_opt_lookup() routine above() approved of * this option so we can work on the status for it based * on the permissions for the operation. (This can * override anything set at a higher level). 
@@ -1309,7 +1245,7 @@ do_opt_default(queue_t *q, struct T_opthdr *reqopt, uchar_t **resptrp, /* * lookup the option in the table and fill default value */ - optd = opt_chk_lookup(reqopt->level, reqopt->name, + optd = proto_opt_lookup(reqopt->level, reqopt->name, opt_arr, opt_arr_cnt); if (optd == NULL) { @@ -1609,8 +1545,7 @@ do_opt_current(queue_t *q, struct T_opthdr *reqopt, uchar_t **resptrp, } } - - +/* ARGSUSED */ static int do_opt_check_or_negotiate(queue_t *q, struct T_opthdr *reqopt, uint_t optset_context, uchar_t **resptrp, t_uscalar_t *worst_statusp, @@ -1819,7 +1754,6 @@ do_opt_check_or_negotiate(queue_t *q, struct T_opthdr *reqopt, * Then delete "ignored" options from option buffer and return success. * */ - int tpi_optcom_buf(queue_t *q, mblk_t *mp, t_scalar_t *opt_lenp, t_scalar_t opt_offset, cred_t *cr, optdb_obj_t *dbobjp, @@ -1890,7 +1824,7 @@ tpi_optcom_buf(queue_t *q, mblk_t *mp, t_scalar_t *opt_lenp, } /* Find the option in the opt_arr. */ - optd = opt_chk_lookup(opt->level, opt->name, + optd = proto_opt_lookup(opt->level, opt->name, opt_arr, opt_arr_cnt); if (optd == NULL) { @@ -2043,21 +1977,6 @@ error_ret: return (error); } -static opdes_t * -opt_chk_lookup(t_uscalar_t level, t_uscalar_t name, opdes_t *opt_arr, - uint_t opt_arr_cnt) -{ - opdes_t *optd; - - for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt]; - optd++) { - if (level == (uint_t)optd->opdes_level && - name == (uint_t)optd->opdes_name) - return (optd); - } - return (NULL); -} - static boolean_t opt_level_valid(t_uscalar_t level, optlevel_t *valid_level_arr, uint_t valid_level_arr_cnt) @@ -2287,3 +2206,68 @@ optcom_pkt_set(uchar_t *invalp, uint_t inlen, boolean_t sticky, *optlenp = inlen + reservelen; return (0); } + +int +process_auxiliary_options(conn_t *connp, void *control, t_uscalar_t controllen, + void *optbuf, optdb_obj_t *dbobjp, int (*opt_set_fn)(conn_t *, uint_t, int, + int, uint_t, uchar_t *, uint_t *, uchar_t *, void *, cred_t *)) +{ + struct cmsghdr *cmsg; + opdes_t 
*optd; + t_uscalar_t outlen; + int error = EOPNOTSUPP; + t_uscalar_t len; + uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt; + opdes_t *opt_arr = dbobjp->odb_opt_des_arr; + + for (cmsg = (struct cmsghdr *)control; + CMSG_VALID(cmsg, control, (uintptr_t)control + controllen); + cmsg = CMSG_NEXT(cmsg)) { + + len = (t_uscalar_t)CMSG_CONTENTLEN(cmsg); + /* Find the option in the opt_arr. */ + optd = proto_opt_lookup(cmsg->cmsg_level, cmsg->cmsg_type, + opt_arr, opt_arr_cnt); + if (optd == NULL) { + return (EINVAL); + } + if (OA_READONLY_PERMISSION(optd, connp->conn_cred)) { + return (EACCES); + } + if (OA_MATCHED_PRIV(optd, connp->conn_cred)) { + /* + * For privileged options, we DO perform + * access checks as is common sense + */ + if (!OA_WX_ANYPRIV(optd)) { + return (EACCES); + } + } else { + /* + * For non privileged, we fail instead of following + * "ignore" semantics dictated by XTI spec for + * permissions problems. + */ + if (!OA_WX_NOPRIV(optd)) { /* nopriv */ + return (EACCES); + } + } + error = opt_set_fn(connp, SETFN_UD_NEGOTIATE, optd->opdes_level, + optd->opdes_name, len, (uchar_t *)CMSG_CONTENT(cmsg), + &outlen, (uchar_t *)CMSG_CONTENT(cmsg), (void *)optbuf, + connp->conn_cred); + if (error > 0) { + return (error); + } else if (outlen > len) { + return (EINVAL); + } else { + /* + * error can be -ve if the protocol wants to + * pass the option to IP. We do not pass auxiliary + * options to IP. + */ + error = 0; + } + } + return (error); +} diff --git a/usr/src/uts/common/inet/optcom.h b/usr/src/uts/common/inet/optcom.h index 1d2d1cb09d..07cb7cf946 100644 --- a/usr/src/uts/common/inet/optcom.h +++ b/usr/src/uts/common/inet/optcom.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. 
*/ @@ -27,14 +27,13 @@ #ifndef _INET_OPTCOM_H #define _INET_OPTCOM_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif #if defined(_KERNEL) && defined(__STDC__) +#include <inet/ipclassifier.h> /* Options Description Structure */ typedef struct opdes_s { t_uscalar_t opdes_name; /* option name */ @@ -139,6 +138,10 @@ typedef struct opdes_s { #define OA_NO_PERMISSION(x, c) (OA_MATCHED_PRIV((x), (c)) ? \ ((x)->opdes_access_priv == 0) : ((x)->opdes_access_nopriv == 0)) +#define PASS_OPT_TO_IP(connp) \ + if (IPCL_IS_NONSTR(connp)) \ + return (-EINVAL) + /* * Other properties set in opdes_props field. */ @@ -217,6 +220,10 @@ extern t_uscalar_t optcom_max_optsize(opdes_t *, uint_t); extern int optcom_pkt_set(uchar_t *, uint_t, boolean_t, uchar_t **, uint_t *, uint_t); +extern int process_auxiliary_options(conn_t *, void *, t_uscalar_t, + void *, optdb_obj_t *, int (*)(conn_t *, uint_t, int, int, uint_t, + uchar_t *, uint_t *, uchar_t *, void *, cred_t *)); + #endif /* defined(_KERNEL) && defined(__STDC__) */ #ifdef __cplusplus diff --git a/usr/src/uts/common/inet/proto_set.c b/usr/src/uts/common/inet/proto_set.c new file mode 100644 index 0000000000..45f07d2ed3 --- /dev/null +++ b/usr/src/uts/common/inet/proto_set.c @@ -0,0 +1,440 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/types.h> +#include <inet/common.h> +#include <sys/stream.h> +#include <sys/stropts.h> +#include <sys/strsun.h> +#include <sys/sysmacros.h> +#include <sys/stropts.h> +#include <sys/strsubr.h> +#include <sys/tpicommon.h> +#include <sys/socket_proto.h> +#include <sys/policy.h> +#include <inet/optcom.h> +#include <inet/ipclassifier.h> + +boolean_t +proto_set_rx_hiwat(queue_t *q, conn_t *connp, size_t size) +{ + + if (connp != NULL && IPCL_IS_NONSTR(connp)) { + struct sock_proto_props sopp; + + sopp.sopp_flags = SOCKOPT_RCVHIWAT; + sopp.sopp_rxhiwat = size; + (*connp->conn_upcalls->su_set_proto_props) + (connp->conn_upper_handle, &sopp); + } else { + MBLKP mp; + struct stroptions *stropt; + + if (!(mp = allocb(sizeof (*stropt), BPRI_LO))) + return (B_FALSE); + mp->b_datap->db_type = M_SETOPTS; + mp->b_wptr += sizeof (*stropt); + stropt = (struct stroptions *)mp->b_rptr; + stropt->so_flags = SO_HIWAT; + stropt->so_hiwat = size; + putnext(q, mp); + } + return (B_TRUE); +} + +boolean_t +proto_set_rx_lowat(queue_t *q, conn_t *connp, size_t size) +{ + + if (connp != NULL && IPCL_IS_NONSTR(connp)) { + struct sock_proto_props sopp; + + sopp.sopp_flags = SOCKOPT_RCVLOWAT; + sopp.sopp_rxlowat = size; + (*connp->conn_upcalls->su_set_proto_props) + (connp->conn_upper_handle, &sopp); + } else { + MBLKP mp; + struct stroptions *stropt; + + if (!(mp = allocb(sizeof (*stropt), BPRI_LO))) + return (B_FALSE); + mp->b_datap->db_type = M_SETOPTS; + mp->b_wptr += sizeof (*stropt); + stropt = (struct stroptions *)mp->b_rptr; + stropt->so_flags = SO_LOWAT; + stropt->so_lowat = size; + putnext(q, mp); + } + return (B_TRUE); +} + +/* 
+ * Set maximum packet size. This is the maximum amount of data the protocol + * wants to be given at any time. Larger data needs to be broken into multiples + * of maximum packet size and given to the protocol one at a time. + */ +boolean_t +proto_set_maxpsz(queue_t *q, conn_t *connp, size_t size) +{ + if (connp != NULL && IPCL_IS_NONSTR(connp)) { + struct sock_proto_props sopp; + + sopp.sopp_flags = SOCKOPT_MAXPSZ; + sopp.sopp_maxpsz = size; + (*connp->conn_upcalls->su_set_proto_props) + (connp->conn_upper_handle, &sopp); + return (B_TRUE); + } else { + struct stdata *stp; + queue_t *wq; + stp = STREAM(q); + + /* + * At this point change of a queue parameter is not allowed + * when a multiplexor is sitting on top. + */ + if (stp == NULL || stp->sd_flag & STPLEX) + return (B_FALSE); + + claimstr(stp->sd_wrq); + wq = stp->sd_wrq->q_next; + ASSERT(wq != NULL); + (void) strqset(wq, QMAXPSZ, 0, size); + releasestr(stp->sd_wrq); + return (B_TRUE); + } +} + +/* ARGSUSED */ +boolean_t +proto_set_tx_maxblk(queue_t *q, conn_t *connp, ssize_t size) +{ + if (connp != NULL && IPCL_IS_NONSTR(connp)) { + struct sock_proto_props sopp; + + sopp.sopp_flags = SOCKOPT_MAXBLK; + sopp.sopp_maxblk = size; + (*connp->conn_upcalls->su_set_proto_props) + (connp->conn_upper_handle, &sopp); + } else { + MBLKP mp; + struct stroptions *stropt; + + if (!(mp = allocb(sizeof (*stropt), BPRI_LO))) + return (B_FALSE); + mp->b_datap->db_type = M_SETOPTS; + mp->b_wptr += sizeof (*stropt); + stropt = (struct stroptions *)mp->b_rptr; + stropt->so_flags = SO_MAXBLK; + stropt->so_maxblk = size; + putnext(q, mp); + } + return (B_TRUE); +} + +boolean_t +proto_set_tx_copyopt(queue_t *q, conn_t *connp, int copyopt) +{ + if (connp != NULL && IPCL_IS_NONSTR(connp)) { + struct sock_proto_props sopp; + + sopp.sopp_flags = SOCKOPT_ZCOPY; + sopp.sopp_zcopyflag = (ushort_t)copyopt; + (*connp->conn_upcalls->su_set_proto_props) + (connp->conn_upper_handle, &sopp); + } else { + MBLKP mp; + struct stroptions *stropt; + 
+ if (!(mp = allocb(sizeof (*stropt), BPRI_LO))) + return (B_FALSE); + mp->b_datap->db_type = M_SETOPTS; + mp->b_wptr += sizeof (*stropt); + stropt = (struct stroptions *)mp->b_rptr; + stropt->so_flags = SO_COPYOPT; + stropt->so_copyopt = (ushort_t)copyopt; + putnext(q, mp); + } + return (B_TRUE); +} + +boolean_t +proto_set_tx_wroff(queue_t *q, conn_t *connp, size_t size) +{ + if (connp != NULL && IPCL_IS_NONSTR(connp)) { + struct sock_proto_props sopp; + + sopp.sopp_flags = SOCKOPT_WROFF; + sopp.sopp_wroff = size; + + /* XXX workaround for CR6757374 */ + if (connp->conn_upper_handle != NULL) + (*connp->conn_upcalls->su_set_proto_props) + (connp->conn_upper_handle, &sopp); + } else { + + MBLKP mp; + struct stroptions *stropt; + if (!(mp = allocb(sizeof (*stropt), BPRI_LO))) + return (B_FALSE); + mp->b_datap->db_type = M_SETOPTS; + mp->b_wptr += sizeof (*stropt); + stropt = (struct stroptions *)mp->b_rptr; + stropt->so_flags = SO_WROFF; + stropt->so_wroff = (ushort_t)size; + putnext(q, mp); + } + return (B_TRUE); +} + +/* + * set OOBINLINE processing on the socket + */ +void +proto_set_rx_oob_opt(conn_t *connp, boolean_t onoff) +{ + struct sock_proto_props sopp; + + ASSERT(IPCL_IS_NONSTR(connp)); + + sopp.sopp_flags = SOCKOPT_OOBINLINE; + sopp.sopp_oobinline = onoff; + (*connp->conn_upcalls->su_set_proto_props) + (connp->conn_upper_handle, &sopp); +} + +/* + * Translate a TLI(/XTI) error into a system error as best we can. 
+ */ +static const int tli_errs[] = { + 0, /* no error */ + EADDRNOTAVAIL, /* TBADADDR */ + ENOPROTOOPT, /* TBADOPT */ + EACCES, /* TACCES */ + EBADF, /* TBADF */ + EADDRNOTAVAIL, /* TNOADDR */ + EPROTO, /* TOUTSTATE */ + ECONNABORTED, /* TBADSEQ */ + 0, /* TSYSERR - will never get */ + EPROTO, /* TLOOK - should never be sent by transport */ + EMSGSIZE, /* TBADDATA */ + EMSGSIZE, /* TBUFOVFLW */ + EPROTO, /* TFLOW */ + EWOULDBLOCK, /* TNODATA */ + EPROTO, /* TNODIS */ + EPROTO, /* TNOUDERR */ + EINVAL, /* TBADFLAG */ + EPROTO, /* TNOREL */ + EOPNOTSUPP, /* TNOTSUPPORT */ + EPROTO, /* TSTATECHNG */ + /* following represent error namespace expansion with XTI */ + EPROTO, /* TNOSTRUCTYPE - never sent by transport */ + EPROTO, /* TBADNAME - never sent by transport */ + EPROTO, /* TBADQLEN - never sent by transport */ + EADDRINUSE, /* TADDRBUSY */ + EBADF, /* TINDOUT */ + EBADF, /* TPROVMISMATCH */ + EBADF, /* TRESQLEN */ + EBADF, /* TRESADDR */ + EPROTO, /* TQFULL - never sent by transport */ + EPROTO, /* TPROTO */ +}; + +int +proto_tlitosyserr(int terr) +{ + ASSERT(terr != TSYSERR); + if (terr >= (sizeof (tli_errs) / sizeof (tli_errs[0]))) + return (EPROTO); + else + return (tli_errs[terr]); +} + +/* + * Verify that address is suitable for connect/sendmsg and is aligned properly + * Since this is a generic function we do not test for port being zero + * as some protocols like icmp do not require a port + */ +int +proto_verify_ip_addr(int family, const struct sockaddr *name, socklen_t namelen) +{ + + if (name == NULL || !OK_32PTR((char *)name)) + return (EINVAL); + + switch (family) { + case AF_INET: + if (name->sa_family != AF_INET) { + return (EAFNOSUPPORT); + } + + if (namelen != (socklen_t)sizeof (struct sockaddr_in)) { + return (EINVAL); + } + break; + case AF_INET6: { +#ifdef DEBUG + struct sockaddr_in6 *sin6; +#endif /* DEBUG */ + + if (name->sa_family != AF_INET6) { + return (EAFNOSUPPORT); + } + if (namelen != (socklen_t)sizeof (struct sockaddr_in6)) { + 
return (EINVAL); + } +#ifdef DEBUG + /* Verify that apps don't forget to clear sin6_scope_id etc */ + sin6 = (struct sockaddr_in6 *)name; + if (sin6->sin6_scope_id != 0 && + !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) { + zcmn_err(getzoneid(), CE_WARN, + "connect/send* with uninitialized sin6_scope_id " + "(%d) on socket. Pid = %d\n", + (int)sin6->sin6_scope_id, (int)curproc->p_pid); + } +#endif /* DEBUG */ + break; + } + default: + return (EINVAL); + } + + return (0); +} + +/* + * Do a lookup of the options in the array. + * Return NULL if there isn't a match. + */ +opdes_t * +proto_opt_lookup(t_uscalar_t level, t_uscalar_t name, opdes_t *opt_arr, + uint_t opt_arr_cnt) +{ + opdes_t *optd; + + for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt]; + optd++) { + if (level == (uint_t)optd->opdes_level && + name == (uint_t)optd->opdes_name) + return (optd); + } + return (NULL); +} + +/* + * Do a lookup of the options in the array and do permission and length checking + * Returns zero if there is no error (note: for non-tpi-providers not being able + * to find the option is not an error). TPI errors are returned as -ve. + */ +int +proto_opt_check(int level, int name, int len, t_uscalar_t *max_len, + opdes_t *opt_arr, uint_t opt_arr_cnt, boolean_t topmost_tpiprovider, + boolean_t negotiate, boolean_t check, cred_t *cr) +{ + opdes_t *optd; + + /* Find the option in the opt_arr. */ + if ((optd = proto_opt_lookup(level, name, opt_arr, opt_arr_cnt)) == + NULL) { + /* + * Not found, that is a bad thing if + * the caller is a tpi provider + */ + if (topmost_tpiprovider) + return (-TBADOPT); + else + return (0); /* skip unmodified */ + } + + /* Additional checks dependent on operation. */ + if (negotiate) { + /* Cannot be true at the same time */ + ASSERT(check == B_FALSE); + + if (!OA_WRITE_OR_EXECUTE(optd, cr)) { + /* can't negotiate option */ + if (!(OA_MATCHED_PRIV(optd, cr)) && + OA_WX_ANYPRIV(optd)) { + /* + * not privileged but privilege + * will help negotiate option. 
+ */ + return (-TACCES); + } else { + return (-TBADOPT); + } + } + /* + * Verify size for options + * Note: For retaining compatibility with historical + * behavior, variable lengths options will have their + * length verified in the setfn() processing. + * In order to be compatible with SunOS 4.X we return + * EINVAL errors for bad lengths. + */ + if (!(optd->opdes_props & OP_VARLEN)) { + /* fixed length - size must match */ + if (len != optd->opdes_size) { + return (EINVAL); + } + } + } else { + if (check) { + if (!OA_RWX_ANYPRIV(optd)) + /* any of "rwx" permission but not none */ + return (-TBADOPT); + } + /* + * XXX Change the comments. + * + * XXX Since T_CURRENT was not there in TLI and the + * official TLI inspired TPI standard, getsockopt() + * API uses T_CHECK (for T_CURRENT semantics) + * The following fallthru makes sense because of its + * historical use as semantic equivalent to T_CURRENT. + */ + /* FALLTHRU */ + if (!OA_READ_PERMISSION(optd, cr)) { + /* can't read option value */ + if (!(OA_MATCHED_PRIV(optd, cr)) && + OA_R_ANYPRIV(optd)) { + /* + * not privileged but privilege + * will help in reading option value. + */ + return (-TACCES); + } else { + return (-TBADOPT); + } + } + } + if (max_len != NULL) + *max_len = optd->opdes_size; + + /* We liked it. Keep going. */ + return (0); +} diff --git a/usr/src/uts/common/inet/proto_set.h b/usr/src/uts/common/inet/proto_set.h new file mode 100644 index 0000000000..8e714c7c05 --- /dev/null +++ b/usr/src/uts/common/inet/proto_set.h @@ -0,0 +1,58 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _INET_PROTO_SET_H +#define _INET_PROTO_SET_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/types.h> +#include <sys/socket_proto.h> +#include <inet/optcom.h> +#include <inet/ipclassifier.h> + +extern boolean_t proto_set_rx_hiwat(queue_t *, struct conn_s *, size_t); +extern boolean_t proto_set_rx_lowat(queue_t *, struct conn_s *, size_t); +extern boolean_t proto_set_maxpsz(queue_t *, struct conn_s *, size_t); +extern boolean_t proto_set_tx_maxblk(queue_t *, struct conn_s *, + ssize_t); +extern boolean_t proto_set_tx_copyopt(queue_t *, struct conn_s *, int); +extern boolean_t proto_set_tx_wroff(queue_t *, struct conn_s *, size_t); +extern void proto_set_rx_oob_opt(struct conn_s *, boolean_t); + +extern int proto_tlitosyserr(int); +extern int proto_verify_ip_addr(int, const struct sockaddr *, socklen_t); + +extern int proto_opt_check(int, int, int, t_uscalar_t *, opdes_t *, + uint_t, boolean_t, boolean_t, boolean_t, cred_t *); +extern opdes_t *proto_opt_lookup(t_uscalar_t, t_uscalar_t, opdes_t *, uint_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _INET_PROTO_SET_H */ diff --git a/usr/src/uts/common/inet/rawip_impl.h b/usr/src/uts/common/inet/rawip_impl.h index 638cea6c70..f818247b67 100644 --- a/usr/src/uts/common/inet/rawip_impl.h +++ b/usr/src/uts/common/inet/rawip_impl.h @@ -27,8 +27,6 @@ #ifndef _RAWIP_IMPL_H #define _RAWIP_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -44,6 +42,7 @@ extern "C" { #include 
<inet/common.h> #include <inet/ip.h> +#include <inet/optcom.h> /* Named Dispatch Parameter Management Structure */ typedef struct icmpparam_s { @@ -63,7 +62,9 @@ struct icmp_stack { icmpparam_t *is_param_arr; /* ndd variable table */ kstat_t *is_ksp; /* kstats */ mib2_rawip_t is_rawip_mib; /* SNMP fixed size info */ + ldi_ident_t is_ldi_ident; }; + typedef struct icmp_stack icmp_stack_t; /* Internal icmp control structure, one per open stream */ @@ -76,7 +77,7 @@ typedef struct icmp_s { uint_t icmp_state; /* TPI state */ in6_addr_t icmp_v6src; /* Source address of this stream */ in6_addr_t icmp_bound_v6src; /* Explicitely bound to address */ - in6_addr_t icmp_v6dst; /* Connected destination */ + sin6_t icmp_v6dst; /* Connected destination */ /* * IP format that packets transmitted from this struct should use. * Value can be IP4_VERSION or IPV6_VERSION. @@ -87,7 +88,6 @@ typedef struct icmp_s { sa_family_t icmp_family; /* Family from socket() call */ /* Following protected by icmp_rwlock */ - uint32_t icmp_flowinfo; /* Connected flow id and tclass */ uint32_t icmp_max_hdr_len; /* For write offset in stream head */ uint_t icmp_proto; uint_t icmp_ip_snd_options_len; /* Len of IPv4 options */ @@ -144,6 +144,15 @@ typedef struct icmp_s { uint_t icmp_label_len_v6; /* sec. 
part of sticky opt */ in6_addr_t icmp_v6lastdst; /* most recent destination */ icmp_stack_t *icmp_is; /* Stack instance */ + size_t icmp_xmit_hiwat; + size_t icmp_xmit_lowat; + size_t icmp_recv_hiwat; + size_t icmp_recv_lowat; + int icmp_delayed_error; + kmutex_t icmp_recv_lock; + mblk_t *icmp_fallback_queue_head; + mblk_t *icmp_fallback_queue_tail; + struct sockaddr_storage icmp_delayed_addr; } icmp_t; /* @@ -155,10 +164,16 @@ extern optdb_obj_t icmp_opt_obj; extern uint_t icmp_max_optsize; extern mblk_t *icmp_snmp_get(queue_t *q, mblk_t *mpctl); -extern void rawip_resume_bind(conn_t *, mblk_t *); -extern void icmp_ddi_init(void); -extern void icmp_ddi_destroy(void); +extern void icmp_ddi_g_init(void); +extern void icmp_ddi_g_destroy(void); + +extern sock_lower_handle_t rawip_create(int, int, int, sock_downcalls_t **, + uint_t *, int *, int, cred_t *); +extern void rawip_fallback(sock_lower_handle_t, queue_t *, boolean_t, + so_proto_quiesced_cb_t); + +extern sock_downcalls_t sock_rawip_downcalls; #endif /* _KERNEL */ diff --git a/usr/src/uts/common/inet/rts_impl.h b/usr/src/uts/common/inet/rts_impl.h index f89d1ec82c..de7cd8970b 100644 --- a/usr/src/uts/common/inet/rts_impl.h +++ b/usr/src/uts/common/inet/rts_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. 
*/ @@ -27,8 +27,6 @@ #ifndef _RTS_IMPL_H #define _RTS_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -44,6 +42,7 @@ extern "C" { #include <inet/common.h> #include <inet/ip.h> +#include <inet/optcom.h> /* Named Dispatch Parameter Management Structure */ typedef struct rtsparam_s { @@ -61,6 +60,8 @@ struct rts_stack { caddr_t rtss_g_nd; rtsparam_t *rtss_params; + + ldi_ident_t rtss_ldi_ident; }; typedef struct rts_stack rts_stack_t; @@ -84,10 +85,25 @@ typedef struct rts_s { /* Written to only once at the time of opening the endpoint */ conn_t *rts_connp; + + /* Outbound flow control */ + size_t rts_xmit_hiwat; + size_t rts_xmit_lowat; + + /* Inbound flow control */ + size_t rts_recv_hiwat; + size_t rts_recv_lowat; + + kmutex_t rts_send_mutex; + kmutex_t rts_recv_mutex; + kcondvar_t rts_send_cv; + kcondvar_t rts_io_cv; } rts_t; #define RTS_WPUT_PENDING 0x1 /* Waiting for write-side to complete */ +#define RTS_REQ_PENDING 0x1 /* For direct sockets */ #define RTS_WRW_PENDING 0x2 /* Routing socket write in progress */ +#define RTS_REQ_INPROG 0x2 /* For direct sockets */ /* * Object to represent database of options to search passed to @@ -98,8 +114,19 @@ typedef struct rts_s { extern optdb_obj_t rts_opt_obj; extern uint_t rts_max_optsize; -extern void rts_ddi_init(void); -extern void rts_ddi_destroy(void); +extern void rts_ddi_g_init(void); +extern void rts_ddi_g_destroy(void); + +extern int rts_tpi_opt_get(queue_t *, t_scalar_t, t_scalar_t, uchar_t *); +extern int rts_tpi_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *, + uint_t *, uchar_t *, void *, cred_t *, mblk_t *); +extern int rts_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, + uchar_t *ptr); + +extern sock_lower_handle_t rts_create(int, int, int, sock_downcalls_t **, + uint_t *, int *, int, cred_t *); + +extern sock_downcalls_t sock_rts_downcalls; #endif /* _KERNEL */ diff --git a/usr/src/uts/common/inet/sctp/sctp.c b/usr/src/uts/common/inet/sctp/sctp.c 
index f76612f04f..1dc96a687b 100644 --- a/usr/src/uts/common/inet/sctp/sctp.c +++ b/usr/src/uts/common/inet/sctp/sctp.c @@ -279,13 +279,13 @@ sctp_clean_death(sctp_t *sctp, int err) if (sctp->sctp_xmit_head || sctp->sctp_xmit_unsent) { sctp_regift_xmitlist(sctp); } - if (sctp->sctp_ulp_disconnected(sctp->sctp_ulpd, err)) { + if (sctp->sctp_ulp_disconnected(sctp->sctp_ulpd, 0, err)) { /* * Socket is gone, detach. */ sctp->sctp_detached = B_TRUE; sctp->sctp_ulpd = NULL; - bzero(&sctp->sctp_upcalls, sizeof (sctp_upcalls_t)); + sctp->sctp_upcalls = NULL; } } @@ -447,7 +447,7 @@ sctp_close(sctp_t *sctp) RUN_SCTP(sctp); sctp->sctp_detached = 1; sctp->sctp_ulpd = NULL; - bzero(&sctp->sctp_upcalls, sizeof (sctp_upcalls_t)); + sctp->sctp_upcalls = NULL; bzero(&sctp->sctp_events, sizeof (sctp->sctp_events)); /* If the graceful shutdown has not been completed, just return. */ @@ -1341,8 +1341,8 @@ sctp_icmp_error_ipv6(sctp_t *sctp, mblk_t *mp) * If parent pointer is passed in, inherit settings from it. 
*/ sctp_t * -sctp_create(void *sctp_ulpd, sctp_t *parent, int family, int flags, - const sctp_upcalls_t *sctp_upcalls, sctp_sockbuf_limits_t *sbl, +sctp_create(void *ulpd, sctp_t *parent, int family, int flags, + sock_upcalls_t *upcalls, sctp_sockbuf_limits_t *sbl, cred_t *credp) { sctp_t *sctp, *psctp; @@ -1507,12 +1507,11 @@ sctp_create(void *sctp_ulpd, sctp_t *parent, int family, int flags, sctp->sctp_adv_pap = sctp->sctp_lastack_rxd; /* Information required by upper layer */ - if (sctp_ulpd != NULL) { - sctp->sctp_ulpd = sctp_ulpd; + if (ulpd != NULL) { + sctp->sctp_ulpd = ulpd; - ASSERT(sctp_upcalls != NULL); - bcopy(sctp_upcalls, &sctp->sctp_upcalls, - sizeof (sctp_upcalls_t)); + ASSERT(upcalls != NULL); + sctp->sctp_upcalls = upcalls; ASSERT(sbl != NULL); /* Fill in the socket buffer limits for sctpsockfs */ sbl->sbl_txlowat = sctp->sctp_xmit_lowater; @@ -1520,8 +1519,8 @@ sctp_create(void *sctp_ulpd, sctp_t *parent, int family, int flags, sbl->sbl_rxbuf = sctp->sctp_rwnd; sbl->sbl_rxlowat = SCTP_RECV_LOWATER; } - /* If no sctp_ulpd, must be creating the default sctp */ - ASSERT(sctp_ulpd != NULL || sctps->sctps_gsctp == NULL); + /* If no ulpd, must be creating the default sctp */ + ASSERT(ulpd != NULL || sctps->sctps_gsctp == NULL); /* Insert this in the global list. */ SCTP_LINK(sctp, sctps); diff --git a/usr/src/uts/common/inet/sctp/sctp_bind.c b/usr/src/uts/common/inet/sctp/sctp_bind.c index 2091d91ab5..dfb70fc202 100644 --- a/usr/src/uts/common/inet/sctp/sctp_bind.c +++ b/usr/src/uts/common/inet/sctp/sctp_bind.c @@ -24,8 +24,6 @@ * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/systm.h> #include <sys/stream.h> @@ -174,12 +172,16 @@ sctp_bind(sctp_t *sctp, struct sockaddr *sa, socklen_t len) int err = 0; ASSERT(sctp != NULL); - ASSERT(sa); RUN_SCTP(sctp); - if (sctp->sctp_state > SCTPS_BOUND || - (sctp->sctp_connp->conn_state_flags & CONN_CLOSING)) { + if ((sctp->sctp_state >= SCTPS_BOUND) || + (sctp->sctp_connp->conn_state_flags & CONN_CLOSING) || + (sa == NULL || len == 0)) { + /* + * Multiple binds not allowed for any SCTP socket + * Also binding with null address is not supported. + */ err = EINVAL; goto done; } diff --git a/usr/src/uts/common/inet/sctp/sctp_common.c b/usr/src/uts/common/inet/sctp/sctp_common.c index 548a326806..10aff2af34 100644 --- a/usr/src/uts/common/inet/sctp/sctp_common.c +++ b/usr/src/uts/common/inet/sctp/sctp_common.c @@ -398,6 +398,8 @@ void sctp_set_ulp_prop(sctp_t *sctp) { int hdrlen; + struct sock_proto_props sopp; + sctp_stack_t *sctps = sctp->sctp_sctps; if (sctp->sctp_current->isv4) { @@ -408,9 +410,12 @@ sctp_set_ulp_prop(sctp_t *sctp) ASSERT(sctp->sctp_ulpd); ASSERT(sctp->sctp_current->sfa_pmss == sctp->sctp_mss); - sctp->sctp_ulp_prop(sctp->sctp_ulpd, - sctps->sctps_wroff_xtra + hdrlen + sizeof (sctp_data_hdr_t), - sctp->sctp_mss - sizeof (sctp_data_hdr_t)); + bzero(&sopp, sizeof (sopp)); + sopp.sopp_flags = SOCKOPT_MAXBLK|SOCKOPT_WROFF; + sopp.sopp_wroff = sctps->sctps_wroff_xtra + hdrlen + + sizeof (sctp_data_hdr_t); + sopp.sopp_maxblk = sctp->sctp_mss - sizeof (sctp_data_hdr_t); + sctp->sctp_ulp_prop(sctp->sctp_ulpd, &sopp); } void diff --git a/usr/src/uts/common/inet/sctp/sctp_conn.c b/usr/src/uts/common/inet/sctp/sctp_conn.c index 716abc13bc..b4a9b56fdd 100644 --- a/usr/src/uts/common/inet/sctp/sctp_conn.c +++ b/usr/src/uts/common/inet/sctp/sctp_conn.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 
* Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/systm.h> #include <sys/stream.h> @@ -152,8 +150,11 @@ sctp_accept_comm(sctp_t *listener, sctp_t *acceptor, mblk_t *cr_pkt, acceptor->sctp_rwnd = listener->sctp_rwnd; acceptor->sctp_irwnd = acceptor->sctp_rwnd; acceptor->sctp_pd_point = acceptor->sctp_rwnd; + acceptor->sctp_upcalls = listener->sctp_upcalls; +#if 0 bcopy(&listener->sctp_upcalls, &acceptor->sctp_upcalls, sizeof (sctp_upcalls_t)); +#endif return (0); } @@ -169,6 +170,7 @@ sctp_conn_request(sctp_t *sctp, mblk_t *mp, uint_t ifindex, uint_t ip_hdr_len, int err; conn_t *connp, *econnp; sctp_stack_t *sctps; + struct sock_proto_props sopp; /* * No need to check for duplicate as this is the listener @@ -292,22 +294,25 @@ sctp_conn_request(sctp_t *sctp, mblk_t *mp, uint_t ifindex, uint_t ip_hdr_len, /* Connection established, so send up the conn_ind */ if ((eager->sctp_ulpd = sctp->sctp_ulp_newconn(sctp->sctp_ulpd, - eager)) == NULL) { + (sock_lower_handle_t)eager, NULL, NULL, 0, + &eager->sctp_upcalls)) == NULL) { sctp_close_eager(eager); BUMP_MIB(&sctps->sctps_mib, sctpListenDrop); return (NULL); } ASSERT(SCTP_IS_DETACHED(eager)); eager->sctp_detached = B_FALSE; + bzero(&sopp, sizeof (sopp)); + sopp.sopp_flags = SOCKOPT_MAXBLK|SOCKOPT_WROFF; + sopp.sopp_maxblk = strmsgsz; if (eager->sctp_family == AF_INET) { - eager->sctp_ulp_prop(eager->sctp_ulpd, - sctps->sctps_wroff_xtra + sizeof (sctp_data_hdr_t) + - sctp->sctp_hdr_len, strmsgsz); + sopp.sopp_wroff = sctps->sctps_wroff_xtra + + sizeof (sctp_data_hdr_t) + sctp->sctp_hdr_len; } else { - eager->sctp_ulp_prop(eager->sctp_ulpd, - sctps->sctps_wroff_xtra + sizeof (sctp_data_hdr_t) + - sctp->sctp_hdr6_len, strmsgsz); + sopp.sopp_wroff = sctps->sctps_wroff_xtra + + sizeof (sctp_data_hdr_t) + sctp->sctp_hdr6_len; } + eager->sctp_ulp_prop(eager->sctp_ulpd, &sopp); return (eager); } @@ -333,6 +338,7 @@ sctp_connect(sctp_t *sctp, const struct sockaddr 
*dst, uint32_t addrlen) int err; sctp_faddr_t *cur_fp; sctp_stack_t *sctps = sctp->sctp_sctps; + struct sock_proto_props sopp; /* * Determine packet type based on type of address passed in @@ -599,9 +605,11 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen) BUMP_LOCAL(sctp->sctp_opkts); notify_ulp: - sctp->sctp_ulp_prop(sctp->sctp_ulpd, - sctps->sctps_wroff_xtra + hdrlen + sizeof (sctp_data_hdr_t), - 0); + bzero(&sopp, sizeof (sopp)); + sopp.sopp_flags = SOCKOPT_WROFF; + sopp.sopp_wroff = sctps->sctps_wroff_xtra + hdrlen + + sizeof (sctp_data_hdr_t); + sctp->sctp_ulp_prop(sctp->sctp_ulpd, &sopp); return (0); default: diff --git a/usr/src/uts/common/inet/sctp/sctp_cookie.c b/usr/src/uts/common/inet/sctp/sctp_cookie.c index 93184bcd27..e089a901d3 100644 --- a/usr/src/uts/common/inet/sctp/sctp_cookie.c +++ b/usr/src/uts/common/inet/sctp/sctp_cookie.c @@ -1049,10 +1049,8 @@ sctp_send_cookie_echo(sctp_t *sctp, sctp_chunk_hdr_t *iackch, mblk_t *iackmp) * unsent, since there won't be any sent-unack'ed * here. */ - if (!SCTP_IS_DETACHED(sctp)) { - sctp->sctp_ulp_xmitted(sctp->sctp_ulpd, - sctp->sctp_unsent); - } + if (!SCTP_IS_DETACHED(sctp)) + SCTP_TXQ_UPDATE(sctp); } if (sctp->sctp_xmit_unsent == NULL) sctp->sctp_xmit_unsent_tail = NULL; diff --git a/usr/src/uts/common/inet/sctp/sctp_impl.h b/usr/src/uts/common/inet/sctp/sctp_impl.h index 5f41226bf3..089edc3835 100644 --- a/usr/src/uts/common/inet/sctp/sctp_impl.h +++ b/usr/src/uts/common/inet/sctp/sctp_impl.h @@ -608,16 +608,16 @@ typedef struct sctp_s { kcondvar_t sctp_cv; boolean_t sctp_running; - void *sctp_ulpd; /* SCTP upper layer desc. 
*/ +#define sctp_ulpd sctp_connp->conn_upper_handle +#define sctp_upcalls sctp_connp->conn_upcalls - struct sctp_upcalls_s sctp_upcalls; /* upcalls for sctp_ulpd */ -#define sctp_ulp_newconn sctp_upcalls.su_newconn -#define sctp_ulp_connected sctp_upcalls.su_connected -#define sctp_ulp_disconnected sctp_upcalls.su_disconnected -#define sctp_ulp_disconnecting sctp_upcalls.su_disconnecting -#define sctp_ulp_recv sctp_upcalls.su_recv -#define sctp_ulp_xmitted sctp_upcalls.su_xmitted -#define sctp_ulp_prop sctp_upcalls.su_properties +#define sctp_ulp_newconn sctp_upcalls->su_newconn +#define sctp_ulp_connected sctp_upcalls->su_connected +#define sctp_ulp_disconnected sctp_upcalls->su_disconnected +#define sctp_ulp_opctl sctp_upcalls->su_opctl +#define sctp_ulp_recv sctp_upcalls->su_recv +#define sctp_ulp_xmitted sctp_upcalls->su_txq_full +#define sctp_ulp_prop sctp_upcalls->su_set_proto_props int32_t sctp_state; @@ -768,8 +768,9 @@ typedef struct sctp_s { sctp_rexmitting : 1, /* SCTP is retransmitting */ sctp_zero_win_probe : 1, /* doing zero win probe */ + sctp_txq_full : 1, /* the tx queue is full */ sctp_ulp_discon_done : 1, /* ulp_disconnecting done */ - sctp_dummy : 7; + sctp_dummy : 6; } sctp_bits; struct { uint32_t @@ -809,6 +810,7 @@ typedef struct sctp_s { #define sctp_linklocal sctp_bits.sctp_linklocal #define sctp_rexmitting sctp_bits.sctp_rexmitting #define sctp_zero_win_probe sctp_bits.sctp_zero_win_probe +#define sctp_txq_full sctp_bits.sctp_txq_full #define sctp_ulp_discon_done sctp_bits.sctp_ulp_discon_done #define sctp_recvsndrcvinfo sctp_events.sctp_recvsndrcvinfo @@ -935,6 +937,15 @@ typedef struct sctp_s { uint32_t sctp_err_len; /* Total error chunks length */ } sctp_t; +#define SCTP_TXQ_LEN(sctp) ((sctp)->sctp_unsent + (sctp)->sctp_unacked) +#define SCTP_TXQ_UPDATE(sctp) \ + if ((sctp)->sctp_txq_full && SCTP_TXQ_LEN(sctp) <= \ + (sctp)->sctp_xmit_lowater) { \ + (sctp)->sctp_txq_full = 0; \ + (sctp)->sctp_ulp_xmitted((sctp)->sctp_ulpd, \ + B_FALSE); 
\ + } + #endif /* (defined(_KERNEL) || defined(_KMEMUSER)) */ extern void sctp_ack_timer(sctp_t *); diff --git a/usr/src/uts/common/inet/sctp/sctp_input.c b/usr/src/uts/common/inet/sctp/sctp_input.c index 71a85ad04e..87c79eedff 100644 --- a/usr/src/uts/common/inet/sctp/sctp_input.c +++ b/usr/src/uts/common/inet/sctp/sctp_input.c @@ -24,8 +24,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/systm.h> #include <sys/stream.h> @@ -1192,6 +1190,7 @@ sctp_data_chunk(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *mp, mblk_t **dups, boolean_t tpfinished = B_TRUE; int32_t new_rwnd; sctp_stack_t *sctps = sctp->sctp_sctps; + int error; /* The following are used multiple times, so we inline them */ #define SCTP_ACK_IT(sctp, tsn) \ @@ -1292,8 +1291,8 @@ sctp_data_chunk(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *mp, mblk_t **dups, oftsn = sctp->sctp_ftsn; if (isfrag) { - int error = 0; + error = 0; /* fragmented data chunk */ dmp->b_rptr = (uchar_t *)dc; if (ubit) { @@ -1408,13 +1407,18 @@ sctp_data_chunk(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *mp, mblk_t **dups, sctp->sctp_rxqueued -= dlen; if (can_deliver) { + dmp->b_rptr = (uchar_t *)(dc + 1); if (sctp_input_add_ancillary(sctp, &dmp, dc, fp, ipp) == 0) { dprint(1, ("sctp_data_chunk: delivering %lu bytes\n", msgdsize(dmp))); sctp->sctp_rwnd -= dlen; + /* + * Override b_flag for SCTP sockfs internal use + */ + dmp->b_flag = tpfinished ? 0 : SCTP_PARTIAL_DATA; new_rwnd = sctp->sctp_ulp_recv(sctp->sctp_ulpd, dmp, - tpfinished ? 0 : SCTP_PARTIAL_DATA); + msgdsize(dmp), 0, &error, NULL); if (new_rwnd > sctp->sctp_rwnd) { sctp->sctp_rwnd = new_rwnd; } @@ -1492,8 +1496,13 @@ sctp_data_chunk(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *mp, mblk_t **dups, dprint(1, ("sctp_data_chunk: delivering %lu " "bytes\n", msgdsize(dmp))); sctp->sctp_rwnd -= dlen; + /* + * Override b_flag for SCTP sockfs internal use + */ + dmp->b_flag = tpfinished ? 
+ 0 : SCTP_PARTIAL_DATA; new_rwnd = sctp->sctp_ulp_recv(sctp->sctp_ulpd, - dmp, tpfinished ? 0 : SCTP_PARTIAL_DATA); + dmp, msgdsize(dmp), 0, &error, NULL); if (new_rwnd > sctp->sctp_rwnd) { sctp->sctp_rwnd = new_rwnd; } @@ -1806,10 +1815,8 @@ sctp_check_abandoned_msg(sctp_t *sctp, mblk_t *meta) * Update ULP the amount of queued data, which is * sent-unack'ed + unsent. */ - if (!SCTP_IS_DETACHED(sctp)) { - sctp->sctp_ulp_xmitted(sctp->sctp_ulpd, - sctp->sctp_unacked + sctp->sctp_unsent); - } + if (!SCTP_IS_DETACHED(sctp)) + SCTP_TXQ_UPDATE(sctp); return (0); } return (-1); @@ -1922,10 +1929,8 @@ cum_ack_done: * Update ULP the amount of queued data, which is * sent-unack'ed + unsent. */ - if (!SCTP_IS_DETACHED(sctp)) { - sctp->sctp_ulp_xmitted(sctp->sctp_ulpd, - sctp->sctp_unacked + sctp->sctp_unsent); - } + if (!SCTP_IS_DETACHED(sctp)) + SCTP_TXQ_UPDATE(sctp); /* Time to send a shutdown? */ if (sctp->sctp_state == SCTPS_SHUTDOWN_PENDING) { @@ -2141,6 +2146,7 @@ sctp_process_forward_tsn(sctp_t *sctp, sctp_chunk_hdr_t *ch, sctp_faddr_t *fp, } if (can_deliver) { int32_t nrwnd; + int error; dmp->b_rptr = (uchar_t *)(dc + 1); dmp->b_next = NULL; @@ -2149,8 +2155,15 @@ sctp_process_forward_tsn(sctp_t *sctp, sctp_chunk_hdr_t *ch, sctp_faddr_t *fp, &dmp, dc, fp, ipp) == 0) { sctp->sctp_rxqueued -= dlen; sctp->sctp_rwnd -= dlen; + /* + * Override b_flag for SCTP sockfs + * internal use + */ + + dmp->b_flag = 0; nrwnd = sctp->sctp_ulp_recv( - sctp->sctp_ulpd, dmp, 0); + sctp->sctp_ulpd, dmp, msgdsize(dmp), + 0, &error, NULL); if (nrwnd > sctp->sctp_rwnd) sctp->sctp_rwnd = nrwnd; } else { @@ -3947,7 +3960,7 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) sctp_stop_faddr_timers(sctp); if (!SCTP_IS_DETACHED(sctp)) { sctp->sctp_ulp_connected( - sctp->sctp_ulpd); + sctp->sctp_ulpd, 0, NULL, -1); sctp_set_ulp_prop(sctp); } sctp->sctp_state = SCTPS_ESTABLISHED; @@ -3983,7 +3996,7 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) case CHUNK_COOKIE_ACK: if 
(!SCTP_IS_DETACHED(sctp)) { sctp->sctp_ulp_connected( - sctp->sctp_ulpd); + sctp->sctp_ulpd, 0, NULL, -1); sctp_set_ulp_prop(sctp); } if (sctp->sctp_unacked == 0) @@ -4020,7 +4033,7 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) if (!SCTP_IS_DETACHED(sctp)) { sctp->sctp_ulp_connected( - sctp->sctp_ulpd); + sctp->sctp_ulpd, 0, NULL, -1); sctp_set_ulp_prop(sctp); } if (sctp->sctp_unacked == 0) diff --git a/usr/src/uts/common/inet/sctp/sctp_notify.c b/usr/src/uts/common/inet/sctp/sctp_notify.c index f516154ce6..3ede878954 100644 --- a/usr/src/uts/common/inet/sctp/sctp_notify.c +++ b/usr/src/uts/common/inet/sctp/sctp_notify.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/systm.h> #include <sys/stream.h> @@ -39,9 +37,12 @@ #include <netinet/sctp.h> #include <inet/common.h> +#include <inet/ipclassifier.h> #include <inet/ip.h> + #include "sctp_impl.h" +/* ARGSUSED */ static void sctp_notify(sctp_t *sctp, mblk_t *emp, size_t len) { @@ -49,6 +50,7 @@ sctp_notify(sctp_t *sctp, mblk_t *emp, size_t len) mblk_t *mp; sctp_faddr_t *fp; int32_t rwnd = 0; + int error; if ((mp = allocb(sizeof (*tudi) + sizeof (void *) + sizeof (struct sockaddr_in6), BPRI_HI)) == NULL) { @@ -108,7 +110,13 @@ sctp_notify(sctp_t *sctp, mblk_t *emp, size_t len) ASSERT(len == rwnd); #endif - rwnd = sctp->sctp_ulp_recv(sctp->sctp_ulpd, mp, SCTP_NOTIFICATION); + /* + * Override b_flag for SCTP sockfs internal use + */ + mp->b_flag = (short)SCTP_NOTIFICATION; + + rwnd = sctp->sctp_ulp_recv(sctp->sctp_ulpd, mp, msgdsize(mp), 0, + &error, NULL); if (rwnd > sctp->sctp_rwnd) { sctp->sctp_rwnd = rwnd; } diff --git a/usr/src/uts/common/inet/sctp/sctp_opt_data.c b/usr/src/uts/common/inet/sctp/sctp_opt_data.c index c24c81c01f..b3921cf6ad 100644 --- 
a/usr/src/uts/common/inet/sctp/sctp_opt_data.c +++ b/usr/src/uts/common/inet/sctp/sctp_opt_data.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/stream.h> #define _SUN_TPI_VERSION 2 @@ -1386,8 +1384,11 @@ sctp_set_opt(sctp_t *sctp, int level, int name, const void *invalp, } us = (struct sctp_uc_swap *)invalp; sctp->sctp_ulpd = us->sus_handle; + sctp->sctp_upcalls = us->sus_upcalls; +#if 0 bcopy(us->sus_upcalls, &sctp->sctp_upcalls, sizeof (sctp_upcalls_t)); +#endif break; } case SCTP_PRSCTP: diff --git a/usr/src/uts/common/inet/sctp/sctp_output.c b/usr/src/uts/common/inet/sctp/sctp_output.c index 8065f1dcf1..938573b1be 100644 --- a/usr/src/uts/common/inet/sctp/sctp_output.c +++ b/usr/src/uts/common/inet/sctp/sctp_output.c @@ -288,6 +288,13 @@ sctp_sendmsg(sctp_t *sctp, mblk_t *mp, int flags) } sctp->sctp_unsent += msg_len; BUMP_LOCAL(sctp->sctp_msgcount); + /* + * Notify sockfs if the tx queue is full. + */ + if (SCTP_TXQ_LEN(sctp) >= sctp->sctp_xmit_hiwater) { + sctp->sctp_txq_full = 1; + sctp->sctp_ulp_xmitted(sctp->sctp_ulpd, B_TRUE); + } if (sctp->sctp_state == SCTPS_ESTABLISHED) sctp_output(sctp, UINT_MAX); process_sendq: @@ -366,10 +373,8 @@ nextmsg: * Update ULP the amount of queued data, which is * sent-unack'ed + unsent. */ - if (!SCTP_IS_DETACHED(sctp)) { - sctp->sctp_ulp_xmitted(sctp->sctp_ulpd, - sctp->sctp_unacked + sctp->sctp_unsent); - } + if (!SCTP_IS_DETACHED(sctp)) + SCTP_TXQ_UPDATE(sctp); sctp_sendfail_event(sctp, mdblk, 0, B_FALSE); goto try_next; } @@ -875,10 +880,8 @@ chunkified: * Update ULP the amount of queued data, which is * sent-unack'ed + unsent. 
*/ - if (!SCTP_IS_DETACHED(sctp)) { - sctp->sctp_ulp_xmitted(sctp->sctp_ulpd, - sctp->sctp_unacked + sctp->sctp_unsent); - } + if (!SCTP_IS_DETACHED(sctp)) + SCTP_TXQ_UPDATE(sctp); sctp_sendfail_event(sctp, meta, 0, B_TRUE); next_msg: meta = tmp_meta; @@ -1541,10 +1544,8 @@ ftsn_done: * Update ULP the amount of queued data, which is * sent-unack'ed + unsent. */ - if (!SCTP_IS_DETACHED(sctp)) { - sctp->sctp_ulp_xmitted(sctp->sctp_ulpd, - sctp->sctp_unacked + sctp->sctp_unsent); - } + if (!SCTP_IS_DETACHED(sctp)) + SCTP_TXQ_UPDATE(sctp); } } diff --git a/usr/src/uts/common/inet/sctp/sctp_shutdown.c b/usr/src/uts/common/inet/sctp/sctp_shutdown.c index e8311a018f..b58016eb15 100644 --- a/usr/src/uts/common/inet/sctp/sctp_shutdown.c +++ b/usr/src/uts/common/inet/sctp/sctp_shutdown.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/systm.h> #include <sys/stream.h> @@ -165,7 +163,7 @@ sctp_shutdown_received(sctp_t *sctp, sctp_chunk_hdr_t *sch, boolean_t crwsd, /* Don't allow sending new data */ if (!SCTP_IS_DETACHED(sctp) && !sctp->sctp_ulp_discon_done) { - sctp->sctp_ulp_disconnecting(sctp->sctp_ulpd); + sctp->sctp_ulp_opctl(sctp->sctp_ulpd, SOCK_OPCTL_SHUT_SEND, 0); sctp->sctp_ulp_discon_done = B_TRUE; } diff --git a/usr/src/uts/common/inet/sctp_itf.h b/usr/src/uts/common/inet/sctp_itf.h index 4a94cab233..eb7597ac0a 100644 --- a/usr/src/uts/common/inet/sctp_itf.h +++ b/usr/src/uts/common/inet/sctp_itf.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _INET_SCTP_ITF_H #define _INET_SCTP_ITF_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -54,21 +51,6 @@ extern "C" { #define SCTP_ITF_VER 1 /* - * This struct holds all the upcalls the SCTP kernel module will - * invoke for different events. When calling sctp_create() to create - * a SCTP handle, the caller must provide this information. - */ -typedef struct sctp_upcalls_s { - void * (*su_newconn)(void *parenthandle, void *connind); - void (*su_connected)(void *handle); - int (*su_disconnected)(void *handle, int error); - void (*su_disconnecting)(void *handle); - int (*su_recv)(void *handle, mblk_t *mp, int flags); - void (*su_xmitted)(void *handle, int txqueued); - void (*su_properties)(void *handle, int wroff, size_t maxblk); -} sctp_upcalls_t; - -/* * This struct holds various flow control limits the caller of * sctp_create() should observe when interacting with SCTP. 
*/ @@ -82,9 +64,10 @@ typedef struct sctp_sockbuf_limits_s { /* * Parameter to SCTP_UC_SWAP setsockopt */ +struct sock_upcalls_s; struct sctp_uc_swap { - void *sus_handle; - sctp_upcalls_t *sus_upcalls; + void *sus_handle; + struct sock_upcalls_s *sus_upcalls; }; struct sctp_s; @@ -102,7 +85,7 @@ extern void sctp_close(struct sctp_s *conn); extern int sctp_connect(struct sctp_s *conn, const struct sockaddr *dst, socklen_t addrlen); extern struct sctp_s *sctp_create(void *newhandle, struct sctp_s *parent, - int family, int flags, const sctp_upcalls_t *su, + int family, int flags, struct sock_upcalls_s *su, sctp_sockbuf_limits_t *sbl, cred_t *cr); extern int sctp_disconnect(struct sctp_s *conn); extern int sctp_get_opt(struct sctp_s *conn, int level, int opt, void *opts, diff --git a/usr/src/uts/common/inet/sockmods/sockmod_sctp.c b/usr/src/uts/common/inet/sockmods/sockmod_sctp.c new file mode 100644 index 0000000000..2600cfa181 --- /dev/null +++ b/usr/src/uts/common/inet/sockmods/sockmod_sctp.c @@ -0,0 +1,221 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include <sys/sysmacros.h> +#include <sys/strsubr.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/modctl.h> +#include <sys/cmn_err.h> +#include <netinet/sctp.h> +#include <fs/sockfs/sockcommon.h> +#include "socksctp.h" + +struct sonode *socksctp_create(struct sockparams *, int, int, int, + int, int, int *, cred_t *); +void socksctp_destroy(struct sonode *); + +static int socksctp_constructor(void *, void *, int); +static void socksctp_destructor(void *, void *); + +static __smod_priv_t sosctp_priv = { + socksctp_create, + socksctp_destroy, + NULL +}; + +static smod_reg_t sinfo = { + SOCKMOD_VERSION, + "socksctp", + SOCK_UC_VERSION, + SOCK_DC_VERSION, + NULL, + &sosctp_priv +}; + +kmem_cache_t *sosctp_assoccache; +static kmem_cache_t *sosctp_sockcache; + +/* + * Module linkage information for the kernel. + */ +static struct modlsockmod modlsockmod = { + &mod_sockmodops, "SCTP socket module", &sinfo +}; + +static struct modlinkage modlinkage = { + MODREV_1, + &modlsockmod, + NULL +}; + +static int +socksctp_init(void) +{ + sosctp_sockcache = kmem_cache_create("sctpsock", + sizeof (struct sctp_sonode), 0, socksctp_constructor, + socksctp_destructor, NULL, NULL, NULL, 0); + sosctp_assoccache = kmem_cache_create("sctp_assoc", + sizeof (struct sctp_soassoc), 0, NULL, NULL, NULL, NULL, NULL, 0); + return (0); +} + +static void +socksctp_fini(void) +{ + kmem_cache_destroy(sosctp_sockcache); + kmem_cache_destroy(sosctp_assoccache); +} + +/*ARGSUSED*/ +static int +socksctp_constructor(void *buf, void *cdrarg, int kmflags) +{ + struct sctp_sonode *ss = buf; + struct sonode *so = &ss->ss_so; + + ss->ss_type = SOSCTP_SOCKET; + return (sonode_constructor((void *)so, cdrarg, kmflags)); +} + +/*ARGSUSED*/ +static void +socksctp_destructor(void *buf, void *cdrarg) +{ + struct sctp_sonode *ss = buf; + struct sonode *so = &ss->ss_so; + + sonode_destructor((void *)so, cdrarg); +} + +/* + * Creates a sctp socket data structure. 
+ */ +/* ARGSUSED */ +struct sonode * +socksctp_create(struct sockparams *sp, int family, int type, int protocol, + int version, int sflags, int *errorp, cred_t *cr) +{ + struct sctp_sonode *ss; + struct sonode *so; + int kmflags = (sflags & SOCKET_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; + + if (version == SOV_STREAM) { + *errorp = EINVAL; + return (NULL); + } + + /* + * We only support two types of SCTP socket. Let sotpi_create() + * handle all other cases, such as raw socket. + */ + if (!(family == AF_INET || family == AF_INET6) || + !(type == SOCK_STREAM || type == SOCK_SEQPACKET)) { + *errorp = EINVAL; + return (NULL); + } + + ss = kmem_cache_alloc(sosctp_sockcache, kmflags); + if (ss == NULL) { + *errorp = ENOMEM; + return (NULL); + } + + so = &ss->ss_so; + + ss->ss_maxassoc = 0; + ss->ss_assoccnt = 0; + ss->ss_assocs = NULL; + + if (type == SOCK_STREAM) { + sonode_init(so, sp, family, type, protocol, + &sosctp_sonodeops); + } else { + sonode_init(so, sp, family, type, protocol, + &sosctp_seq_sonodeops); + ASSERT(type == SOCK_SEQPACKET); + mutex_enter(&so->so_lock); + (void) sosctp_aid_grow(ss, 1, kmflags); + mutex_exit(&so->so_lock); + } + + if (version == SOV_DEFAULT) { + version = so_default_version; + } + so->so_version = (short)version; + + dprint(2, ("sosctp_create: %p domain %d type %d\n", (void *)so, family, + type)); + + return (so); +} + +/* + * Free SCTP socket data structure. 
+ */ +void +socksctp_destroy(struct sonode *so) +{ + struct sctp_sonode *ss; + + ASSERT((so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) && + so->so_protocol == IPPROTO_SCTP); + + sosctp_fini(so, CRED()); + + ss = SOTOSSO(so); + kmem_cache_free(sosctp_sockcache, ss); +} + +int +_init(void) +{ + int error = 0; + + (void) socksctp_init(); + + if ((error = mod_install(&modlinkage)) != 0) + socksctp_fini(); + + return (error); +} + +int +_fini(void) +{ + int error = 0; + + if ((error = mod_remove(&modlinkage)) == 0) + socksctp_fini(); + + return (error); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} diff --git a/usr/src/uts/common/inet/sockmods/sockmod_sdp.c b/usr/src/uts/common/inet/sockmods/sockmod_sdp.c new file mode 100644 index 0000000000..f609cbe069 --- /dev/null +++ b/usr/src/uts/common/inet/sockmods/sockmod_sdp.c @@ -0,0 +1,154 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include <sys/sysmacros.h> +#include <sys/strsubr.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/modctl.h> +#include <sys/cmn_err.h> +#include <sys/tihdr.h> +#include <sys/vfs.h> +#include <fs/sockfs/nl7c.h> +#include <inet/kssl/ksslapi.h> +#include <inet/sdp_itf.h> +#include <fs/sockfs/sockcommon.h> +#include "socksdp.h" + +struct sonode *socksdp_create(struct sockparams *, int, int, int, + int, int, int *, cred_t *); +static void socksdp_destroy(struct sonode *); + +static __smod_priv_t sosdp_priv = { + socksdp_create, + socksdp_destroy, + NULL +}; + +static smod_reg_t sinfo = { + SOCKMOD_VERSION, + "socksdp", + SOCK_UC_VERSION, + SOCK_DC_VERSION, + NULL, + &sosdp_priv +}; + +/* + * Module linkage information for the kernel + */ +static struct modlsockmod modlsockmod = { + &mod_sockmodops, "SDP socket module", &sinfo +}; + +static struct modlinkage modlinkage = { + MODREV_1, + &modlsockmod, + NULL +}; + +/* + * Creates a sdp socket data structure. + */ +/* ARGSUSED */ +struct sonode * +socksdp_create(struct sockparams *sp, int family, int type, int protocol, + int version, int sflags, int *errorp, cred_t *cr) +{ + struct sonode *so; + int kmflags = (sflags & SOCKET_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; + + dprint(4, ("Inside sosdp_create: domain:%d proto:%d type:%d", + family, protocol, type)); + + *errorp = 0; + if (is_system_labeled()) { + *errorp = EOPNOTSUPP; + return (NULL); + } + + if (version == SOV_STREAM) { + *errorp = EINVAL; + return (NULL); + } + + /* + * We only support one type of SDP socket. Let sotpi_create() + * handle all other cases, such as raw socket. 
+ */ + if (!(family == AF_INET || family == AF_INET6) || + !(type == SOCK_STREAM)) { + *errorp = EINVAL; + return (NULL); + } + + so = kmem_cache_alloc(socket_cache, kmflags); + if (so == NULL) { + *errorp = ENOMEM; + return (NULL); + } + + sonode_init(so, sp, family, type, protocol, &sosdp_sonodeops); + so->so_pollev |= SO_POLLEV_ALWAYS; + + dprint(2, ("sosdp_create: %p domain %d type %d\n", (void *)so, family, + type)); + + if (version == SOV_DEFAULT) { + version = so_default_version; + } + so->so_version = (short)version; + + return (so); +} + +static void +socksdp_destroy(struct sonode *so) +{ + ASSERT(so->so_ops == &sosdp_sonodeops); + + sosdp_fini(so, CRED()); + + kmem_cache_free(socket_cache, so); +} + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_fini(void) +{ + return (mod_remove(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} diff --git a/usr/src/uts/common/inet/sockmods/socksctp.c b/usr/src/uts/common/inet/sockmods/socksctp.c new file mode 100644 index 0000000000..e013940703 --- /dev/null +++ b/usr/src/uts/common/inet/sockmods/socksctp.c @@ -0,0 +1,2105 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/types.h> +#include <sys/t_lock.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/buf.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/debug.h> +#include <sys/errno.h> +#include <sys/stropts.h> +#include <sys/cmn_err.h> +#include <sys/sysmacros.h> +#include <sys/filio.h> + +#include <sys/project.h> +#include <sys/tihdr.h> +#include <sys/strsubr.h> +#include <sys/esunddi.h> +#include <sys/ddi.h> + +#include <sys/sockio.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/strsun.h> + +#include <netinet/sctp.h> +#include <inet/sctp_itf.h> +#include <fs/sockfs/sockcommon.h> +#include "socksctp.h" + +/* + * SCTP sockfs sonode operations, 1-1 socket + */ +static int sosctp_init(struct sonode *, struct sonode *, struct cred *, int); +static int sosctp_accept(struct sonode *, int, struct cred *, struct sonode **); +static int sosctp_bind(struct sonode *, struct sockaddr *, socklen_t, int, + struct cred *); +static int sosctp_listen(struct sonode *, int, struct cred *); +static int sosctp_connect(struct sonode *, const struct sockaddr *, socklen_t, + int, int, struct cred *); +static int sosctp_recvmsg(struct sonode *, struct nmsghdr *, struct uio *, + struct cred *); +static int sosctp_sendmsg(struct sonode *, struct nmsghdr *, struct uio *, + struct cred *); +static int sosctp_getpeername(struct sonode *, struct sockaddr *, socklen_t *, + boolean_t, struct cred *); +static int sosctp_getsockname(struct sonode *, struct sockaddr *, socklen_t *, + struct cred *); +static int sosctp_shutdown(struct sonode *, int, struct cred *); +static int sosctp_getsockopt(struct 
sonode *, int, int, void *, socklen_t *, + int, struct cred *); +static int sosctp_setsockopt(struct sonode *, int, int, const void *, + socklen_t, struct cred *); +static int sosctp_ioctl(struct sonode *, int, intptr_t, int, struct cred *, + int32_t *); +static int sosctp_close(struct sonode *, int, struct cred *); +void sosctp_fini(struct sonode *, struct cred *); + +/* + * SCTP sockfs sonode operations, 1-N socket + */ +static int sosctp_seq_connect(struct sonode *, const struct sockaddr *, + socklen_t, int, int, struct cred *); +static int sosctp_seq_sendmsg(struct sonode *, struct nmsghdr *, struct uio *, + struct cred *); + +/* + * Socket association upcalls, 1-N socket connection + */ +sock_upper_handle_t sctp_assoc_newconn(sock_upper_handle_t, + sock_lower_handle_t, sock_downcalls_t *, struct cred *, pid_t, + sock_upcalls_t **); +static void sctp_assoc_connected(sock_upper_handle_t, sock_connid_t, + struct cred *, pid_t); +static int sctp_assoc_disconnected(sock_upper_handle_t, sock_connid_t, int); +static void sctp_assoc_disconnecting(sock_upper_handle_t, sock_opctl_action_t, + uintptr_t arg); +static ssize_t sctp_assoc_recv(sock_upper_handle_t, mblk_t *, size_t, int, + int *, boolean_t *); +static void sctp_assoc_xmitted(sock_upper_handle_t, boolean_t); +static void sctp_assoc_properties(sock_upper_handle_t, + struct sock_proto_props *); + +sonodeops_t sosctp_sonodeops = { + sosctp_init, /* sop_init */ + sosctp_accept, /* sop_accept */ + sosctp_bind, /* sop_bind */ + sosctp_listen, /* sop_listen */ + sosctp_connect, /* sop_connect */ + sosctp_recvmsg, /* sop_recvmsg */ + sosctp_sendmsg, /* sop_sendmsg */ + so_sendmblk_notsupp, /* sop_sendmblk */ + sosctp_getpeername, /* sop_getpeername */ + sosctp_getsockname, /* sop_getsockname */ + sosctp_shutdown, /* sop_shutdown */ + sosctp_getsockopt, /* sop_getsockopt */ + sosctp_setsockopt, /* sop_setsockopt */ + sosctp_ioctl, /* sop_ioctl */ + so_poll, /* sop_poll */ + sosctp_close, /* sop_close */ +}; + 
+sonodeops_t sosctp_seq_sonodeops = { + sosctp_init, /* sop_init */ + so_accept_notsupp, /* sop_accept */ + sosctp_bind, /* sop_bind */ + sosctp_listen, /* sop_listen */ + sosctp_seq_connect, /* sop_connect */ + sosctp_recvmsg, /* sop_recvmsg */ + sosctp_seq_sendmsg, /* sop_sendmsg */ + so_sendmblk_notsupp, /* sop_sendmblk */ + so_getpeername_notsupp, /* sop_getpeername */ + sosctp_getsockname, /* sop_getsockname */ + so_shutdown_notsupp, /* sop_shutdown */ + sosctp_getsockopt, /* sop_getsockopt */ + sosctp_setsockopt, /* sop_setsockopt */ + sosctp_ioctl, /* sop_ioctl */ + so_poll, /* sop_poll */ + sosctp_close, /* sop_close */ +}; + +sock_upcalls_t sosctp_sock_upcalls = { + so_newconn, + so_connected, + so_disconnected, + so_opctl, + so_queue_msg, + so_set_prop, + so_txq_full, + NULL, /* su_signal_oob */ +}; + +sock_upcalls_t sosctp_assoc_upcalls = { + sctp_assoc_newconn, + sctp_assoc_connected, + sctp_assoc_disconnected, + sctp_assoc_disconnecting, + sctp_assoc_recv, + sctp_assoc_properties, + sctp_assoc_xmitted, + NULL, /* su_recv_space */ + NULL, /* su_signal_oob */ +}; + +/* ARGSUSED */ +static int +sosctp_init(struct sonode *so, struct sonode *pso, struct cred *cr, int flags) +{ + struct sctp_sonode *ss; + struct sctp_sonode *pss; + sctp_sockbuf_limits_t sbl; + sock_upcalls_t *upcalls; + + ss = SOTOSSO(so); + + if (pso != NULL) { + /* + * Passive open, just inherit settings from parent. We should + * not end up here for SOCK_SEQPACKET type sockets, since no + * new sonode is created in that case. 
+ */ + ASSERT(so->so_type == SOCK_STREAM); + pss = SOTOSSO(pso); + + mutex_enter(&pso->so_lock); + so->so_state |= (SS_ISBOUND | SS_ISCONNECTED | + (pso->so_state & SS_ASYNC)); + sosctp_so_inherit(pss, ss); + so->so_proto_props = pso->so_proto_props; + so->so_mode = pso->so_mode; + mutex_exit(&pso->so_lock); + + return (0); + } + + if (so->so_type == SOCK_STREAM) { + upcalls = &sosctp_sock_upcalls; + so->so_mode = SM_CONNREQUIRED; + } else { + ASSERT(so->so_type == SOCK_SEQPACKET); + upcalls = &sosctp_assoc_upcalls; + } + so->so_proto_handle = (sock_lower_handle_t)sctp_create(so, NULL, + so->so_family, SCTP_CAN_BLOCK, upcalls, &sbl, cr); + if (so->so_proto_handle == NULL) + return (ENOMEM); + + so->so_rcvbuf = sbl.sbl_rxbuf; + so->so_rcvlowat = sbl.sbl_rxlowat; + so->so_sndbuf = sbl.sbl_txbuf; + so->so_sndlowat = sbl.sbl_txlowat; + + return (0); +} + +/* + * Accept incoming connection. + */ +/*ARGSUSED*/ +static int +sosctp_accept(struct sonode *so, int fflag, struct cred *cr, + struct sonode **nsop) +{ + int error = 0; + + if ((so->so_state & SS_ACCEPTCONN) == 0) + return (EINVAL); + + error = so_acceptq_dequeue(so, (fflag & (FNONBLOCK|FNDELAY)), nsop); + + return (error); +} + +/* + * Bind local endpoint. + */ +/*ARGSUSED*/ +static int +sosctp_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen, + int flags, struct cred *cr) +{ + int error; + + if (!(flags & _SOBIND_LOCK_HELD)) { + mutex_enter(&so->so_lock); + so_lock_single(so); /* Set SOLOCKED */ + } else { + ASSERT(MUTEX_HELD(&so->so_lock)); + } + + /* + * X/Open requires this check + */ + if (so->so_state & SS_CANTSENDMORE) { + error = EINVAL; + goto done; + } + + + /* + * Protocol module does address family checks. 
+ */ + mutex_exit(&so->so_lock); + + error = sctp_bind((struct sctp_s *)so->so_proto_handle, name, namelen); + + mutex_enter(&so->so_lock); + if (error == 0) { + so->so_state |= SS_ISBOUND; + } else { + eprintsoline(so, error); + } +done: + if (!(flags & _SOBIND_LOCK_HELD)) { + so_unlock_single(so, SOLOCKED); + mutex_exit(&so->so_lock); + } else { + /* If the caller held the lock don't release it here */ + ASSERT(MUTEX_HELD(&so->so_lock)); + ASSERT(so->so_flag & SOLOCKED); + } + + return (error); +} + +/* + * Turn socket into a listen socket. + */ +/* ARGSUSED */ +static int +sosctp_listen(struct sonode *so, int backlog, struct cred *cr) +{ + int error = 0; + + mutex_enter(&so->so_lock); + so_lock_single(so); + + /* + * If this socket is trying to do connect, or if it has + * been connected, disallow. + */ + if (so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED | + SS_ISDISCONNECTING | SS_CANTRCVMORE | SS_CANTSENDMORE)) { + error = EINVAL; + eprintsoline(so, error); + goto done; + } + + if (backlog < 0) { + backlog = 0; + } + + /* + * If listen() is only called to change backlog, we don't + * need to notify protocol module. + */ + if (so->so_state & SS_ACCEPTCONN) { + so->so_backlog = backlog; + goto done; + } + + mutex_exit(&so->so_lock); + error = sctp_listen((struct sctp_s *)so->so_proto_handle); + mutex_enter(&so->so_lock); + if (error == 0) { + so->so_state |= (SS_ACCEPTCONN|SS_ISBOUND); + so->so_backlog = backlog; + } else { + eprintsoline(so, error); + } +done: + so_unlock_single(so, SOLOCKED); + mutex_exit(&so->so_lock); + + return (error); +} + +/* + * Active open. + */ +/*ARGSUSED*/ +static int +sosctp_connect(struct sonode *so, const struct sockaddr *name, + socklen_t namelen, int fflag, int flags, struct cred *cr) +{ + int error = 0; + + ASSERT(so->so_type == SOCK_STREAM); + + mutex_enter(&so->so_lock); + so_lock_single(so); + + /* + * Can't connect() after listen(), or if the socket is already + * connected. 
+ */ + if (so->so_state & (SS_ACCEPTCONN|SS_ISCONNECTED|SS_ISCONNECTING)) { + if (so->so_state & SS_ISCONNECTED) { + error = EISCONN; + } else if (so->so_state & SS_ISCONNECTING) { + error = EALREADY; + } else { + error = EOPNOTSUPP; + } + eprintsoline(so, error); + goto done; + } + + /* + * Check for failure of an earlier call + */ + if (so->so_error != 0) { + error = sogeterr(so, B_TRUE); + eprintsoline(so, error); + goto done; + } + + /* + * Connection is closing, or closed, don't allow reconnect. + * TCP allows this to proceed, but the socket remains unwriteable. + * BSD returns EINVAL. + */ + if (so->so_state & (SS_ISDISCONNECTING|SS_CANTRCVMORE| + SS_CANTSENDMORE)) { + error = EINVAL; + eprintsoline(so, error); + goto done; + } + + if (name == NULL || namelen == 0) { + mutex_exit(&so->so_lock); + error = EINVAL; + eprintsoline(so, error); + goto done; + } + + soisconnecting(so); + mutex_exit(&so->so_lock); + + error = sctp_connect((struct sctp_s *)so->so_proto_handle, + name, namelen); + + mutex_enter(&so->so_lock); + if (error == 0) { + /* + * Allow other threads to access the socket + */ + error = sowaitconnected(so, fflag, 0); + } +done: + so_unlock_single(so, SOLOCKED); + mutex_exit(&so->so_lock); + return (error); +} + +/* + * Active open for 1-N sockets, create a new association and + * call connect on that. + * If there parent hasn't been bound yet (this is the first association), + * make it so. 
+ */ +static int +sosctp_seq_connect(struct sonode *so, const struct sockaddr *name, + socklen_t namelen, int fflag, int flags, struct cred *cr) +{ + struct sctp_soassoc *ssa; + struct sctp_sonode *ss; + int error; + + ASSERT(so->so_type == SOCK_SEQPACKET); + + mutex_enter(&so->so_lock); + so_lock_single(so); + + if (name == NULL || namelen == 0) { + error = EINVAL; + eprintsoline(so, error); + goto done; + } + + ss = SOTOSSO(so); + + error = sosctp_assoc_createconn(ss, name, namelen, NULL, 0, fflag, + cr, &ssa); + if (error != 0) { + if ((error == EHOSTUNREACH) && (flags & _SOCONNECT_XPG4_2)) { + error = ENETUNREACH; + } + } + if (ssa != NULL) { + SSA_REFRELE(ss, ssa); + } + +done: + so_unlock_single(so, SOLOCKED); + mutex_exit(&so->so_lock); + return (error); +} + +/* + * Receive data. + */ +/* ARGSUSED */ +static int +sosctp_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, + struct cred *cr) +{ + struct sctp_sonode *ss = SOTOSSO(so); + struct sctp_soassoc *ssa = NULL; + int flags, error = 0; + struct T_unitdata_ind *tind; + int len, count, readcnt = 0, rxqueued; + socklen_t controllen, namelen; + void *opt; + mblk_t *mp; + rval_t rval; + + controllen = msg->msg_controllen; + namelen = msg->msg_namelen; + flags = msg->msg_flags; + msg->msg_flags = 0; + msg->msg_controllen = 0; + msg->msg_namelen = 0; + + if (so->so_type == SOCK_STREAM) { + if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING| + SS_CANTRCVMORE))) { + return (ENOTCONN); + } + } else { + /* NOTE: Will come here from vop_read() as well */ + /* For 1-N socket, recv() cannot be used. */ + if (namelen == 0) + return (EOPNOTSUPP); + /* + * If there are no associations, and no new connections are + * coming in, there's not going to be new messages coming + * in either. + */ + if (so->so_rcv_q_head == NULL && ss->ss_assoccnt == 0 && + !(so->so_state & SS_ACCEPTCONN)) { + return (ENOTCONN); + } + } + + /* + * out-of-band data not supported. 
+ */ + if (flags & MSG_OOB) { + return (EOPNOTSUPP); + } + + /* + * flag possibilities: + * + * MSG_PEEK Don't consume data + * MSG_WAITALL Wait for full quantity of data (ignored if MSG_PEEK) + * MSG_DONTWAIT Non-blocking (same as FNDELAY | FNONBLOCK) + * + * MSG_WAITALL can return less than the full buffer if either + * + * 1. we would block and we are non-blocking + * 2. a full message cannot be delivered + * + * Given that we always get a full message from proto below, + * MSG_WAITALL is not meaningful. + */ + + mutex_enter(&so->so_lock); + + /* + * Allow just one reader at a time. + */ + error = so_lock_read_intr(so, + uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0)); + if (error) { + mutex_exit(&so->so_lock); + return (error); + } + mutex_exit(&so->so_lock); +again: + error = so_dequeue_msg(so, &mp, uiop, &rval, flags | MSG_DUPCTRL); + if (mp != NULL) { + if (so->so_type == SOCK_SEQPACKET) { + ssa = *(struct sctp_soassoc **)DB_BASE(mp); + } + + tind = (struct T_unitdata_ind *)mp->b_rptr; + + len = tind->SRC_length; + + if (namelen > 0 && len > 0) { + + opt = sogetoff(mp, tind->SRC_offset, len, 1); + + ASSERT(opt != NULL); + + msg->msg_name = kmem_alloc(len, KM_SLEEP); + msg->msg_namelen = len; + + bcopy(opt, msg->msg_name, len); + } + + len = tind->OPT_length; + if (controllen == 0) { + if (len > 0) { + msg->msg_flags |= MSG_CTRUNC; + } + } else if (len > 0) { + opt = sogetoff(mp, tind->OPT_offset, len, + __TPI_ALIGN_SIZE); + + ASSERT(opt != NULL); + sosctp_pack_cmsg(opt, msg, len); + } + + if (mp->b_flag & SCTP_NOTIFICATION) { + msg->msg_flags |= MSG_NOTIFICATION; + } + + if (!(mp->b_flag & SCTP_PARTIAL_DATA)) + msg->msg_flags |= MSG_EOR; + freemsg(mp); + } +done: + /* + * Determine if we need to update SCTP about the buffer + * space. For performance reason, we cannot update SCTP + * every time a message is read. The socket buffer low + * watermark is used as the threshold. 
+ */ + if (ssa == NULL) { + mutex_enter(&so->so_lock); + rxqueued = so->so_rcv_queued; + + so->so_rcv_queued = rxqueued - readcnt; + count = so->so_rcvbuf - so->so_rcv_queued; + + ASSERT(so->so_rcv_q_head != NULL || + so->so_rcv_head != NULL || + so->so_rcv_queued == 0); + + so_unlock_read(so); + mutex_exit(&so->so_lock); + + if (readcnt > 0 && (((count > 0) && + (rxqueued >= so->so_rcvlowat)) || + (so->so_rcv_queued == 0))) { + /* + * If amount of queued data is higher than watermark, + * updata SCTP's idea of available buffer space. + */ + sctp_recvd((struct sctp_s *)so->so_proto_handle, count); + } + } else { + mutex_enter(&so->so_lock); + rxqueued = ssa->ssa_rcv_queued; + + ssa->ssa_rcv_queued = rxqueued - readcnt; + count = so->so_rcvbuf - ssa->ssa_rcv_queued; + + so_unlock_read(so); + + if (readcnt > 0 && + (((count > 0) && (rxqueued >= so->so_rcvlowat)) || + (ssa->ssa_rcv_queued == 0))) { + /* + * If amount of queued data is higher than watermark, + * updata SCTP's idea of available buffer space. + */ + mutex_exit(&so->so_lock); + + sctp_recvd((struct sctp_s *)ssa->ssa_conn, count); + + mutex_enter(&so->so_lock); + } + /* + * MOREDATA flag is set if all data could not be copied + */ + if (!(flags & MSG_PEEK) && !(rval.r_val1 & MOREDATA)) { + SSA_REFRELE(ss, ssa); + } + mutex_exit(&so->so_lock); + } + + return (error); +} + +int +sosctp_uiomove(mblk_t *hdr_mp, ssize_t count, ssize_t blk_size, int wroff, + struct uio *uiop, int flags, cred_t *cr) +{ + ssize_t size; + int error; + mblk_t *mp; + dblk_t *dp; + + /* + * Loop until we have all data copied into mblk's. + */ + while (count > 0) { + size = MIN(count, blk_size); + + /* + * As a message can be splitted up and sent in different + * packets, each mblk will have the extra space before + * data to accommodate what SCTP wants to put in there. 
+ */ + while ((mp = allocb_cred(size + wroff, cr)) == NULL) { + if ((uiop->uio_fmode & (FNDELAY|FNONBLOCK)) || + (flags & MSG_DONTWAIT)) { + return (EAGAIN); + } + if ((error = strwaitbuf(size + wroff, BPRI_MED))) { + return (error); + } + } + + dp = mp->b_datap; + dp->db_cpid = curproc->p_pid; + ASSERT(wroff <= dp->db_lim - mp->b_wptr); + mp->b_rptr += wroff; + error = uiomove(mp->b_rptr, size, UIO_WRITE, uiop); + if (error != 0) { + freeb(mp); + return (error); + } + mp->b_wptr = mp->b_rptr + size; + count -= size; + hdr_mp->b_cont = mp; + hdr_mp = mp; + } + return (0); +} + +/* + * Send message. + */ +static int +sosctp_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, + struct cred *cr) +{ + struct sctp_sonode *ss = SOTOSSO(so); + mblk_t *mctl; + struct cmsghdr *cmsg; + struct sctp_sndrcvinfo *sinfo; + int optlen, flags, fflag; + ssize_t count, msglen; + int error; + + ASSERT(so->so_type == SOCK_STREAM); + + flags = msg->msg_flags; + if (flags & MSG_OOB) { + /* + * No out-of-band data support. + */ + return (EOPNOTSUPP); + } + + if (msg->msg_controllen != 0) { + optlen = msg->msg_controllen; + cmsg = sosctp_find_cmsg(msg->msg_control, optlen, SCTP_SNDRCV); + if (cmsg != NULL) { + if (cmsg->cmsg_len < + (sizeof (*sinfo) + sizeof (*cmsg))) { + eprintsoline(so, EINVAL); + return (EINVAL); + } + sinfo = (struct sctp_sndrcvinfo *)(cmsg + 1); + + /* Both flags should not be set together. */ + if ((sinfo->sinfo_flags & MSG_EOF) && + (sinfo->sinfo_flags & MSG_ABORT)) { + eprintsoline(so, EINVAL); + return (EINVAL); + } + + /* Initiate a graceful shutdown. */ + if (sinfo->sinfo_flags & MSG_EOF) { + /* Can't include data in MSG_EOF message. */ + if (uiop->uio_resid != 0) { + eprintsoline(so, EINVAL); + return (EINVAL); + } + + /* + * This is the same sequence as done in + * shutdown(SHUT_WR). 
+ */ + mutex_enter(&so->so_lock); + so_lock_single(so); + socantsendmore(so); + cv_broadcast(&so->so_snd_cv); + so->so_state |= SS_ISDISCONNECTING; + mutex_exit(&so->so_lock); + + pollwakeup(&so->so_poll_list, POLLOUT); + sctp_recvd((struct sctp_s *)so->so_proto_handle, + so->so_rcvbuf); + error = sctp_disconnect( + (struct sctp_s *)so->so_proto_handle); + + mutex_enter(&so->so_lock); + so_unlock_single(so, SOLOCKED); + mutex_exit(&so->so_lock); + return (error); + } + } + } else { + optlen = 0; + } + + mutex_enter(&so->so_lock); + for (;;) { + if (so->so_state & SS_CANTSENDMORE) { + mutex_exit(&so->so_lock); + return (EPIPE); + } + + if (so->so_error != 0) { + error = sogeterr(so, B_TRUE); + mutex_exit(&so->so_lock); + return (error); + } + + if (!so->so_snd_qfull) + break; + + if (so->so_state & SS_CLOSING) { + mutex_exit(&so->so_lock); + return (EINTR); + } + /* + * Xmit window full in a blocking socket. + */ + if ((uiop->uio_fmode & (FNDELAY|FNONBLOCK)) || + (flags & MSG_DONTWAIT)) { + mutex_exit(&so->so_lock); + return (EAGAIN); + } else { + /* + * Wait for space to become available and try again. + */ + error = cv_wait_sig(&so->so_snd_cv, &so->so_lock); + if (!error) { /* signal */ + mutex_exit(&so->so_lock); + return (EINTR); + } + } + } + msglen = count = uiop->uio_resid; + + /* Don't allow sending a message larger than the send buffer size. */ + /* XXX Transport module need to enforce this */ + if (msglen > so->so_sndbuf) { + mutex_exit(&so->so_lock); + return (EMSGSIZE); + } + + /* + * Allow piggybacking data on handshake messages (SS_ISCONNECTING). + */ + if (!(so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED))) { + /* + * We need to check here for listener so that the + * same error will be returned as with a TCP socket. + * In this case, sosctp_connect() returns EOPNOTSUPP + * while a TCP socket returns ENOTCONN instead. Catch it + * here to have the same behavior as a TCP socket. 
+ * + * We also need to make sure that the peer address is + * provided before we attempt to do the connect. + */ + if ((so->so_state & SS_ACCEPTCONN) || + msg->msg_name == NULL) { + mutex_exit(&so->so_lock); + error = ENOTCONN; + goto error_nofree; + } + mutex_exit(&so->so_lock); + fflag = uiop->uio_fmode; + if (flags & MSG_DONTWAIT) { + fflag |= FNDELAY; + } + error = sosctp_connect(so, msg->msg_name, msg->msg_namelen, + fflag, (so->so_version == SOV_XPG4_2) * _SOCONNECT_XPG4_2, + cr); + if (error) { + /* + * Check for non-fatal errors, socket connected + * while the lock had been lifted. + */ + if (error != EISCONN && error != EALREADY) { + goto error_nofree; + } + error = 0; + } + } else { + mutex_exit(&so->so_lock); + } + + mctl = sctp_alloc_hdr(msg->msg_name, msg->msg_namelen, + msg->msg_control, optlen, SCTP_CAN_BLOCK); + if (mctl == NULL) { + error = EINTR; + goto error_nofree; + } + + /* Copy in the message. */ + if ((error = sosctp_uiomove(mctl, count, ss->ss_wrsize, ss->ss_wroff, + uiop, flags, cr)) != 0) { + goto error_ret; + } + error = sctp_sendmsg((struct sctp_s *)so->so_proto_handle, mctl, 0); + if (error == 0) + return (0); + +error_ret: + freemsg(mctl); +error_nofree: + mutex_enter(&so->so_lock); + if ((error == EPIPE) && (so->so_state & SS_CANTSENDMORE)) { + /* + * We received shutdown between the time lock was + * lifted and call to sctp_sendmsg(). + */ + mutex_exit(&so->so_lock); + return (EPIPE); + } + mutex_exit(&so->so_lock); + return (error); +} + +/* + * Send message on 1-N socket. Connects automatically if there is + * no association. 
+ */ +static int +sosctp_seq_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, + struct cred *cr) +{ + struct sctp_sonode *ss; + struct sctp_soassoc *ssa; + struct cmsghdr *cmsg; + struct sctp_sndrcvinfo *sinfo; + int aid = 0; + mblk_t *mctl; + int namelen, optlen, flags; + ssize_t count, msglen; + int error; + uint16_t s_flags = 0; + + ASSERT(so->so_type == SOCK_SEQPACKET); + + /* + * There shouldn't be problems with alignment, as the memory for + * msg_control was alloced with kmem_alloc. + */ + cmsg = sosctp_find_cmsg(msg->msg_control, msg->msg_controllen, + SCTP_SNDRCV); + if (cmsg != NULL) { + if (cmsg->cmsg_len < (sizeof (*sinfo) + sizeof (*cmsg))) { + eprintsoline(so, EINVAL); + return (EINVAL); + } + sinfo = (struct sctp_sndrcvinfo *)(cmsg + 1); + s_flags = sinfo->sinfo_flags; + aid = sinfo->sinfo_assoc_id; + } + + ss = SOTOSSO(so); + namelen = msg->msg_namelen; + + if (msg->msg_controllen > 0) { + optlen = msg->msg_controllen; + } else { + optlen = 0; + } + + mutex_enter(&so->so_lock); + + /* + * If there is no association id, connect to address specified + * in msg_name. Otherwise look up the association using the id. + */ + if (aid == 0) { + /* + * Connect and shutdown cannot be done together, so check for + * MSG_EOF. + */ + if (msg->msg_name == NULL || namelen == 0 || + (s_flags & MSG_EOF)) { + error = EINVAL; + eprintsoline(so, error); + goto done; + } + flags = uiop->uio_fmode; + if (msg->msg_flags & MSG_DONTWAIT) { + flags |= FNDELAY; + } + so_lock_single(so); + error = sosctp_assoc_createconn(ss, msg->msg_name, namelen, + msg->msg_control, optlen, flags, cr, &ssa); + if (error) { + if ((so->so_version == SOV_XPG4_2) && + (error == EHOSTUNREACH)) { + error = ENETUNREACH; + } + if (ssa == NULL) { + /* + * Fatal error during connect(). Bail out. + * If ssa exists, it means that the handshake + * is in progress. 
+ */ + eprintsoline(so, error); + so_unlock_single(so, SOLOCKED); + goto done; + } + /* + * All the errors are non-fatal ones, don't return + * e.g. EINPROGRESS from sendmsg(). + */ + error = 0; + } + so_unlock_single(so, SOLOCKED); + } else { + if ((error = sosctp_assoc(ss, aid, &ssa)) != 0) { + eprintsoline(so, error); + goto done; + } + } + + /* + * Now we have an association. + */ + flags = msg->msg_flags; + + /* + * MSG_EOF initiates graceful shutdown. + */ + if (s_flags & MSG_EOF) { + if (uiop->uio_resid) { + /* + * Can't include data in MSG_EOF message. + */ + error = EINVAL; + } else { + mutex_exit(&so->so_lock); + ssa->ssa_state |= SS_ISDISCONNECTING; + sctp_recvd((struct sctp_s *)ssa->ssa_conn, + so->so_rcvbuf); + error = sctp_disconnect((struct sctp_s *)ssa->ssa_conn); + mutex_enter(&so->so_lock); + } + goto refrele; + } + + for (;;) { + if (ssa->ssa_state & SS_CANTSENDMORE) { + SSA_REFRELE(ss, ssa); + mutex_exit(&so->so_lock); + return (EPIPE); + } + if (ssa->ssa_error != 0) { + error = ssa->ssa_error; + ssa->ssa_error = 0; + goto refrele; + } + + if (!ssa->ssa_snd_qfull) + break; + + if (so->so_state & SS_CLOSING) { + error = EINTR; + goto refrele; + } + if ((uiop->uio_fmode & (FNDELAY|FNONBLOCK)) || + (flags & MSG_DONTWAIT)) { + error = EAGAIN; + goto refrele; + } else { + /* + * Wait for space to become available and try again. + */ + error = cv_wait_sig(&so->so_snd_cv, &so->so_lock); + if (!error) { /* signal */ + error = EINTR; + goto refrele; + } + } + } + + msglen = count = uiop->uio_resid; + + /* Don't allow sending a message larger than the send buffer size. */ + if (msglen > so->so_sndbuf) { + error = EMSGSIZE; + goto refrele; + } + + /* + * Update TX buffer usage here so that we can lift the socket lock. + */ + mutex_exit(&so->so_lock); + + mctl = sctp_alloc_hdr(msg->msg_name, namelen, msg->msg_control, + optlen, SCTP_CAN_BLOCK); + if (mctl == NULL) { + error = EINTR; + goto lock_rele; + } + + /* Copy in the message. 
*/ + if ((error = sosctp_uiomove(mctl, count, ssa->ssa_wrsize, + ssa->ssa_wroff, uiop, flags, cr)) != 0) { + goto lock_rele; + } + error = sctp_sendmsg((struct sctp_s *)ssa->ssa_conn, mctl, 0); +lock_rele: + mutex_enter(&so->so_lock); + if (error != 0) { + freemsg(mctl); + if ((error == EPIPE) && (ssa->ssa_state & SS_CANTSENDMORE)) { + /* + * We received shutdown between the time lock was + * lifted and call to sctp_sendmsg(). + */ + SSA_REFRELE(ss, ssa); + mutex_exit(&so->so_lock); + return (EPIPE); + } + } + +refrele: + SSA_REFRELE(ss, ssa); +done: + mutex_exit(&so->so_lock); + return (error); +} + +/* + * Get address of remote node. + */ +/* ARGSUSED */ +static int +sosctp_getpeername(struct sonode *so, struct sockaddr *addr, socklen_t *addrlen, + boolean_t accept, struct cred *cr) +{ + return (sctp_getpeername((struct sctp_s *)so->so_proto_handle, addr, + addrlen)); +} + +/* + * Get local address. + */ +/* ARGSUSED */ +static int +sosctp_getsockname(struct sonode *so, struct sockaddr *addr, socklen_t *addrlen, + struct cred *cr) +{ + return (sctp_getsockname((struct sctp_s *)so->so_proto_handle, addr, + addrlen)); +} + +/* + * Called from shutdown(). + */ +/* ARGSUSED */ +static int +sosctp_shutdown(struct sonode *so, int how, struct cred *cr) +{ + uint_t state_change; + int wakesig = 0; + int error = 0; + + mutex_enter(&so->so_lock); + /* + * Record the current state and then perform any state changes. + * Then use the difference between the old and new states to + * determine which needs to be done. 
+ */ + state_change = so->so_state; + + switch (how) { + case SHUT_RD: + socantrcvmore(so); + break; + case SHUT_WR: + socantsendmore(so); + break; + case SHUT_RDWR: + socantsendmore(so); + socantrcvmore(so); + break; + default: + mutex_exit(&so->so_lock); + return (EINVAL); + } + + state_change = so->so_state & ~state_change; + + if (state_change & SS_CANTRCVMORE) { + if (so->so_rcv_q_head == NULL) { + cv_signal(&so->so_rcv_cv); + } + wakesig = POLLIN|POLLRDNORM; + + socket_sendsig(so, SOCKETSIG_READ); + } + if (state_change & SS_CANTSENDMORE) { + cv_broadcast(&so->so_snd_cv); + wakesig |= POLLOUT; + + so->so_state |= SS_ISDISCONNECTING; + } + mutex_exit(&so->so_lock); + + pollwakeup(&so->so_poll_list, wakesig); + + if (state_change & SS_CANTSENDMORE) { + sctp_recvd((struct sctp_s *)so->so_proto_handle, so->so_rcvbuf); + error = sctp_disconnect((struct sctp_s *)so->so_proto_handle); + } + + /* + * HACK: sctp_disconnect() may return EWOULDBLOCK. But this error is + * not documented in standard socket API. Catch it here. + */ + if (error == EWOULDBLOCK) + error = 0; + return (error); +} + +/* + * Get socket options. + */ +/*ARGSUSED5*/ +static int +sosctp_getsockopt(struct sonode *so, int level, int option_name, + void *optval, socklen_t *optlenp, int flags, struct cred *cr) +{ + if (level == IPPROTO_SCTP) { + /* + * Should go through ioctl(). + */ + return (EINVAL); + } + return (sctp_get_opt((struct sctp_s *)so->so_proto_handle, level, + option_name, optval, optlenp)); +} + +/* + * Set socket options + */ +/* ARGSUSED */ +static int +sosctp_setsockopt(struct sonode *so, int level, int option_name, + const void *optval, t_uscalar_t optlen, struct cred *cr) +{ + struct sctp_sonode *ss = SOTOSSO(so); + struct sctp_soassoc *ssa = NULL; + sctp_assoc_t id; + int error, rc; + void *conn = NULL; + + mutex_enter(&so->so_lock); + + /* + * For some SCTP level options, one can select the association this + * applies to. 
+ */ + if (so->so_type == SOCK_STREAM) { + conn = so->so_proto_handle; + } else { + /* + * SOCK_SEQPACKET only + */ + id = 0; + if (level == IPPROTO_SCTP) { + switch (option_name) { + case SCTP_RTOINFO: + case SCTP_ASSOCINFO: + case SCTP_SET_PEER_PRIMARY_ADDR: + case SCTP_PRIMARY_ADDR: + case SCTP_PEER_ADDR_PARAMS: + /* + * Association ID is the first element + * params struct + */ + if (optlen < sizeof (sctp_assoc_t)) { + error = EINVAL; + eprintsoline(so, error); + goto done; + } + id = *(sctp_assoc_t *)optval; + break; + case SCTP_DEFAULT_SEND_PARAM: + if (optlen != sizeof (struct sctp_sndrcvinfo)) { + error = EINVAL; + eprintsoline(so, error); + goto done; + } + id = ((struct sctp_sndrcvinfo *) + optval)->sinfo_assoc_id; + break; + case SCTP_INITMSG: + /* + * Only applies to future associations + */ + conn = so->so_proto_handle; + break; + default: + break; + } + } else if (level == SOL_SOCKET) { + if (option_name == SO_LINGER) { + error = EOPNOTSUPP; + eprintsoline(so, error); + goto done; + } + /* + * These 2 options are applied to all associations. + * The other socket level options are only applied + * to the socket (not associations). + */ + if ((option_name != SO_RCVBUF) && + (option_name != SO_SNDBUF)) { + conn = so->so_proto_handle; + } + } else { + conn = NULL; + } + + /* + * If association ID was specified, do op on that assoc. + * Otherwise set the default setting of a socket. 
+ */ + if (id != 0) { + if ((error = sosctp_assoc(ss, id, &ssa)) != 0) { + eprintsoline(so, error); + goto done; + } + conn = ssa->ssa_conn; + } + } + dprint(2, ("sosctp_setsockopt %p (%d) - conn %p %d %d id:%d\n", + (void *)ss, so->so_type, (void *)conn, level, option_name, id)); + + ASSERT(ssa == NULL || (ssa != NULL && conn != NULL)); + if (conn != NULL) { + mutex_exit(&so->so_lock); + error = sctp_set_opt((struct sctp_s *)conn, level, option_name, + optval, optlen); + mutex_enter(&so->so_lock); + if (ssa != NULL) + SSA_REFRELE(ss, ssa); + } else { + /* + * 1-N socket, and we have to apply the operation to ALL + * associations. Like with anything of this sort, the + * problem is what to do if the operation fails. + * Just try to apply the setting to everyone, but store + * error number if someone returns such. And since we are + * looping through all possible aids, some of them can be + * invalid. We just ignore this kind (sosctp_assoc()) of + * errors. + */ + sctp_assoc_t aid; + + mutex_exit(&so->so_lock); + error = sctp_set_opt((struct sctp_s *)so->so_proto_handle, + level, option_name, optval, optlen); + mutex_enter(&so->so_lock); + for (aid = 1; aid < ss->ss_maxassoc; aid++) { + if (sosctp_assoc(ss, aid, &ssa) != 0) + continue; + mutex_exit(&so->so_lock); + rc = sctp_set_opt((struct sctp_s *)ssa->ssa_conn, level, + option_name, optval, optlen); + mutex_enter(&so->so_lock); + SSA_REFRELE(ss, ssa); + if (error == 0) { + error = rc; + } + } + } +done: + mutex_exit(&so->so_lock); + return (error); +} + +/*ARGSUSED*/ +static int +sosctp_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode, + struct cred *cr, int32_t *rvalp) +{ + struct sctp_sonode *ss; + int32_t value; + int error; + int intval; + pid_t pid; + struct sctp_soassoc *ssa; + void *conn; + void *buf; + STRUCT_DECL(sctpopt, opt); + uint32_t optlen; + int buflen; + + ss = SOTOSSO(so); + + /* handle socket specific ioctls */ + switch (cmd) { + case FIONBIO: + if (so_copyin((void *)arg, &value, sizeof 
(int32_t), + (mode & (int)FKIOCTL))) { + return (EFAULT); + } + mutex_enter(&so->so_lock); + if (value) { + so->so_state |= SS_NDELAY; + } else { + so->so_state &= ~SS_NDELAY; + } + mutex_exit(&so->so_lock); + return (0); + + case FIOASYNC: + if (so_copyin((void *)arg, &value, sizeof (int32_t), + (mode & (int)FKIOCTL))) { + return (EFAULT); + } + mutex_enter(&so->so_lock); + + if (value) { + /* Turn on SIGIO */ + so->so_state |= SS_ASYNC; + } else { + /* Turn off SIGIO */ + so->so_state &= ~SS_ASYNC; + } + mutex_exit(&so->so_lock); + return (0); + + case SIOCSPGRP: + case FIOSETOWN: + if (so_copyin((void *)arg, &pid, sizeof (pid_t), + (mode & (int)FKIOCTL))) { + return (EFAULT); + } + mutex_enter(&so->so_lock); + + error = (pid != so->so_pgrp) ? socket_chgpgrp(so, pid) : 0; + mutex_exit(&so->so_lock); + return (error); + + case SIOCGPGRP: + case FIOGETOWN: + if (so_copyout(&so->so_pgrp, (void *)arg, + sizeof (pid_t), (mode & (int)FKIOCTL))) + return (EFAULT); + return (0); + + case FIONREAD: + /* XXX: Cannot be used unless standard buffer is used */ + /* + * Return number of bytes of data in all data messages + * in queue in "arg". + * For stream socket, amount of available data. + * For sock_dgram, # of available bytes + addresses. + */ + intval = (so->so_state & SS_ACCEPTCONN) ? 0 : + MIN(so->so_rcv_queued, INT_MAX); + if (so_copyout(&intval, (void *)arg, sizeof (intval), + (mode & (int)FKIOCTL))) + return (EFAULT); + return (0); + case SIOCATMARK: + /* + * No support for urgent data. + */ + intval = 0; + + if (so_copyout(&intval, (void *)arg, sizeof (int), + (mode & (int)FKIOCTL))) + return (EFAULT); + return (0); + case SIOCSCTPGOPT: + STRUCT_INIT(opt, mode); + + if (so_copyin((void *)arg, STRUCT_BUF(opt), STRUCT_SIZE(opt), + (mode & (int)FKIOCTL))) { + return (EFAULT); + } + if ((optlen = STRUCT_FGET(opt, sopt_len)) > SO_MAXARGSIZE) + return (EINVAL); + + /* + * Find the correct sctp_t based on whether it is 1-N socket + * or not. 
+ */ + intval = STRUCT_FGET(opt, sopt_aid); + mutex_enter(&so->so_lock); + if ((so->so_type == SOCK_SEQPACKET) && intval) { + if ((error = sosctp_assoc(ss, intval, &ssa)) != 0) { + mutex_exit(&so->so_lock); + return (error); + } + conn = ssa->ssa_conn; + ASSERT(conn != NULL); + } else { + conn = so->so_proto_handle; + ssa = NULL; + } + mutex_exit(&so->so_lock); + + /* Copyin the option buffer and then call sctp_get_opt(). */ + buflen = optlen; + /* Let's allocate a buffer enough to hold an int */ + if (buflen < sizeof (uint32_t)) + buflen = sizeof (uint32_t); + buf = kmem_alloc(buflen, KM_SLEEP); + if (so_copyin(STRUCT_FGETP(opt, sopt_val), buf, optlen, + (mode & (int)FKIOCTL))) { + if (ssa != NULL) { + mutex_enter(&so->so_lock); + SSA_REFRELE(ss, ssa); + mutex_exit(&so->so_lock); + } + kmem_free(buf, buflen); + return (EFAULT); + } + /* The option level has to be IPPROTO_SCTP */ + error = sctp_get_opt((struct sctp_s *)conn, IPPROTO_SCTP, + STRUCT_FGET(opt, sopt_name), buf, &optlen); + if (ssa != NULL) { + mutex_enter(&so->so_lock); + SSA_REFRELE(ss, ssa); + mutex_exit(&so->so_lock); + } + optlen = MIN(buflen, optlen); + /* No error, copyout the result with the correct buf len. */ + if (error == 0) { + STRUCT_FSET(opt, sopt_len, optlen); + if (so_copyout(STRUCT_BUF(opt), (void *)arg, + STRUCT_SIZE(opt), (mode & (int)FKIOCTL))) { + error = EFAULT; + } else if (so_copyout(buf, STRUCT_FGETP(opt, sopt_val), + optlen, (mode & (int)FKIOCTL))) { + error = EFAULT; + } + } + kmem_free(buf, buflen); + return (error); + + case SIOCSCTPSOPT: + STRUCT_INIT(opt, mode); + + if (so_copyin((void *)arg, STRUCT_BUF(opt), STRUCT_SIZE(opt), + (mode & (int)FKIOCTL))) { + return (EFAULT); + } + if ((optlen = STRUCT_FGET(opt, sopt_len)) > SO_MAXARGSIZE) + return (EINVAL); + + /* + * Find the correct sctp_t based on whether it is 1-N socket + * or not. 
+ */ + intval = STRUCT_FGET(opt, sopt_aid); + mutex_enter(&so->so_lock); + if (intval != 0) { + if ((error = sosctp_assoc(ss, intval, &ssa)) != 0) { + mutex_exit(&so->so_lock); + return (error); + } + conn = ssa->ssa_conn; + ASSERT(conn != NULL); + } else { + conn = so->so_proto_handle; + ssa = NULL; + } + mutex_exit(&so->so_lock); + + /* Copyin the option buffer and then call sctp_set_opt(). */ + buf = kmem_alloc(optlen, KM_SLEEP); + if (so_copyin(STRUCT_FGETP(opt, sopt_val), buf, optlen, + (mode & (int)FKIOCTL))) { + if (ssa != NULL) { + mutex_enter(&so->so_lock); + SSA_REFRELE(ss, ssa); + mutex_exit(&so->so_lock); + } + kmem_free(buf, intval); + return (EFAULT); + } + /* The option level has to be IPPROTO_SCTP */ + error = sctp_set_opt((struct sctp_s *)conn, IPPROTO_SCTP, + STRUCT_FGET(opt, sopt_name), buf, optlen); + if (ssa) { + mutex_enter(&so->so_lock); + SSA_REFRELE(ss, ssa); + mutex_exit(&so->so_lock); + } + kmem_free(buf, optlen); + return (error); + + case SIOCSCTPPEELOFF: { + struct sonode *nso; + struct sctp_uc_swap us; + int nfd; + struct file *nfp; + struct vnode *nvp = NULL; + struct sockparams *sp; + + dprint(2, ("sctppeeloff %p\n", (void *)ss)); + + if (so->so_type != SOCK_SEQPACKET) { + return (EOPNOTSUPP); + } + if (so_copyin((void *)arg, &intval, sizeof (intval), + (mode & (int)FKIOCTL))) { + return (EFAULT); + } + if (intval == 0) { + return (EINVAL); + } + + /* + * Find sockparams. This is different from parent's entry, + * as the socket type is different. + */ + error = solookup(so->so_family, SOCK_STREAM, so->so_protocol, + &sp); + + /* + * Allocate the user fd. + */ + if ((nfd = ufalloc(0)) == -1) { + eprintsoline(so, EMFILE); + return (EMFILE); + } + + /* + * Copy the fd out. + */ + if (so_copyout(&nfd, (void *)arg, sizeof (nfd), + (mode & (int)FKIOCTL))) { + error = EFAULT; + goto err; + } + mutex_enter(&so->so_lock); + + /* + * Don't use sosctp_assoc() in order to peel off disconnected + * associations. 
+ */ + ssa = ((uint32_t)intval >= ss->ss_maxassoc) ? NULL : + ss->ss_assocs[intval].ssi_assoc; + if (ssa == NULL) { + mutex_exit(&so->so_lock); + error = EINVAL; + goto err; + } + SSA_REFHOLD(ssa); + + nso = socksctp_create(sp, so->so_family, SOCK_STREAM, + so->so_protocol, so->so_version, SOCKET_NOSLEEP, + &error, cr); + if (nso == NULL) { + SSA_REFRELE(ss, ssa); + mutex_exit(&so->so_lock); + goto err; + } + /* cannot fail, only inheriting properties */ + (void) sosctp_init(nso, so, CRED(), 0); + nvp = SOTOV(nso); + so_lock_single(so); + mutex_exit(&so->so_lock); + us.sus_handle = SOTOSSO(nso); + us.sus_upcalls = &sosctp_sock_upcalls; + + /* + * Upcalls to new socket are blocked for the duration of + * downcall. + */ + mutex_enter(&nso->so_lock); + + error = sctp_set_opt((struct sctp_s *)ssa->ssa_conn, + IPPROTO_SCTP, SCTP_UC_SWAP, &us, sizeof (us)); + if (error) { + goto peelerr; + } + error = falloc(nvp, FWRITE|FREAD, &nfp, NULL); + if (error) { + goto peelerr; + } + + /* + * fill in the entries that falloc reserved + */ + nfp->f_vnode = nvp; + mutex_exit(&nfp->f_tlock); + setf(nfd, nfp); + + mutex_enter(&so->so_lock); + + sosctp_assoc_move(ss, SOTOSSO(nso), ssa); + + mutex_exit(&nso->so_lock); + + ssa->ssa_conn = NULL; + sosctp_assoc_free(ss, ssa); + + so_unlock_single(so, SOLOCKED); + mutex_exit(&so->so_lock); + + return (0); + +err: + setf(nfd, NULL); + eprintsoline(so, error); + return (error); + +peelerr: + mutex_exit(&nso->so_lock); + mutex_enter(&so->so_lock); + ASSERT(nso->so_count == 1); + nso->so_count = 0; + so_unlock_single(so, SOLOCKED); + SSA_REFRELE(ss, ssa); + mutex_exit(&so->so_lock); + + setf(nfd, NULL); + ASSERT(nvp->v_count == 1); + socket_destroy(nso); + eprintsoline(so, error); + return (error); + } + default: + return (EINVAL); + } +} + +/*ARGSUSED*/ +static int +sosctp_close(struct sonode *so, int flag, struct cred *cr) +{ + struct sctp_sonode *ss; + struct sctp_sa_id *ssi; + struct sctp_soassoc *ssa; + int32_t i; + + ss = SOTOSSO(so); + 
+ /* + * Initiate connection shutdown. Update SCTP's receive + * window. + */ + sctp_recvd((struct sctp_s *)so->so_proto_handle, + so->so_rcvbuf - so->so_rcv_queued); + (void) sctp_disconnect((struct sctp_s *)so->so_proto_handle); + + /* + * New associations can't come in, but old ones might get + * closed in upcall. Protect against that by taking a reference + * on the association. + */ + mutex_enter(&so->so_lock); + ssi = ss->ss_assocs; + for (i = 0; i < ss->ss_maxassoc; i++, ssi++) { + if ((ssa = ssi->ssi_assoc) != NULL) { + SSA_REFHOLD(ssa); + sosctp_assoc_isdisconnected(ssa, 0); + mutex_exit(&so->so_lock); + + sctp_recvd((struct sctp_s *)ssa->ssa_conn, + so->so_rcvbuf - ssa->ssa_rcv_queued); + (void) sctp_disconnect((struct sctp_s *)ssa->ssa_conn); + + mutex_enter(&so->so_lock); + SSA_REFRELE(ss, ssa); + } + } + mutex_exit(&so->so_lock); + + return (0); +} + +/* + * Closes incoming connections which were never accepted, frees + * resources. + */ +/* ARGSUSED */ +void +sosctp_fini(struct sonode *so, struct cred *cr) +{ + struct sctp_sonode *ss; + struct sctp_sa_id *ssi; + struct sctp_soassoc *ssa; + int32_t i; + + ss = SOTOSSO(so); + + ASSERT(so->so_ops == &sosctp_sonodeops || + so->so_ops == &sosctp_seq_sonodeops); + + /* We are the sole owner of so now */ + mutex_enter(&so->so_lock); + + so_rcv_flush(so); + + /* Free all pending connections */ + so_acceptq_flush(so); + + ssi = ss->ss_assocs; + for (i = 0; i < ss->ss_maxassoc; i++, ssi++) { + if ((ssa = ssi->ssi_assoc) != NULL) { + SSA_REFHOLD(ssa); + mutex_exit(&so->so_lock); + + sctp_close((struct sctp_s *)ssa->ssa_conn); + + mutex_enter(&so->so_lock); + ssa->ssa_conn = NULL; + sosctp_assoc_free(ss, ssa); + } + } + if (ss->ss_assocs != NULL) { + ASSERT(ss->ss_assoccnt == 0); + kmem_free(ss->ss_assocs, + ss->ss_maxassoc * sizeof (struct sctp_sa_id)); + } + mutex_exit(&so->so_lock); + + if (so->so_proto_handle) + sctp_close((struct sctp_s *)so->so_proto_handle); + so->so_proto_handle = NULL; + + 
sonode_fini(so); +} + +/* + * Upcalls from SCTP + */ + +/* + * This is the upcall function for 1-N (SOCK_SEQPACKET) socket when a new + * association is created. Note that the first argument (handle) is of type + * sctp_sonode *, which is the one changed to a listener for new + * associations. All the other upcalls for 1-N socket take sctp_soassoc * + * as handle. The only exception is the su_properties upcall, which + * can take both types as handle. + */ +/* ARGSUSED */ +sock_upper_handle_t +sctp_assoc_newconn(sock_upper_handle_t parenthandle, + sock_lower_handle_t connind, sock_downcalls_t *dc, + struct cred *peer_cred, pid_t peer_cpid, sock_upcalls_t **ucp) +{ + struct sonode *lso = (struct sonode *)parenthandle; + struct sctp_sonode *lss = SOTOSSO(lso); + struct sctp_soassoc *ssa; + sctp_assoc_t id; + + ASSERT(lss->ss_type == SOSCTP_SOCKET); + ASSERT(lso->so_state & SS_ACCEPTCONN); + ASSERT(lso->so_proto_handle != NULL); /* closed conn */ + ASSERT(lso->so_type == SOCK_SEQPACKET); + + mutex_enter(&lso->so_lock); + + if ((id = sosctp_aid_get(lss)) == -1) { + /* + * Array not large enough; increase size. 
+ */ + if (sosctp_aid_grow(lss, lss->ss_maxassoc, KM_NOSLEEP) < 0) { + mutex_exit(&lso->so_lock); + return (NULL); + } + id = sosctp_aid_get(lss); + ASSERT(id != -1); + } + + /* + * Create soassoc for this connection + */ + ssa = sosctp_assoc_create(lss, KM_NOSLEEP); + if (ssa == NULL) { + mutex_exit(&lso->so_lock); + return (NULL); + } + sosctp_aid_reserve(lss, id, 1); + lss->ss_assocs[id].ssi_assoc = ssa; + ++lss->ss_assoccnt; + ssa->ssa_id = id; + ssa->ssa_conn = (struct sctp_s *)connind; + ssa->ssa_state = (SS_ISBOUND | SS_ISCONNECTED); + ssa->ssa_wroff = lss->ss_wroff; + ssa->ssa_wrsize = lss->ss_wrsize; + + mutex_exit(&lso->so_lock); + + *ucp = &sosctp_assoc_upcalls; + + return ((sock_upper_handle_t)ssa); +} + +/* ARGSUSED */ +static void +sctp_assoc_connected(sock_upper_handle_t handle, sock_connid_t id, + struct cred *peer_cred, pid_t peer_cpid) +{ + struct sctp_soassoc *ssa = (struct sctp_soassoc *)handle; + struct sonode *so = &ssa->ssa_sonode->ss_so; + + ASSERT(so->so_type == SOCK_SEQPACKET); + ASSERT(ssa->ssa_conn); + + mutex_enter(&so->so_lock); + sosctp_assoc_isconnected(ssa); + mutex_exit(&so->so_lock); +} + +/* ARGSUSED */ +static int +sctp_assoc_disconnected(sock_upper_handle_t handle, sock_connid_t id, int error) +{ + struct sctp_soassoc *ssa = (struct sctp_soassoc *)handle; + struct sonode *so = &ssa->ssa_sonode->ss_so; + int ret; + + ASSERT(so->so_type == SOCK_SEQPACKET); + ASSERT(ssa->ssa_conn != NULL); + + mutex_enter(&so->so_lock); + sosctp_assoc_isdisconnected(ssa, error); + if (ssa->ssa_refcnt == 1) { + ret = 1; + ssa->ssa_conn = NULL; + } else { + ret = 0; + } + SSA_REFRELE(SOTOSSO(so), ssa); + + cv_broadcast(&so->so_snd_cv); + + mutex_exit(&so->so_lock); + + return (ret); +} + +/* ARGSUSED */ +static void +sctp_assoc_disconnecting(sock_upper_handle_t handle, sock_opctl_action_t action, + uintptr_t arg) +{ + struct sctp_soassoc *ssa = (struct sctp_soassoc *)handle; + struct sonode *so = &ssa->ssa_sonode->ss_so; + + ASSERT(so->so_type == 
SOCK_SEQPACKET); + ASSERT(ssa->ssa_conn != NULL); + ASSERT(action == SOCK_OPCTL_SHUT_SEND); + + mutex_enter(&so->so_lock); + sosctp_assoc_isdisconnecting(ssa); + mutex_exit(&so->so_lock); +} + +/* ARGSUSED */ +static ssize_t +sctp_assoc_recv(sock_upper_handle_t handle, mblk_t *mp, size_t len, int flags, + int *errorp, boolean_t *forcepush) +{ + struct sctp_soassoc *ssa = (struct sctp_soassoc *)handle; + struct sctp_sonode *ss = ssa->ssa_sonode; + struct sonode *so = &ss->ss_so; + struct T_unitdata_ind *tind; + mblk_t *mp2; + union sctp_notification *sn; + struct sctp_sndrcvinfo *sinfo; + + ASSERT(ssa->ssa_type == SOSCTP_ASSOC); + ASSERT(so->so_type == SOCK_SEQPACKET); + ASSERT(ssa->ssa_conn != NULL); /* closed conn */ + ASSERT(mp != NULL); + + ASSERT(errorp != NULL); + *errorp = 0; + + /* + * Should be getting T_unitdata_req's only. + * Must have address as part of packet. + */ + tind = (struct T_unitdata_ind *)mp->b_rptr; + ASSERT((DB_TYPE(mp) == M_PROTO) && + (tind->PRIM_type == T_UNITDATA_IND)); + ASSERT(tind->SRC_length); + + mutex_enter(&so->so_lock); + + /* + * Override b_flag for SCTP sockfs internal use + */ + mp->b_flag = (short)flags; + + /* + * For notify messages, need to fill in association id. + * For data messages, sndrcvinfo could be in ancillary data. 
+ */ + if (flags & SCTP_NOTIFICATION) { + mp2 = mp->b_cont; + sn = (union sctp_notification *)mp2->b_rptr; + switch (sn->sn_header.sn_type) { + case SCTP_ASSOC_CHANGE: + sn->sn_assoc_change.sac_assoc_id = ssa->ssa_id; + break; + case SCTP_PEER_ADDR_CHANGE: + sn->sn_paddr_change.spc_assoc_id = ssa->ssa_id; + break; + case SCTP_REMOTE_ERROR: + sn->sn_remote_error.sre_assoc_id = ssa->ssa_id; + break; + case SCTP_SEND_FAILED: + sn->sn_send_failed.ssf_assoc_id = ssa->ssa_id; + break; + case SCTP_SHUTDOWN_EVENT: + sn->sn_shutdown_event.sse_assoc_id = ssa->ssa_id; + break; + case SCTP_ADAPTATION_INDICATION: + sn->sn_adaptation_event.sai_assoc_id = ssa->ssa_id; + break; + case SCTP_PARTIAL_DELIVERY_EVENT: + sn->sn_pdapi_event.pdapi_assoc_id = ssa->ssa_id; + break; + default: + ASSERT(0); + break; + } + } else { + if (tind->OPT_length > 0) { + struct cmsghdr *cmsg; + char *cend; + + cmsg = (struct cmsghdr *) + ((uchar_t *)mp->b_rptr + tind->OPT_offset); + cend = (char *)cmsg + tind->OPT_length; + for (;;) { + if ((char *)(cmsg + 1) > cend || + ((char *)cmsg + cmsg->cmsg_len) > cend) { + break; + } + if ((cmsg->cmsg_level == IPPROTO_SCTP) && + (cmsg->cmsg_type == SCTP_SNDRCV)) { + sinfo = (struct sctp_sndrcvinfo *) + (cmsg + 1); + sinfo->sinfo_assoc_id = ssa->ssa_id; + break; + } + if (cmsg->cmsg_len > 0) { + cmsg = (struct cmsghdr *) + ((uchar_t *)cmsg + cmsg->cmsg_len); + } else { + break; + } + } + } + } + + /* + * SCTP has reserved space in the header for storing a pointer. + * Put the pointer to assocation there, and queue the data. 
+ */ + SSA_REFHOLD(ssa); + ASSERT((mp->b_rptr - DB_BASE(mp)) >= sizeof (ssa)); + *(struct sctp_soassoc **)DB_BASE(mp) = ssa; + + mutex_exit(&so->so_lock); + + return (so_queue_msg((sock_upper_handle_t)so, mp, len, 0, errorp, + NULL)); +} + +static void +sctp_assoc_xmitted(sock_upper_handle_t handle, boolean_t qfull) +{ + struct sctp_soassoc *ssa = (struct sctp_soassoc *)handle; + struct sctp_sonode *ss = ssa->ssa_sonode; + + ASSERT(ssa->ssa_type == SOSCTP_ASSOC); + ASSERT(ss->ss_so.so_type == SOCK_SEQPACKET); + ASSERT(ssa->ssa_conn != NULL); + + mutex_enter(&ss->ss_so.so_lock); + + ssa->ssa_snd_qfull = qfull; + + /* + * Wake blocked writers. + */ + cv_broadcast(&ss->ss_so.so_snd_cv); + + mutex_exit(&ss->ss_so.so_lock); +} + +static void +sctp_assoc_properties(sock_upper_handle_t handle, + struct sock_proto_props *soppp) +{ + struct sctp_soassoc *ssa = (struct sctp_soassoc *)handle; + struct sctp_sonode *ss; + + if (ssa->ssa_type == SOSCTP_ASSOC) { + ss = ssa->ssa_sonode; + mutex_enter(&ss->ss_so.so_lock); + + /* + * Only change them if they're set. 
+ */ + if (soppp->sopp_wroff != 0) { + ssa->ssa_wroff = soppp->sopp_wroff; + } + if (soppp->sopp_maxblk != 0) { + ssa->ssa_wrsize = soppp->sopp_maxblk; + } + } else { + ss = (struct sctp_sonode *)handle; + mutex_enter(&ss->ss_so.so_lock); + + if (soppp->sopp_wroff != 0) { + ss->ss_wroff = soppp->sopp_wroff; + } + if (soppp->sopp_maxblk != 0) { + ss->ss_wrsize = soppp->sopp_maxblk; + } + } + + mutex_exit(&ss->ss_so.so_lock); +} diff --git a/usr/src/uts/common/fs/sockfs/socksctp.h b/usr/src/uts/common/inet/sockmods/socksctp.h index dfbd818e40..55d56df7ae 100644 --- a/usr/src/uts/common/fs/sockfs/socksctp.h +++ b/usr/src/uts/common/inet/sockmods/socksctp.h @@ -26,8 +26,6 @@ #ifndef _SOCKSCTP_H_ #define _SOCKSCTP_H_ -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -47,15 +45,8 @@ struct sctp_sonode { sctp_assoc_t ss_maxassoc; /* assoc array size for 1-N */ sctp_assoc_t ss_assoccnt; /* current # of assocs */ struct sctp_sa_id *ss_assocs; /* assoc array for 1-N */ - kcondvar_t ss_txdata_cv; /* wait TX window to open */ - int ss_wroff; - size_t ss_wrsize; - int ss_txqueued; /* queued tx bytes */ - kcondvar_t ss_rxdata_cv; /* for waiting RX data */ - mblk_t *ss_rxdata; /* queued rx data */ - mblk_t **ss_rxtail; /* ptr to last message */ - int ss_rxqueued; /* queued rx bytes/# of conn */ - struct pollhead ss_poll_list; +#define ss_wroff ss_so.so_proto_props.sopp_wroff +#define ss_wrsize ss_so.so_proto_props.sopp_maxblk }; /* @@ -69,14 +60,13 @@ struct sctp_soassoc { struct sctp_s *ssa_conn; /* opaque ptr passed to SCTP */ uint_t ssa_state; /* same as so_state */ int ssa_error; /* same as so_error */ - int ssa_txqueued; /* queued tx bytes */ + boolean_t ssa_snd_qfull; int ssa_wroff; size_t ssa_wrsize; - int ssa_rxqueued; /* queued rx bytes/# of conn */ + int ssa_rcv_queued; /* queued rx bytes/# of conn */ }; /* 1-N socket association cache defined in socksctp.c */ -extern kmem_cache_t *sosctp_assoccache; /* * Association array element. 
@@ -91,18 +81,14 @@ struct sctp_sa_id { struct sctp_soassoc *ssi_assoc; }; -extern sctp_upcalls_t sosctp_sock_upcalls; -extern sctp_upcalls_t sosctp_assoc_upcalls; -extern struct vnodeops *socksctp_vnodeops; -extern const fs_operation_def_t socksctp_vnodeops_template[]; - -extern void sosctp_free(struct sonode *so); -extern int sosctp_chgpgrp(struct sctp_sonode *ss, pid_t pid); -extern void sosctp_sendsig(struct sctp_sonode *ss, int event); +extern sonodeops_t sosctp_sonodeops; +extern sonodeops_t sosctp_seq_sonodeops; +extern sock_upcalls_t sosctp_sock_upcalls; +extern sock_upcalls_t sosctp_assoc_upcalls; -extern int sosctp_bind(struct sonode *so, struct sockaddr *name, - socklen_t namelen, int flags); -extern int sosctp_recvmsg(struct sonode *, struct nmsghdr *, struct uio *); +extern struct sonode *socksctp_create(struct sockparams *, int, int, + int, int, int, int *, cred_t *); +extern void sosctp_fini(struct sonode *, struct cred *); extern int sosctp_aid_grow(struct sctp_sonode *ss, sctp_assoc_t maxid, int kmflags); extern sctp_assoc_t sosctp_aid_get(struct sctp_sonode *ss); @@ -119,7 +105,7 @@ extern struct sctp_soassoc *sosctp_assoc_create(struct sctp_sonode *ss, extern void sosctp_assoc_free(struct sctp_sonode *ss, struct sctp_soassoc *ssa); extern int sosctp_assoc_createconn(struct sctp_sonode *ss, const struct sockaddr *name, socklen_t namelen, - const uchar_t *control, socklen_t controllen, int fflag, + const uchar_t *control, socklen_t controllen, int fflag, struct cred *, struct sctp_soassoc **ssap); extern void sosctp_assoc_move(struct sctp_sonode *ss, struct sctp_sonode *nss, struct sctp_soassoc *ssa); @@ -165,12 +151,6 @@ extern int sosctp_uiomove(mblk_t *hdr_mp, ssize_t count, ssize_t blk_size, } \ } -/* - * Event flags to sosctp_sendsig(). 
- */ -#define SCTPSIG_WRITE 0x1 -#define SCTPSIG_READ 0x2 - #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/fs/sockfs/socksctpsubr.c b/usr/src/uts/common/inet/sockmods/socksctpsubr.c index e741bd29f7..fab1a4534d 100644 --- a/usr/src/uts/common/fs/sockfs/socksctpsubr.c +++ b/usr/src/uts/common/inet/sockmods/socksctpsubr.c @@ -24,8 +24,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/t_lock.h> #include <sys/param.h> @@ -36,9 +34,6 @@ #include <sys/cmn_err.h> #include <sys/sysmacros.h> -#include <sys/vfs.h> -#include <sys/vfs_opreg.h> - #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/strsun.h> @@ -46,8 +41,10 @@ #include <netinet/sctp.h> #include <inet/sctp_itf.h> +#include <fs/sockfs/sockcommon.h> #include "socksctp.h" +extern kmem_cache_t *sosctp_assoccache; /* * Find a free association id. See os/fio.c file descriptor allocator * for description of the algorithm. @@ -178,8 +175,10 @@ sosctp_assoc_create(struct sctp_sonode *ss, int kmflag) ssa->ssa_sonode = ss; ssa->ssa_state = 0; ssa->ssa_error = 0; +#if 0 ssa->ssa_txqueued = 0; - ssa->ssa_rxqueued = 0; +#endif + ssa->ssa_snd_qfull = 0; } dprint(2, ("sosctp_assoc_create %p %p\n", (void *)ss, (void *)ssa)); return (ssa); @@ -305,55 +304,6 @@ sosctp_find_cmsg(const uchar_t *control, socklen_t clen, int type) } /* - * Wait until the socket is connected or there is an error. - * fmode should contain any nonblocking flags. 
- */ -int -sosctp_waitconnected(struct sonode *so, int fmode) -{ - int error = 0; - - ASSERT(MUTEX_HELD(&so->so_lock)); - ASSERT((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) || - so->so_error != 0); - - while ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == - SS_ISCONNECTING && so->so_error == 0) { - - dprint(3, ("waiting for SS_ISCONNECTED on %p\n", (void *)so)); - if (fmode & (FNDELAY|FNONBLOCK)) - return (EINPROGRESS); - - if (!cv_wait_sig_swap(&so->so_state_cv, &so->so_lock)) { - /* - * Return EINTR and let the application use - * nonblocking techniques for detecting when - * the connection has been established. - */ - return (EINTR); - } - dprint(3, ("awoken on %p\n", (void *)so)); - } - - if (so->so_error != 0) { - error = sogeterr(so); - ASSERT(error != 0); - dprint(3, ("sosctp_waitconnected: error %d\n", error)); - return (error); - } - if (!(so->so_state & SS_ISCONNECTED)) { - /* - * Another thread could have consumed so_error - * e.g. by calling read. - take from sowaitconnected() - */ - error = ECONNREFUSED; - dprint(3, ("sosctp_waitconnected: error %d\n", error)); - return (error); - } - return (0); -} - -/* * Wait until the association is connected or there is an error. * fmode should contain any nonblocking flags. 
*/ @@ -373,6 +323,8 @@ sosctp_assoc_waitconnected(struct sctp_soassoc *ssa, int fmode) if (fmode & (FNDELAY|FNONBLOCK)) return (EINPROGRESS); + if (so->so_state & SS_CLOSING) + return (EINTR); if (!cv_wait_sig_swap(&so->so_state_cv, &so->so_lock)) { /* * Return EINTR and let the application use @@ -408,7 +360,7 @@ sosctp_assoc_waitconnected(struct sctp_soassoc *ssa, int fmode) int sosctp_assoc_createconn(struct sctp_sonode *ss, const struct sockaddr *name, socklen_t namelen, const uchar_t *control, socklen_t controllen, int fflag, - struct sctp_soassoc **ssap) + struct cred *cr, struct sctp_soassoc **ssap) { struct sonode *so = &ss->ss_so; struct sctp_soassoc *ssa; @@ -427,8 +379,8 @@ sosctp_assoc_createconn(struct sctp_sonode *ss, const struct sockaddr *name, bzero(&laddr, sizeof (laddr)); laddr.ss_family = so->so_family; - error = sosctp_bind(so, (struct sockaddr *)&laddr, - sizeof (laddr), _SOBIND_LOCK_HELD); + error = SOP_BIND(so, (struct sockaddr *)&laddr, + sizeof (laddr), _SOBIND_LOCK_HELD, cr); if (error) { *ssap = NULL; return (error); @@ -456,8 +408,8 @@ sosctp_assoc_createconn(struct sctp_sonode *ss, const struct sockaddr *name, ssa = sosctp_assoc_create(ss, KM_SLEEP); ssa->ssa_wroff = ss->ss_wroff; ssa->ssa_wrsize = ss->ss_wrsize; - ssa->ssa_conn = sctp_create(ssa, so->so_priv, so->so_family, - SCTP_CAN_BLOCK, &sosctp_assoc_upcalls, &sbl, CRED()); + ssa->ssa_conn = sctp_create(ssa, (struct sctp_s *)so->so_proto_handle, + so->so_family, SCTP_CAN_BLOCK, &sosctp_assoc_upcalls, &sbl, cr); mutex_enter(&so->so_lock); ss->ss_assocs[id].ssi_assoc = ssa; @@ -561,7 +513,7 @@ void sosctp_assoc_move(struct sctp_sonode *ss, struct sctp_sonode *nss, struct sctp_soassoc *ssa) { - mblk_t *mp, **nmp; + mblk_t *mp, **nmp, *last_mp; struct sctp_soassoc *tmp; sosctp_so_inherit(ss, nss); @@ -571,26 +523,39 @@ sosctp_assoc_move(struct sctp_sonode *ss, struct sctp_sonode *nss, (ssa->ssa_state & (SS_ISCONNECTED|SS_ISCONNECTING| 
SS_ISDISCONNECTING|SS_CANTSENDMORE|SS_CANTRCVMORE|SS_ISBOUND)); nss->ss_so.so_error = ssa->ssa_error; - nss->ss_txqueued = ssa->ssa_txqueued; +#if 0 + nss->ss_so.so_txqueued = ssa->ssa_txqueued; +#endif + nss->ss_so.so_snd_qfull = ssa->ssa_snd_qfull; nss->ss_wroff = ssa->ssa_wroff; nss->ss_wrsize = ssa->ssa_wrsize; - nss->ss_rxqueued = ssa->ssa_rxqueued; - nss->ss_so.so_priv = ssa->ssa_conn; + nss->ss_so.so_rcv_queued = ssa->ssa_rcv_queued; + nss->ss_so.so_proto_handle = (sock_lower_handle_t)ssa->ssa_conn; - if (nss->ss_rxqueued > 0) { - nmp = &ss->ss_rxdata; + if (nss->ss_so.so_rcv_queued > 0) { + nmp = &ss->ss_so.so_rcv_q_head; + last_mp = NULL; while ((mp = *nmp) != NULL) { tmp = *(struct sctp_soassoc **)DB_BASE(mp); if (tmp == ssa) { *nmp = mp->b_next; - *nss->ss_rxtail = mp; - nss->ss_rxtail = &mp->b_next; + ASSERT(DB_TYPE(mp) != M_DATA); + if (nss->ss_so.so_rcv_q_last_head == NULL) { + nss->ss_so.so_rcv_q_head = mp; + } else { + nss->ss_so.so_rcv_q_last_head->b_next = + mp; + } + nss->ss_so.so_rcv_q_last_head = mp; + nss->ss_so.so_rcv_q_last_head->b_prev = last_mp; + mp->b_next = NULL; } else { nmp = &mp->b_next; + last_mp = mp; } } - ss->ss_rxtail = nmp; - *nss->ss_rxtail = NULL; + ss->ss_so.so_rcv_q_last_head = last_mp; + ss->ss_so.so_rcv_q_last_head->b_prev = last_mp; } } @@ -643,97 +608,3 @@ sosctp_assoc_isdisconnected(struct sctp_soassoc *ssa, int error) ssa->ssa_error = (ushort_t)error; cv_broadcast(&so->so_state_cv); } - -/* - * Change the process/process group to which SIGIO is sent. - */ -int -sosctp_chgpgrp(struct sctp_sonode *ss, pid_t pid) -{ - int error; - - ASSERT(MUTEX_HELD(&ss->ss_so.so_lock)); - if (pid != 0) { - /* - * Permissions check by sending signal 0. - * Note that when kill fails it does a - * set_errno causing the system call to fail. 
- */ - error = kill(pid, 0); - if (error != 0) { - return (error); - } - } - ss->ss_so.so_pgrp = pid; - return (0); -} - -/* - * Generate a SIGIO, for 'writable' events include siginfo structure, - * for read events just send the signal. - */ -static void -sosctp_sigproc(proc_t *proc, int event) -{ - k_siginfo_t info; - - if (event & SCTPSIG_WRITE) { - info.si_signo = SIGPOLL; - info.si_code = POLL_OUT; - info.si_errno = 0; - info.si_fd = 0; /* not set with TCP either */ - info.si_band = 0; - sigaddq(proc, NULL, &info, KM_NOSLEEP); - } - if (event & SCTPSIG_READ) { - sigtoproc(proc, NULL, SIGPOLL); - } -} - -void -sosctp_sendsig(struct sctp_sonode *ss, int event) -{ - proc_t *proc; - struct sonode *so = &ss->ss_so; - - ASSERT(MUTEX_HELD(&ss->ss_so.so_lock)); - - if (so->so_pgrp == 0 || !(so->so_state & SS_ASYNC)) { - return; - } - dprint(3, ("sending sig to %d\n", so->so_pgrp)); - - if (so->so_pgrp > 0) { - /* - * XXX This unfortunately still generates - * a signal when a fd is closed but - * the proc is active. - */ - mutex_enter(&pidlock); - proc = prfind(so->so_pgrp); - if (proc == NULL) { - mutex_exit(&pidlock); - return; - } - mutex_enter(&proc->p_lock); - mutex_exit(&pidlock); - sosctp_sigproc(proc, event); - mutex_exit(&proc->p_lock); - } else { - /* - * Send to process group. Hold pidlock across - * calls to sosctp_sigproc(). 
- */ - pid_t pgrp = -so->so_pgrp; - - mutex_enter(&pidlock); - proc = pgfind(pgrp); - while (proc != NULL) { - mutex_enter(&proc->p_lock); - sosctp_sigproc(proc, event); - proc = proc->p_pglink; - mutex_exit(&proc->p_lock); - } - mutex_exit(&pidlock); - } -} diff --git a/usr/src/uts/common/fs/sockfs/socksdp.c b/usr/src/uts/common/inet/sockmods/socksdp.c index 7376783fc0..fdbdca5cb3 100644 --- a/usr/src/uts/common/fs/sockfs/socksdp.c +++ b/usr/src/uts/common/inet/sockmods/socksdp.c @@ -30,7 +30,6 @@ #include <sys/systm.h> #include <sys/buf.h> #include <sys/vfs.h> -#include <sys/vfs_opreg.h> #include <sys/vnode.h> #include <sys/debug.h> #include <sys/errno.h> @@ -38,6 +37,9 @@ #include <sys/cmn_err.h> #include <sys/sysmacros.h> +#include <sys/filio.h> +#include <sys/sockio.h> + #include <sys/project.h> #include <sys/tihdr.h> #include <sys/strsubr.h> @@ -50,22 +52,37 @@ #include <inet/sdp_itf.h> #include "socksdp.h" +#include <fs/sockfs/sockcommon.h> /* * SDP sockfs sonode operations */ -static int sosdp_accept(struct sonode *, int, struct sonode **); -static int sosdp_listen(struct sonode *, int); +static int sosdp_init(struct sonode *, struct sonode *, struct cred *, int); +static int sosdp_accept(struct sonode *, int, struct cred *, struct sonode **); +static int sosdp_bind(struct sonode *, struct sockaddr *, socklen_t, int, + struct cred *); +static int sosdp_listen(struct sonode *, int, struct cred *); static int sosdp_connect(struct sonode *, const struct sockaddr *, socklen_t, - int, int); -static int sosdp_sendmsg(struct sonode *, struct nmsghdr *, struct uio *); -static int sosdp_getpeername(struct sonode *); -static int sosdp_getsockname(struct sonode *); -static int sosdp_shutdown(struct sonode *, int); + int, int, struct cred *); +static int sosdp_recvmsg(struct sonode *, struct nmsghdr *, struct uio *, + struct cred *); +static int sosdp_sendmsg(struct sonode *, struct nmsghdr *, struct uio *, + struct cred *); +static int sosdp_getpeername(struct sonode 
*, struct sockaddr *, socklen_t *, + boolean_t, struct cred *); +static int sosdp_getsockname(struct sonode *, struct sockaddr *, socklen_t *, + struct cred *); +static int sosdp_shutdown(struct sonode *, int, struct cred *); static int sosdp_getsockopt(struct sonode *, int, int, void *, socklen_t *, - int); + int, struct cred *); static int sosdp_setsockopt(struct sonode *, int, int, const void *, - socklen_t); + socklen_t, struct cred *); +static int sosdp_ioctl(struct sonode *, int, intptr_t, int, struct cred *, + int32_t *); +static int sosdp_poll(struct sonode *, short, int, short *, + struct pollhead **); +static int sosdp_close(struct sonode *, int, struct cred *); +void sosdp_fini(struct sonode *, struct cred *); /* @@ -80,20 +97,23 @@ static void sdp_sock_xmitted(void *handle, int txqueued); static void sdp_sock_urgdata(void *handle); static void sdp_sock_ordrel(void *handle); -static kmem_cache_t *sosdp_sockcache; - sonodeops_t sosdp_sonodeops = { - sosdp_accept, /* sop_accept */ - sosdp_bind, /* sop_bind */ - sosdp_listen, /* sop_listen */ - sosdp_connect, /* sop_connect */ - sosdp_recvmsg, /* sop_recvmsg */ - sosdp_sendmsg, /* sop_sendmsg */ - sosdp_getpeername, /* sop_getpeername */ - sosdp_getsockname, /* sop_getsockname */ - sosdp_shutdown, /* sop_shutdown */ - sosdp_getsockopt, /* sop_getsockopt */ - sosdp_setsockopt /* sop_setsockopt */ + sosdp_init, /* sop_init */ + sosdp_accept, /* sop_accept */ + sosdp_bind, /* sop_bind */ + sosdp_listen, /* sop_listen */ + sosdp_connect, /* sop_connect */ + sosdp_recvmsg, /* sop_recvmsg */ + sosdp_sendmsg, /* sop_sendmsg */ + so_sendmblk_notsupp, /* sop_sendmblk */ + sosdp_getpeername, /* sop_getpeername */ + sosdp_getsockname, /* sop_getsockname */ + sosdp_shutdown, /* sop_shutdown */ + sosdp_getsockopt, /* sop_getsockopt */ + sosdp_setsockopt, /* sop_setsockopt */ + sosdp_ioctl, /* sop_ioctl */ + sosdp_poll, /* sop_poll */ + sosdp_close, /* sop_close */ }; sdp_upcalls_t sosdp_sock_upcalls = { @@ -107,320 
+127,57 @@ sdp_upcalls_t sosdp_sock_upcalls = { sdp_sock_ordrel, }; - -/*ARGSUSED*/ +/* ARGSUSED */ static int -sosdp_sock_constructor(void *buf, void *cdrarg, int kmflags) -{ - struct sdp_sonode *ss = buf; - struct sonode *so = &ss->ss_so; - struct vnode *vp; - - ss->ss_type = SOSDP_SOCKET; - so->so_oobmsg = NULL; - so->so_ack_mp = NULL; - so->so_conn_ind_head = NULL; - so->so_conn_ind_tail = NULL; - so->so_discon_ind_mp = NULL; - so->so_ux_bound_vp = NULL; - so->so_unbind_mp = NULL; - so->so_ops = NULL; - so->so_accessvp = NULL; - so->so_priv = NULL; - - so->so_nl7c_flags = 0; - so->so_nl7c_uri = NULL; - so->so_nl7c_rcv_mp = NULL; - - so->so_direct = NULL; - - vp = vn_alloc(kmflags); - if (vp == NULL) { - return (-1); - } - so->so_vnode = vp; - - vn_setops(vp, socksdp_vnodeops); - vp->v_data = (caddr_t)so; - - mutex_init(&so->so_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&so->so_plumb_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&so->so_state_cv, NULL, CV_DEFAULT, NULL); - cv_init(&so->so_ack_cv, NULL, CV_DEFAULT, NULL); - cv_init(&so->so_connind_cv, NULL, CV_DEFAULT, NULL); - cv_init(&so->so_want_cv, NULL, CV_DEFAULT, NULL); - return (0); -} - -/*ARGSUSED*/ -static void -sosdp_sock_destructor(void *buf, void *cdrarg) -{ - struct sdp_sonode *ss = buf; - struct sonode *so = &ss->ss_so; - struct vnode *vp = SOTOV(so); - - ASSERT(so->so_direct == NULL); - - ASSERT(so->so_nl7c_flags == 0); - ASSERT(so->so_nl7c_uri == NULL); - ASSERT(so->so_nl7c_rcv_mp == NULL); - - ASSERT(so->so_oobmsg == NULL); - ASSERT(so->so_ack_mp == NULL); - ASSERT(so->so_conn_ind_head == NULL); - ASSERT(so->so_conn_ind_tail == NULL); - ASSERT(so->so_discon_ind_mp == NULL); - ASSERT(so->so_ux_bound_vp == NULL); - ASSERT(so->so_unbind_mp == NULL); - ASSERT(so->so_ops == NULL || so->so_ops == &sosdp_sonodeops); - - ASSERT(vn_matchops(vp, socksdp_vnodeops)); - ASSERT(vp->v_data == (caddr_t)so); - - vn_free(vp); - - mutex_destroy(&so->so_lock); - mutex_destroy(&so->so_plumb_lock); - 
cv_destroy(&so->so_state_cv); - cv_destroy(&so->so_ack_cv); - cv_destroy(&so->so_connind_cv); - cv_destroy(&so->so_want_cv); -} - - -int -sosdp_init(void) -{ - int error; - - error = vn_make_ops("socksdp", socksdp_vnodeops_template, - &socksdp_vnodeops); - if (error != 0) { - cmn_err(CE_WARN, "sosdp_init: bad vnode ops template"); - return (error); - } - - sosdp_sockcache = kmem_cache_create("sdpsock", - sizeof (struct sdp_sonode), 0, sosdp_sock_constructor, - sosdp_sock_destructor, NULL, NULL, NULL, 0); - return (0); -} - -static struct vnode * -sosdp_makevp(struct vnode *accessvp, int domain, int type, int protocol, - int kmflags) +sosdp_init(struct sonode *so, struct sonode *pso, struct cred *cr, int flags) { - struct sdp_sonode *ss; - struct sonode *so; - struct vnode *vp; - time_t now; - - ss = kmem_cache_alloc(sosdp_sockcache, kmflags); - if (ss == NULL) { - return (NULL); - } - so = &ss->ss_so; - so->so_cache = sosdp_sockcache; - so->so_obj = ss; - vp = SOTOV(so); - now = gethrestime_sec(); - - so->so_flag = 0; - so->so_accessvp = accessvp; - so->so_dev = accessvp->v_rdev; - - so->so_state = 0; - so->so_mode = 0; - - so->so_fsid = sockdev; - so->so_atime = now; - so->so_mtime = now; - so->so_ctime = now; - so->so_count = 0; - - so->so_family = domain; - so->so_type = type; - so->so_protocol = protocol; - so->so_pushcnt = 0; - - so->so_options = 0; - so->so_linger.l_onoff = 0; - so->so_linger.l_linger = 0; - so->so_sndbuf = 0; - so->so_rcvbuf = 0; - so->so_error = 0; - so->so_delayed_error = 0; - - ASSERT(so->so_oobmsg == NULL); - so->so_oobcnt = 0; - so->so_oobsigcnt = 0; - so->so_pgrp = 0; - so->so_provinfo = NULL; - - so->so_laddr_sa = (struct sockaddr *)&ss->ss_laddr; - so->so_faddr_sa = (struct sockaddr *)&ss->ss_faddr; - so->so_laddr_maxlen = so->so_faddr_maxlen = sizeof (ss->ss_laddr); - so->so_laddr_len = so->so_faddr_len = 0; - so->so_eaddr_mp = NULL; - so->so_delayed_error = 0; - - so->so_peercred = NULL; - - ASSERT(so->so_ack_mp == NULL); - 
ASSERT(so->so_conn_ind_head == NULL); - ASSERT(so->so_conn_ind_tail == NULL); - ASSERT(so->so_ux_bound_vp == NULL); - ASSERT(so->so_unbind_mp == NULL); - - vn_reinit(vp); - vp->v_vfsp = rootvfs; - vp->v_type = VSOCK; - vp->v_rdev = so->so_dev; - - so->so_ops = &sosdp_sonodeops; - - ss->ss_rxqueued = 0; - bzero(&ss->ss_poll_list, sizeof (ss->ss_poll_list)); - - vn_exists(vp); - return (vp); -} - -/* - * Creates a sdp socket data structure. - * tso is non-NULL if it's passive open. - */ -struct sonode * -sosdp_create(vnode_t *accessvp, int domain, int type, int protocol, - int version, struct sonode *tso, int *errorp) -{ - struct sonode *so; - vnode_t *vp; - int error; - int soflags; - cred_t *cr; - - dprint(4, ("Inside sosdp_create: domain:%d proto:%d type:%d", - domain, protocol, type)); - - if (is_system_labeled()) { - *errorp = EOPNOTSUPP; - return (NULL); - } - - if (version == SOV_STREAM) { - *errorp = EINVAL; - return (NULL); - } - ASSERT(accessvp != NULL); + int error = 0; + sdp_sockbuf_limits_t sbl; + sdp_upcalls_t *upcalls; - /* - * We only support one type of SDP socket. Let sotpi_create() - * handle all other cases, such as raw socket. - */ - if (!(domain == AF_INET || domain == AF_INET6) || - !(type == SOCK_STREAM)) { - return (sotpi_create(accessvp, domain, type, protocol, version, - NULL, errorp)); - } + if (pso != NULL) { + /* passive open, just inherit settings from parent */ - if (tso == NULL) { - vp = sosdp_makevp(accessvp, domain, type, protocol, KM_SLEEP); - ASSERT(vp != NULL); + mutex_enter(&so->so_lock); - soflags = FREAD | FWRITE; - } else { - vp = sosdp_makevp(accessvp, domain, type, protocol, - KM_NOSLEEP); - if (vp == NULL) { - /* - * sosdp_makevp() only fails when there is no memory. - */ - *errorp = ENOMEM; - return (NULL); - } - soflags = FREAD | FWRITE | SO_ACCEPTOR; - } - /* - * This function may be called in interrupt context, and CRED() - * will be NULL. In this case, pass in kcred to VOP_OPEN(). 
- */ - if ((cr = CRED()) == NULL) - cr = kcred; - if ((error = VOP_OPEN(&vp, soflags, cr, NULL)) != 0) { - VN_RELE(vp); - *errorp = error; - return (NULL); - } - so = VTOSO(vp); + so->so_state |= (SS_ISBOUND | SS_ISCONNECTED | + (pso->so_state & SS_ASYNC)); + sosdp_so_inherit(pso, so); + so->so_proto_props = pso->so_proto_props; - dprint(2, ("sosdp_create: %p domain %d type %d\n", (void *)so, - domain, type)); + mutex_exit(&so->so_lock); - if (version == SOV_DEFAULT) { - version = so_default_version; + return (0); } - so->so_version = (short)version; - return (so); -} + upcalls = &sosdp_sock_upcalls; -/* - * Free SDP socket data structure. - * Closes incoming connections which were never accepted, frees - * resources. - */ -void -sosdp_free(struct sonode *so) -{ - struct sonode *nso; - mblk_t *mp; + so->so_proto_handle = (sock_lower_handle_t)sdp_create(so, NULL, + so->so_family, SDP_CAN_BLOCK, upcalls, &sbl, cr, &error); + if (so->so_proto_handle == NULL) + return (ENOMEM); - dprint(3, ("sosdp_free: so:%p priv:%p", (void *)so, so->so_priv)); + so->so_rcvbuf = sbl.sbl_rxbuf; + so->so_rcvlowat = sbl.sbl_rxlowat; + so->so_sndbuf = sbl.sbl_txbuf; + so->so_sndlowat = sbl.sbl_txlowat; - mutex_enter(&so->so_lock); - - /* - * Need to clear these out so that sockfree() doesn't think that - * there's memory in need of free'ing. - */ - so->so_laddr_sa = so->so_faddr_sa = NULL; - so->so_laddr_len = so->so_laddr_maxlen = 0; - so->so_faddr_len = so->so_faddr_maxlen = 0; - - while ((mp = so->so_conn_ind_head) != NULL) { - so->so_conn_ind_head = mp->b_next; - mutex_exit(&so->so_lock); - mp->b_next = NULL; - nso = *(struct sonode **)mp->b_rptr; - - (void) VOP_CLOSE(SOTOV(nso), 0, 1, 0, CRED(), NULL); - vn_invalid(SOTOV(nso)); - VN_RELE(SOTOV(nso)); - - freeb(mp); - mutex_enter(&so->so_lock); - } - so->so_conn_ind_tail = NULL; - so->so_state &= ~SS_HASCONNIND; - mutex_exit(&so->so_lock); - - sockfree(so); + return (error); } /* * Accept incoming connection. 
*/ +/* ARGSUSED */ static int -sosdp_accept(struct sonode *lso, int fflag, struct sonode **nsop) +sosdp_accept(struct sonode *lso, int fflag, struct cred *cr, + struct sonode **nsop) { int error = 0; - mblk_t *mp; struct sonode *nso; - dprint(3, ("sosdp_accept: so:%p priv:%p", (void *)lso, - lso->so_priv)); + dprint(3, ("sosdp_accept: so:%p so_proto_handle:%p", (void *)lso, + (void *)lso->so_proto_handle)); if (!(lso->so_state & SS_ACCEPTCONN)) { /* @@ -429,50 +186,36 @@ sosdp_accept(struct sonode *lso, int fflag, struct sonode **nsop) eprintsoline(lso, EINVAL); return (EINVAL); } - /* * Returns right away if socket is nonblocking. */ - error = sowaitconnind(lso, fflag, &mp); + error = so_acceptq_dequeue(lso, (fflag & (FNONBLOCK|FNDELAY)), &nso); if (error != 0) { eprintsoline(lso, error); - dprint(4, ("sosdp_accept: failed <%d>:lso:%p prv:%p", - error, (void *)lso, lso->so_priv)); + dprint(4, ("sosdp_accept: failed %d:lso:%p so_proto_handle:%p", + error, (void *)lso, (void *)lso->so_proto_handle)); return (error); } - nso = *(struct sonode **)mp->b_rptr; - freeb(mp); - - mutex_enter(&lso->so_lock); - ASSERT(SOTOSDO(lso)->ss_rxqueued > 0); - --SOTOSDO(lso)->ss_rxqueued; - mutex_exit(&lso->so_lock); - - - /* - * accept() needs remote address right away. - */ - (void) sosdp_getpeername(nso); dprint(2, ("sosdp_accept: new %p\n", (void *)nso)); - *nsop = nso; + return (0); } /* * Bind local endpoint. 
*/ +/* ARGSUSED */ int sosdp_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen, - int flags) + int flags, struct cred *cr) { - int error = 0; + int error = 0; if (!(flags & _SOBIND_LOCK_HELD)) { mutex_enter(&so->so_lock); so_lock_single(so); /* Set SOLOCKED */ - /* LINTED - statement has no conseq */ } else { ASSERT(MUTEX_HELD(&so->so_lock)); ASSERT(so->so_flag & SOLOCKED); @@ -487,6 +230,7 @@ sosdp_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen, eprintsoline(so, error); goto done; } + /* * X/Open requires this check */ @@ -496,16 +240,17 @@ sosdp_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen, } /* - * Protocol module does address family checks. + * Protocol module does address family checks */ mutex_exit(&so->so_lock); - error = sdp_bind(so->so_priv, name, namelen); + error = sdp_bind((struct sdp_conn_struct_t *)so->so_proto_handle, + name, namelen); mutex_enter(&so->so_lock); + if (error == 0) { so->so_state |= SS_ISBOUND; - /* LINTED - statement has no conseq */ } else { eprintsoline(so, error); } @@ -513,7 +258,6 @@ done: if (!(flags & _SOBIND_LOCK_HELD)) { so_unlock_single(so, SOLOCKED); mutex_exit(&so->so_lock); - /* LINTED - statement has no conseq */ } else { /* If the caller held the lock don't release it here */ ASSERT(MUTEX_HELD(&so->so_lock)); @@ -525,12 +269,12 @@ done: /* * Turn socket into a listen socket. */ +/* ARGSUSED */ static int -sosdp_listen(struct sonode *so, int backlog) +sosdp_listen(struct sonode *so, int backlog, struct cred *cr) { int error = 0; - mutex_enter(&so->so_lock); so_lock_single(so); @@ -541,30 +285,9 @@ sosdp_listen(struct sonode *so, int backlog) if (so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED | SS_ISDISCONNECTING | SS_CANTRCVMORE | SS_CANTSENDMORE)) { error = EINVAL; - eprintsoline(so, error); + eprintsoline(so, EINVAL); goto done; } - - if (backlog < 0) { - backlog = 0; - } - - /* - * Use the same qlimit as in BSD. 
BSD checks the qlimit - * before queuing the next connection implying that a - * listen(sock, 0) allows one connection to be queued. - * BSD also uses 1.5 times the requested backlog. - * - * XNS Issue 4 required a strict interpretation of the backlog. - * This has been waived subsequently for Issue 4 and the change - * incorporated in XNS Issue 5. So we aren't required to do - * anything special for XPG apps. - */ - if (backlog >= (INT_MAX - 1) / 3) - backlog = INT_MAX; - else - backlog = backlog * 3 / 2 + 1; - /* * If listen() is only called to change backlog, we don't * need to notify protocol module. @@ -576,13 +299,13 @@ sosdp_listen(struct sonode *so, int backlog) mutex_exit(&so->so_lock); - error = sdp_listen(so->so_priv, backlog); + error = sdp_listen((struct sdp_conn_struct_t *)so->so_proto_handle, + backlog); mutex_enter(&so->so_lock); if (error == 0) { - so->so_state |= (SS_ACCEPTCONN|SS_ISBOUND); + so->so_state |= (SS_ACCEPTCONN | SS_ISBOUND); so->so_backlog = backlog; - /* LINTED - statement has no conseq */ } else { eprintsoline(so, error); } @@ -599,13 +322,9 @@ done: /*ARGSUSED*/ static int sosdp_connect(struct sonode *so, const struct sockaddr *name, - socklen_t namelen, int fflag, int flags) + socklen_t namelen, int fflag, int flags, struct cred *cr) { - int error; - - ASSERT(so->so_type == SOCK_STREAM); - dprint(3, ("sosdp_connect: so:%p priv:%p", (void *)so, - so->so_priv)); + int error = 0; mutex_enter(&so->so_lock); so_lock_single(so); @@ -627,10 +346,10 @@ sosdp_connect(struct sonode *so, const struct sockaddr *name, } /* - * Check for failure of an earlier call + * check for failure of an earlier call */ if (so->so_error != 0) { - error = sogeterr(so); + error = sogeterr(so, B_TRUE); eprintsoline(so, error); goto done; } @@ -647,24 +366,27 @@ sosdp_connect(struct sonode *so, const struct sockaddr *name, goto done; } if (name == NULL || namelen == 0) { - error = EINVAL; - eprintsoline(so, error); + eprintsoline(so, EINVAL); goto done; } 
soisconnecting(so); - mutex_exit(&so->so_lock); - error = sdp_connect(so->so_priv, name, namelen); + error = sdp_connect((struct sdp_conn_struct_t *)so->so_proto_handle, + name, namelen); + mutex_enter(&so->so_lock); if (error == 0) { /* * Allow other threads to access the socket */ - error = sosdp_waitconnected(so, fflag); - dprint(4, ("sosdp_connect: wait on so:%p priv:%p failed:%d", - (void *)so, so->so_priv, error)); + error = sowaitconnected(so, fflag, 0); + dprint(4, + ("sosdp_connect: wait on so:%p " + "so_proto_handle:%p failed:%d", + (void *)so, (void *)so->so_proto_handle, error)); } + switch (error) { case 0: case EINPROGRESS: @@ -684,12 +406,13 @@ done: return (error); } - /* * Receive data. */ +/* ARGSUSED */ int -sosdp_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) +sosdp_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, + struct cred *cr) { int flags, error = 0; int size; @@ -735,7 +458,9 @@ sosdp_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) if (uiop->uio_fmode & (FNDELAY|FNONBLOCK)) { flags |= MSG_DONTWAIT; } - error = sdp_recv(so->so_priv, msg, size, flags, uiop); + error = sdp_recv( + (struct sdp_conn_struct_t *)so->so_proto_handle, msg, + size, flags, uiop); } else { msg->msg_controllen = 0; msg->msg_namelen = 0; @@ -750,8 +475,10 @@ done: /* * Send message. 
*/ +/* ARGSUSED */ static int -sosdp_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) +sosdp_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, + struct cred *cr) { int flags; ssize_t count; @@ -759,8 +486,8 @@ sosdp_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) ASSERT(so->so_type == SOCK_STREAM); - dprint(4, ("sosdp_sendmsg: so:%p priv:%p", - (void *)so, so->so_priv)); + dprint(4, ("sosdp_sendmsg: so:%p so_proto_handle:%p", + (void *)so, (void *)so->so_proto_handle)); flags = msg->msg_flags; @@ -771,12 +498,11 @@ sosdp_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) mutex_enter(&so->so_lock); if (so->so_state & SS_CANTSENDMORE) { mutex_exit(&so->so_lock); - tsignal(curthread, SIGPIPE); return (EPIPE); } if (so->so_error != 0) { - error = sogeterr(so); + error = sogeterr(so, B_TRUE); mutex_exit(&so->so_lock); return (error); } @@ -794,93 +520,83 @@ sosdp_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) } mutex_exit(&so->so_lock); - error = sdp_send(so->so_priv, msg, count, flags, uiop); - if (error == 0) - return (0); + error = sdp_send((struct sdp_conn_struct_t *)so->so_proto_handle, + msg, count, flags, uiop); - mutex_enter(&so->so_lock); - if ((error == EPIPE) && (so->so_state & SS_CANTSENDMORE)) { - /* - * We received shutdown between the time lock was - * lifted and call to sdp_sendmsg(). - */ - mutex_exit(&so->so_lock); - tsignal(curthread, SIGPIPE); - return (EPIPE); - } - mutex_exit(&so->so_lock); return (error); } - /* * Get address of remote node. 
*/ +/* ARGSUSED */ static int -sosdp_getpeername(struct sonode *so) +sosdp_getpeername(struct sonode *so, struct sockaddr *addr, socklen_t *addrlen, + boolean_t accept, struct cred *cr) { - int error; - - if (!(so->so_state & SS_ISCONNECTED)) { - error = ENOTCONN; + if (!accept && !(so->so_state & SS_ISCONNECTED)) { + return (ENOTCONN); } else { - error = sdp_getpeername(so->so_priv, so->so_faddr_sa, - &so->so_faddr_len); + return (sdp_getpeername( + (struct sdp_conn_struct_t *)so->so_proto_handle, + addr, addrlen)); } - return (error); } /* * Get local address. */ +/* ARGSUSED */ static int -sosdp_getsockname(struct sonode *so) +sosdp_getsockname(struct sonode *so, struct sockaddr *addr, socklen_t *addrlen, + struct cred *cr) { - int error; - mutex_enter(&so->so_lock); + if (!(so->so_state & SS_ISBOUND)) { /* * Zero address, except for address family */ - bzero(so->so_laddr_sa, so->so_laddr_maxlen); - - so->so_laddr_len = (so->so_family == AF_INET6) ? - sizeof (struct sockaddr_in6) : sizeof (struct sockaddr_in); - so->so_laddr_sa->sa_family = so->so_family; - error = 0; + if (so->so_family == AF_INET || so->so_family == AF_INET6) { + bzero(addr, *addrlen); + *addrlen = (so->so_family == AF_INET6) ? + sizeof (struct sockaddr_in6) : + sizeof (struct sockaddr_in); + addr->sa_family = so->so_family; + } mutex_exit(&so->so_lock); + return (0); } else { mutex_exit(&so->so_lock); - - error = sdp_getsockname(so->so_priv, so->so_laddr_sa, - &so->so_laddr_len); + return (sdp_getsockname( + (struct sdp_conn_struct_t *)so->so_proto_handle, + addr, addrlen)); } - - return (error); } /* * Called from shutdown(). */ +/* ARGSUSED */ static int -sosdp_shutdown(struct sonode *so, int how) +sosdp_shutdown(struct sonode *so, int how, struct cred *cr) { - struct sdp_sonode *ss = SOTOSDO(so); uint_t state_change; int error = 0; - short wakesig = 0; mutex_enter(&so->so_lock); so_lock_single(so); - /* * Record the current state and then perform any state changes. 
* Then use the difference between the old and new states to * determine which needs to be done. */ state_change = so->so_state; + if (!(state_change & SS_ISCONNECTED)) { + error = ENOTCONN; + goto done; + } switch (how) { case SHUT_RD: @@ -900,21 +616,16 @@ sosdp_shutdown(struct sonode *so, int how) state_change = so->so_state & ~state_change; - if (state_change & SS_CANTRCVMORE) { - wakesig = POLLIN|POLLRDNORM; - sosdp_sendsig(ss, SDPSIG_READ); - } if (state_change & SS_CANTSENDMORE) { - wakesig |= POLLOUT; so->so_state |= SS_ISDISCONNECTING; } - mutex_exit(&so->so_lock); - - pollwakeup(&ss->ss_poll_list, wakesig); + so_notify_shutdown(so); if (state_change & SS_CANTSENDMORE) { - error = sdp_shutdown(so->so_priv, how); + error = sdp_shutdown( + (struct sdp_conn_struct_t *)so->so_proto_handle, how); } + mutex_enter(&so->so_lock); done: so_unlock_single(so, SOLOCKED); @@ -935,7 +646,7 @@ done: /*ARGSUSED*/ static int sosdp_getsockopt(struct sonode *so, int level, int option_name, - void *optval, socklen_t *optlenp, int flags) + void *optval, socklen_t *optlenp, int flags, struct cred *cr) { int error = 0; void *option = NULL; @@ -987,7 +698,7 @@ sosdp_getsockopt(struct sonode *so, int level, int option_name, goto copyout; case SO_ERROR: - value = sogeterr(so); + value = sogeterr(so, B_TRUE); goto copyout; case SO_ACCEPTCONN: @@ -1045,7 +756,8 @@ sosdp_getsockopt(struct sonode *so, int level, int option_name, } optlen = maxlen; mutex_exit(&so->so_lock); - error = sdp_get_opt(so->so_priv, level, option_name, optbuf, &optlen); + error = sdp_get_opt((struct sdp_conn_struct_t *)so->so_proto_handle, + level, option_name, optbuf, &optlen); mutex_enter(&so->so_lock); ASSERT(optlen <= maxlen); if (error != 0) { @@ -1078,43 +790,35 @@ done: /* * Set socket options */ +/* ARGSUSED */ static int sosdp_setsockopt(struct sonode *so, int level, int option_name, - const void *optval, t_uscalar_t optlen) + const void *optval, t_uscalar_t optlen, struct cred *cr) { - int error; void 
*conn = NULL; + int error = 0; - - /* X/Open requires this check */ if (so->so_state & SS_CANTSENDMORE) { return (EINVAL); } - /* Caller allocates aligned optval, or passes null */ - ASSERT(((uintptr_t)optval & (sizeof (t_scalar_t) - 1)) == 0); - - /* No SDP options should be zero-length */ - if (optlen == 0) { - error = EINVAL; - eprintsoline(so, error); - return (error); - } - mutex_enter(&so->so_lock); so_lock_single(so); if (so->so_type == SOCK_STREAM) { - conn = so->so_priv; + conn = (void *)so->so_proto_handle; } dprint(2, ("sosdp_setsockopt (%d) - conn %p %d %d \n", so->so_type, conn, level, option_name)); + if (conn != NULL) { mutex_exit(&so->so_lock); - error = sdp_set_opt(conn, level, option_name, optval, optlen); + error = sdp_set_opt((struct sdp_conn_struct_t *)conn, level, + option_name, optval, optlen); mutex_enter(&so->so_lock); } + /* * Check for SOL_SOCKET options and record their values. * If we know about a SOL_SOCKET parameter and the transport @@ -1244,6 +948,239 @@ done: return (error); } +/* ARGSUSED */ +static int +sosdp_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode, + struct cred *cr, int32_t *rvalp) +{ + int32_t value; + int error, intval; + pid_t pid; + + /* handle socket specific ioctls */ + switch (cmd) { + case FIONBIO: + if (so_copyin((void *)arg, &value, sizeof (int32_t), + (mode & (int)FKIOCTL))) { + return (EFAULT); + } + mutex_enter(&so->so_lock); + if (value != 0) { + so->so_state |= SS_NDELAY; + } else { + so->so_state &= ~SS_NDELAY; + } + mutex_exit(&so->so_lock); + return (0); + + case FIOASYNC: + if (so_copyin((void *)arg, &value, sizeof (int32_t), + (mode & (int)FKIOCTL))) { + return (EFAULT); + } + mutex_enter(&so->so_lock); + + if (value) { + /* Turn on SIGIO */ + so->so_state |= SS_ASYNC; + } else { + /* Turn off SIGIO */ + so->so_state &= ~SS_ASYNC; + } + mutex_exit(&so->so_lock); + return (0); + + case SIOCSPGRP: + case FIOSETOWN: + if (so_copyin((void *)arg, &pid, sizeof (pid_t), + (mode & (int)FKIOCTL))) { 
+ return (EFAULT); + } + mutex_enter(&so->so_lock); + + error = (pid != so->so_pgrp) ? socket_chgpgrp(so, pid) : 0; + mutex_exit(&so->so_lock); + return (error); + + case SIOCGPGRP: + case FIOGETOWN: + if (so_copyout(&so->so_pgrp, (void *)arg, + sizeof (pid_t), (mode & (int)FKIOCTL))) + return (EFAULT); + return (0); + + case SIOCATMARK: + intval = 0; + error = sdp_ioctl( + (struct sdp_conn_struct_t *)so->so_proto_handle, cmd, + &intval, cr); + if (so_copyout(&intval, (void *)arg, sizeof (int), + (mode & (int)FKIOCTL))) + return (EFAULT); + return (0); + + + case SIOCSENABLESDP: { + int32_t enable; + + /* + * System wide enable SDP + */ + + if (so_copyin((void *)arg, &enable, sizeof (int32_t), + mode & (int)FKIOCTL)) + return (EFAULT); + + error = sdp_ioctl( + (struct sdp_conn_struct_t *)so->so_proto_handle, cmd, + &enable, cr); + if (so_copyout(&enable, (void *)arg, + sizeof (int32_t), (mode & (int)FKIOCTL))) + return (EFAULT); + return (0); + } + /* from strioctl */ + case FIONREAD: + /* + * Return number of bytes of data in all data messages + * in queue in "arg". + * For stream socket, amount of available data. + */ + if (so->so_state & SS_ACCEPTCONN) { + intval = 0; + } else { + mutex_enter(&so->so_lock); + intval = sdp_polldata( + (struct sdp_conn_struct_t *)so->so_proto_handle, + SDP_READ); + mutex_exit(&so->so_lock); + } + if (so_copyout(&intval, (void *)arg, sizeof (intval), + (mode & (int)FKIOCTL))) + return (EFAULT); + return (0); + default: + return (EINVAL); + } +} + +/* + * Check socktpi_poll() on why so_lock is not held in this function. 
+ */ +static int +sosdp_poll(struct sonode *so, short events, int anyyet, short *reventsp, + struct pollhead **phpp) +{ + short origevents = events; + int so_state; + + so_state = so->so_state; + + ASSERT(so->so_version != SOV_STREAM); + + if (!(so_state & SS_ISCONNECTED) && (so->so_type == SOCK_STREAM)) { + /* + * Not connected yet - turn off write side events + */ + events &= ~(POLLOUT|POLLWRBAND); + } + + /* + * Check for errors + */ + if (so->so_error != 0 && + ((POLLIN|POLLRDNORM|POLLOUT) & origevents) != 0) { + *reventsp = (POLLIN|POLLRDNORM|POLLOUT) & origevents; + return (0); + } + + *reventsp = 0; + + /* + * Don't mark socket as writable until TX queued data is + * below watermark. + */ + if (so->so_type == SOCK_STREAM) { + if (sdp_polldata( + (struct sdp_conn_struct_t *)so->so_proto_handle, + SDP_XMIT)) { + *reventsp |= POLLOUT & events; + } + } else { + *reventsp = 0; + goto done; + } + + if (sdp_polldata((struct sdp_conn_struct_t *)so->so_proto_handle, + SDP_READ)) { + *reventsp |= (POLLIN|POLLRDNORM) & events; + } + + if ((so_state & SS_CANTRCVMORE) || (so->so_acceptq_head != NULL)) { + *reventsp |= (POLLIN|POLLRDNORM) & events; + } + +done: + if (!*reventsp && !anyyet) { + *phpp = &so->so_poll_list; + } + + return (0); +} + +/* ARGSUSED */ +static int +sosdp_close(struct sonode *so, int flag, struct cred *cr) +{ + int error = 0; + + mutex_enter(&so->so_lock); + so_lock_single(so); + /* + * Need to set flags as there might be ops in progress on + * this socket. + * + * If socket already disconnected/disconnecting, + * don't send signal (again). + */ + soisdisconnected(so, 0); + mutex_exit(&so->so_lock); + + /* + * Initiate connection shutdown. 
+ */ + error = sdp_disconnect((struct sdp_conn_struct_t *)so->so_proto_handle, + flag); + + mutex_enter(&so->so_lock); + so_unlock_single(so, SOLOCKED); + so_notify_disconnected(so, error); + + return (error); +} + +/* ARGSUSED */ +void +sosdp_fini(struct sonode *so, struct cred *cr) +{ + dprint(3, ("sosdp_fini: so:%p so_proto_handle:%p", (void *)so, + (void *)so->so_proto_handle)); + + ASSERT(so->so_ops == &sosdp_sonodeops); + + if (so->so_proto_handle != NULL) + sdp_close((struct sdp_conn_struct_t *)so->so_proto_handle); + so->so_proto_handle = NULL; + + mutex_enter(&so->so_lock); + + so_acceptq_flush(so); + + mutex_exit(&so->so_lock); + + sonode_fini(so); +} + /* * Upcalls from SDP */ @@ -1254,83 +1191,37 @@ done: static void * sdp_sock_newconn(void *parenthandle, void *connind) { - struct sdp_sonode *lss = parenthandle; - struct sonode *lso = &lss->ss_so; + struct sonode *lso = parenthandle; struct sonode *nso; - struct sdp_sonode *nss; - mblk_t *mp; int error; ASSERT(lso->so_state & SS_ACCEPTCONN); - ASSERT(lso->so_priv != NULL); /* closed conn */ + ASSERT(lso->so_proto_handle != NULL); /* closed conn */ ASSERT(lso->so_type == SOCK_STREAM); - dprint(3, ("sosdp_newconn A: so:%p priv:%p", (void *)lso, - lso->so_priv)); + dprint(3, ("sosdp_newconn A: so:%p so_proto_handle:%p", (void *)lso, + (void *)lso->so_proto_handle)); /* * Check current # of queued conns against backlog */ - if (lss->ss_rxqueued >= lso->so_backlog) { - return (NULL); - } - - /* - * Need to create a new socket. 
- */ - mp = allocb(sizeof (connind), BPRI_MED); - if (mp == NULL) { - eprintsoline(lso, ENOMEM); + if (lso->so_rcv_queued >= lso->so_backlog) { return (NULL); } - DB_TYPE(mp) = M_PROTO; - VN_HOLD(lso->so_accessvp); - nso = sosdp_create(lso->so_accessvp, lso->so_family, lso->so_type, - lso->so_protocol, lso->so_version, lso, &error); + nso = socket_newconn(lso, connind, NULL, SOCKET_NOSLEEP, &error); if (nso == NULL) { - VN_RELE(lso->so_accessvp); - freeb(mp); eprintsoline(lso, error); return (NULL); } dprint(2, ("sdp_stream_newconn: new %p\n", (void *)nso)); - nss = SOTOSDO(nso); - - /* - * Inherit socket properties - */ - mutex_enter(&lso->so_lock); - mutex_enter(&nso->so_lock); - nso->so_state |= (SS_ISBOUND | SS_ISCONNECTED | - (lso->so_state & SS_ASYNC)); - sosdp_so_inherit(lss, nss); - nso->so_priv = connind; - - mutex_exit(&nso->so_lock); - - ++lss->ss_rxqueued; - mutex_exit(&lso->so_lock); - - /* - * Copy pointer to new socket to connind queue message - */ - *(struct sonode **)mp->b_wptr = nso; - mp->b_wptr += sizeof (nso); - - /* - * Wake people who're waiting incoming conns. Note that - * soqueueconnind gets so_lock. 
- */ - soqueueconnind(lso, mp); - pollwakeup(&lss->ss_poll_list, POLLIN|POLLRDNORM); + (void) so_acceptq_enqueue(lso, nso); mutex_enter(&lso->so_lock); - sosdp_sendsig(lss, SDPSIG_READ); - mutex_exit(&lso->so_lock); - return (nss); + so_notify_newconn(lso); + return (nso); } /* @@ -1339,26 +1230,19 @@ sdp_sock_newconn(void *parenthandle, void *connind) static void sdp_sock_connected(void *handle) { - struct sdp_sonode *ss = handle; - struct sonode *so = &ss->ss_so; + struct sonode *so = handle; ASSERT(so->so_type == SOCK_STREAM); - dprint(3, ("sosdp_connected C: so:%p priv:%p", (void *)so, - so->so_priv)); + dprint(3, ("sosdp_connected C: so:%p so_proto_handle:%p", (void *)so, + (void *)so->so_proto_handle)); mutex_enter(&so->so_lock); - ASSERT(so->so_priv); /* closed conn */ + ASSERT(so->so_proto_handle); /* closed conn */ ASSERT(!(so->so_state & SS_ACCEPTCONN)); soisconnected(so); - sosdp_sendsig(ss, SDPSIG_WRITE); - mutex_exit(&so->so_lock); - - /* - * Wake ones who're waiting for conn to become established. - */ - pollwakeup(&ss->ss_poll_list, POLLOUT); + so_notify_connected(so); } /* @@ -1368,32 +1252,17 @@ sdp_sock_connected(void *handle) static void sdp_sock_disconnected(void *handle, int error) { - int event = 0; - struct sdp_sonode *ss = handle; - struct sonode *so = &ss->ss_so; + struct sonode *so = handle; ASSERT(so->so_type == SOCK_STREAM); - dprint(2, ("sosdp_disconnected C: so:%p priv:%p error:%d", - (void *)so, so->so_priv, error)); + dprint(2, ("sosdp_disconnected C: so:%p so_proto_handle:%p error:%d", + (void *)so, (void *)so->so_proto_handle, error)); mutex_enter(&so->so_lock); - ASSERT(so->so_priv != NULL); /* closed conn */ - - /* - * If socket is already disconnected/disconnecting, - * don't (re)send signal. 
- */ - if (!(so->so_state & SS_CANTRCVMORE)) - event |= SDPSIG_READ; - if (!(so->so_state & SS_CANTSENDMORE)) - event |= SDPSIG_WRITE; - if (event != 0) - sosdp_sendsig(ss, event); + ASSERT(so->so_proto_handle != NULL); /* closed conn */ soisdisconnected(so, error); - mutex_exit(&so->so_lock); - - pollwakeup(&ss->ss_poll_list, POLLIN|POLLRDNORM|POLLOUT); + so_notify_disconnected(so, error); } /* @@ -1403,15 +1272,12 @@ sdp_sock_disconnected(void *handle, int error) static int sdp_sock_recv(void *handle, mblk_t *mp, int flags) { - struct sdp_sonode *ss = handle; - struct sonode *so = &ss->ss_so; + struct sonode *so = handle; ASSERT(so->so_type == SOCK_STREAM); mutex_enter(&so->so_lock); - sosdp_sendsig(ss, SDPSIG_READ); - mutex_exit(&so->so_lock); - pollwakeup(&ss->ss_poll_list, POLLIN|POLLRDNORM); + so_notify_data(so, 0); return (so->so_rcvbuf); } @@ -1422,13 +1288,12 @@ sdp_sock_recv(void *handle, mblk_t *mp, int flags) static void sdp_sock_xmitted(void *handle, int writeable) { - struct sdp_sonode *ss = handle; - struct sonode *so = &ss->ss_so; + struct sonode *so = handle; - dprint(4, ("sosdp_sock_xmitted: so:%p priv:%p txq:%d", - (void *)so, so->so_priv, writeable)); + dprint(4, ("sosdp_sock_xmitted: so:%p so_proto_handle:%p txq:%d", + (void *)so, (void *)so->so_proto_handle, writeable)); mutex_enter(&so->so_lock); - ASSERT(so->so_priv != NULL); /* closed conn */ + ASSERT(so->so_proto_handle != NULL); /* closed conn */ /* @@ -1436,9 +1301,7 @@ sdp_sock_xmitted(void *handle, int writeable) * watermark. 
*/ if (!writeable) { - sosdp_sendsig(ss, SDPSIG_WRITE); - mutex_exit(&so->so_lock); - pollwakeup(&ss->ss_poll_list, POLLOUT); + so_notify_writable(so); } else { mutex_exit(&so->so_lock); } @@ -1451,16 +1314,14 @@ sdp_sock_xmitted(void *handle, int writeable) static void sdp_sock_urgdata(void *handle) { - struct sdp_sonode *ss = handle; - - ASSERT(ss->ss_so.so_type == SOCK_STREAM); + struct sonode *so = handle; - mutex_enter(&ss->ss_so.so_lock); + ASSERT(so->so_type == SOCK_STREAM); - ASSERT(ss->ss_so.so_priv != NULL); /* closed conn */ - sosdp_sendsig(ss, SDPSIG_URG); + mutex_enter(&so->so_lock); - mutex_exit(&ss->ss_so.so_lock); + ASSERT(so->so_proto_handle != NULL); /* closed conn */ + so_notify_oobsig(so); } /* @@ -1469,31 +1330,26 @@ sdp_sock_urgdata(void *handle) static void sdp_sock_ordrel(void *handle) { - struct sdp_sonode *ss = handle; - /* LINTED */ - struct sonode *so = &ss->ss_so; - - ASSERT(ss->ss_so.so_type == SOCK_STREAM); - - dprint(4, ("sdp_sock_ordrel : so:%p, priv:%p", - (void *)so, so->so_priv)); - mutex_enter(&ss->ss_so.so_lock); - socantrcvmore(&ss->ss_so); - mutex_exit(&ss->ss_so.so_lock); - pollwakeup(&ss->ss_poll_list, POLLIN|POLLRDNORM); + struct sonode *so = handle; + + ASSERT(so->so_type == SOCK_STREAM); + + dprint(4, ("sdp_sock_ordrel : so:%p, so_proto_handle:%p", + (void *)so, (void *)so->so_proto_handle)); + mutex_enter(&so->so_lock); + socantrcvmore(so); + so_notify_eof(so); } static void sdp_sock_connfail(void *handle, int error) { + struct sonode *so = handle; - struct sdp_sonode *ss = handle; - struct sonode *so = &ss->ss_so; - - dprint(3, ("sosdp_conn Failed: so:%p priv:%p", (void *)so, - so->so_priv)); + dprint(3, ("sosdp_conn Failed: so:%p so_proto_handle:%p", (void *)so, + (void *)so->so_proto_handle)); mutex_enter(&so->so_lock); - ASSERT(so->so_priv != NULL); /* closed conn */ + ASSERT(so->so_proto_handle != NULL); /* closed conn */ so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_error = 
(ushort_t)error; mutex_exit(&so->so_lock); diff --git a/usr/src/uts/common/inet/sockmods/socksdp.h b/usr/src/uts/common/inet/sockmods/socksdp.h new file mode 100644 index 0000000000..ba6bd109e8 --- /dev/null +++ b/usr/src/uts/common/inet/sockmods/socksdp.h @@ -0,0 +1,44 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SOCKSDP_H_ +#define _SOCKSDP_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +extern sonodeops_t sosdp_sonodeops; +extern sdp_upcalls_t sosdp_sock_upcalls; + +extern void sosdp_fini(struct sonode *, struct cred *); +extern void sosdp_so_inherit(struct sonode *, struct sonode *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SOCKSDP_H_ */ diff --git a/usr/src/uts/common/inet/sockmods/socksdpsubr.c b/usr/src/uts/common/inet/sockmods/socksdpsubr.c new file mode 100644 index 0000000000..8917878ec5 --- /dev/null +++ b/usr/src/uts/common/inet/sockmods/socksdpsubr.c @@ -0,0 +1,60 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/types.h> +#include <sys/t_lock.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/debug.h> +#include <sys/errno.h> +#include <sys/strsubr.h> +#include <sys/cmn_err.h> +#include <sys/sysmacros.h> + +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/strsun.h> +#include <sys/signal.h> + +#include <inet/sdp_itf.h> +#include "socksdp.h" + +/* + * Inherit socket properties + */ +void +sosdp_so_inherit(struct sonode *lso, struct sonode *nso) +{ + nso->so_options = lso->so_options & (SO_DEBUG|SO_REUSEADDR| + SO_KEEPALIVE|SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK| + SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER); + nso->so_sndbuf = lso->so_sndbuf; + nso->so_rcvbuf = lso->so_rcvbuf; + nso->so_pgrp = lso->so_pgrp; + + nso->so_rcvlowat = lso->so_rcvlowat; + nso->so_sndlowat = lso->so_sndlowat; +} diff --git a/usr/src/uts/common/inet/spdsock.h b/usr/src/uts/common/inet/spdsock.h index a5f18bd1c4..7622e56a45 100644 --- a/usr/src/uts/common/inet/spdsock.h +++ b/usr/src/uts/common/inet/spdsock.h @@ -26,8 +26,6 @@ #ifndef _INET_SPDSOCK_H #define _INET_SPDSOCK_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/netstack.h> #ifdef __cplusplus @@ -112,8 +110,7 @@ 
extern uint_t spdsock_max_optsize; extern int spdsock_opt_get(queue_t *, int, int, uchar_t *); extern int spdsock_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *, - uint_t *, uchar_t *, void *, cred_t *, - mblk_t *); + uint_t *, uchar_t *, void *, cred_t *, mblk_t *); #ifdef __cplusplus } diff --git a/usr/src/uts/common/inet/squeue.c b/usr/src/uts/common/inet/squeue.c index 559abd9178..396068a2d9 100644 --- a/usr/src/uts/common/inet/squeue.c +++ b/usr/src/uts/common/inet/squeue.c @@ -1240,3 +1240,142 @@ squeue_getprivate(squeue_t *sqp, sqprivate_t p) return (&sqp->sq_private[p]); } + +/* ARGSUSED */ +void +squeue_wakeup_conn(void *arg, mblk_t *mp, void *arg2) +{ + conn_t *connp = (conn_t *)arg; + squeue_t *sqp = connp->conn_sqp; + + /* + * Mark the squeue as paused before waking up the thread stuck + * in squeue_synch_enter(). + */ + mutex_enter(&sqp->sq_lock); + sqp->sq_state |= SQS_PAUSE; + + /* + * Notify the thread that it's OK to proceed; that is done by + * clearing the MSGWAITSYNC flag. The synch thread will free the mblk. + */ + ASSERT(mp->b_flag & MSGWAITSYNC); + mp->b_flag &= ~MSGWAITSYNC; + cv_broadcast(&connp->conn_sq_cv); + + /* + * We are doing something on behalf of another thread, so we have to + * pause and wait until it finishes. + */ + while (sqp->sq_state & SQS_PAUSE) { + cv_wait(&sqp->sq_synch_cv, &sqp->sq_lock); + } + mutex_exit(&sqp->sq_lock); +} + +/* ARGSUSED */ +int +squeue_synch_enter(squeue_t *sqp, void *arg, uint8_t tag) +{ + conn_t *connp = (conn_t *)arg; + + mutex_enter(&sqp->sq_lock); + if (sqp->sq_first == NULL && !(sqp->sq_state & SQS_PROC)) { + /* + * We are OK to proceed if the squeue is empty, and + * no one owns the squeue. + * + * The caller won't own the squeue as this is called from the + * application. 
+ */ + ASSERT(sqp->sq_run == NULL); + + sqp->sq_state |= SQS_PROC; + sqp->sq_run = curthread; + mutex_exit(&sqp->sq_lock); + +#if SQUEUE_DEBUG + sqp->sq_curmp = NULL; + sqp->sq_curproc = NULL; + sqp->sq_connp = connp; +#endif + connp->conn_on_sqp = B_TRUE; + return (0); + } else { + mblk_t *mp; + + mp = allocb(0, BPRI_MED); + if (mp == NULL) { + mutex_exit(&sqp->sq_lock); + return (ENOMEM); + } + + /* + * We mark the mblk as awaiting synchronous squeue access + * by setting the MSGWAITSYNC flag. Once squeue_wakeup_conn + * fires, MSGWAITSYNC is cleared, at which point we know we + * have exclusive access. + */ + mp->b_flag |= MSGWAITSYNC; + + CONN_INC_REF(connp); + SET_SQUEUE(mp, squeue_wakeup_conn, connp); + ENQUEUE_CHAIN(sqp, mp, mp, 1); + + ASSERT(sqp->sq_run != curthread); + + /* Wait until the enqueued mblk get processed. */ + while (mp->b_flag & MSGWAITSYNC) + cv_wait(&connp->conn_sq_cv, &sqp->sq_lock); + mutex_exit(&sqp->sq_lock); + + freeb(mp); + + return (0); + } +} + +/* ARGSUSED */ +void +squeue_synch_exit(squeue_t *sqp, void *arg) +{ + conn_t *connp = (conn_t *)arg; + + mutex_enter(&sqp->sq_lock); + if (sqp->sq_run == curthread) { + ASSERT(sqp->sq_state & SQS_PROC); + + sqp->sq_state &= ~SQS_PROC; + sqp->sq_run = NULL; + connp->conn_on_sqp = B_FALSE; + + if (sqp->sq_first == NULL) { + mutex_exit(&sqp->sq_lock); + } else { + /* + * If this was a normal thread, then it would + * (most likely) continue processing the pending + * requests. Since the just completed operation + * was executed synchronously, the thread should + * not be delayed. To compensate, wake up the + * worker thread right away when there are outstanding + * requests. + */ + sqp->sq_awaken = lbolt; + cv_signal(&sqp->sq_worker_cv); + mutex_exit(&sqp->sq_lock); + } + } else { + /* + * The caller doesn't own the squeue, clear the SQS_PAUSE flag, + * and wake up the squeue owner, such that owner can continue + * processing. 
+ */ + ASSERT(sqp->sq_state & SQS_PAUSE); + sqp->sq_state &= ~SQS_PAUSE; + + /* There should be only one thread blocking on sq_synch_cv. */ + cv_signal(&sqp->sq_synch_cv); + mutex_exit(&sqp->sq_lock); + } +} diff --git a/usr/src/uts/common/inet/tcp.h b/usr/src/uts/common/inet/tcp.h index 3a557048d6..76d1864d62 100644 --- a/usr/src/uts/common/inet/tcp.h +++ b/usr/src/uts/common/inet/tcp.h @@ -35,6 +35,7 @@ extern "C" { #include <netinet/ip6.h> #include <netinet/tcp.h> #include <sys/socket.h> +#include <sys/socket_proto.h> #include <sys/sodirect.h> #include <sys/multidata.h> #include <sys/md5.h> @@ -201,7 +202,6 @@ typedef struct tcp_s { #define TCP_OFO_FIN_VALID 0x8 /* Has TCP received an out of order FIN? */ - int32_t tcp_xmit_hiwater; /* Send buffer high water mark. */ timeout_id_t tcp_timer_tid; /* Control block for timer service */ uchar_t tcp_timer_backoff; /* Backoff shift count. */ @@ -340,7 +340,10 @@ typedef struct tcp_s { struct tcp_s *tcp_listener; /* Our listener */ - int32_t tcp_xmit_lowater; /* Send buffer low water mark. */ + size_t tcp_xmit_hiwater; /* Send buffer high water mark. */ + size_t tcp_xmit_lowater; /* Send buffer low water mark. */ + size_t tcp_recv_hiwater; /* Recv high water mark */ + size_t tcp_recv_lowater; /* Recv low water mark */ uint32_t tcp_irs; /* Initial recv seq num */ uint32_t tcp_fss; /* Final/fin send seq num */ @@ -491,6 +494,7 @@ typedef struct tcp_s { struct tcp_s *tcp_acceptor_hash; /* Acceptor hash chain */ struct tcp_s **tcp_ptpahn; /* Pointer to previous accept hash next. 
*/ struct tcp_s *tcp_bind_hash; /* Bind hash chain */ + struct tcp_s *tcp_bind_hash_port; /* tcp_t's bound to the same lport */ struct tcp_s **tcp_ptpbhn; boolean_t tcp_ire_ill_check_done; @@ -599,6 +603,15 @@ typedef struct tcp_s { boolean_t tcp_flow_stopped; /* + * The socket generation number is bumped when an outgoing connection + * attempt is made, and it is sent up to the socket when the + * connection was successfully established, or an error occurred. The + * generation is used to ensure that the socket does not miss the + * asynchronous notification. + */ + sock_connid_t tcp_connid; + + /* * tcp_sodirect is used by tcp on the receive side to push mblk_t(s) * directly to sockfs. Also, to schedule asynchronous copyout directly * to a pending user-land uio buffer. diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c index 4bb50d2344..ce7d9fb395 100644 --- a/usr/src/uts/common/inet/tcp/tcp.c +++ b/usr/src/uts/common/inet/tcp/tcp.c @@ -58,6 +58,7 @@ #include <sys/errno.h> #include <sys/signal.h> #include <sys/socket.h> +#include <sys/socketvar.h> #include <sys/sockio.h> #include <sys/isa_defs.h> #include <sys/md5.h> @@ -78,7 +79,7 @@ #include <inet/ip_impl.h> #include <inet/ip6.h> #include <inet/ip_ndp.h> -#include <inet/mi.h> +#include <inet/proto_set.h> #include <inet/mib2.h> #include <inet/nd.h> #include <inet/optcom.h> @@ -386,11 +387,8 @@ kstat_t *tcp_g_kstat; * tcp write side. */ #define CALL_IP_WPUT(connp, q, mp) { \ - tcp_stack_t *tcps; \ - \ - tcps = connp->conn_netstack->netstack_tcp; \ ASSERT(((q)->q_flag & QREADR) == 0); \ - TCP_DBGSTAT(tcps, tcp_ip_output); \ + TCP_DBGSTAT(connp->conn_netstack->netstack_tcp, tcp_ip_output); \ connp->conn_send(connp, (mp), (q), IP_WPUT); \ } @@ -650,6 +648,19 @@ typedef struct tcp_opt_s { } tcp_opt_t; /* + * TCP option struct passing information between listener and eager. 
+ */ +struct tcp_options { + uint_t to_flags; + ssize_t to_boundif; /* IPV6_BOUND_IF */ + sock_upper_handle_t to_handle; +}; + +#define TCPOPT_BOUNDIF 0x00000001 /* set IPV6_BOUND_IF */ +#define TCPOPT_RECVPKTINFO 0x00000002 /* set IPV6_RECVPKTINFO */ +#define TCPOPT_UPPERHANDLE 0x00000004 /* set upper handle */ + +/* * RFC1323-recommended phrasing of TSTAMP option, for easier parsing */ @@ -742,6 +753,7 @@ void tcp_input(void *arg, mblk_t *mp, void *arg2); void tcp_rput_data(void *arg, mblk_t *mp, void *arg2); static void tcp_close_output(void *arg, mblk_t *mp, void *arg2); void tcp_output(void *arg, mblk_t *mp, void *arg2); +void tcp_output_urgent(void *arg, mblk_t *mp, void *arg2); static void tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2); static void tcp_timer_handler(void *arg, mblk_t *mp, void *arg2); static void tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2); @@ -750,7 +762,7 @@ static void tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2); /* Prototype for TCP functions */ static void tcp_random_init(void); int tcp_random(void); -static void tcp_accept(tcp_t *tcp, mblk_t *mp); +static void tcp_tli_accept(tcp_t *tcp, mblk_t *mp); static void tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager); static int tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp); @@ -761,12 +773,12 @@ static void tcp_closei_local(tcp_t *tcp); static void tcp_close_detached(tcp_t *tcp); static boolean_t tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, mblk_t *idmp, mblk_t **defermp); -static void tcp_connect(tcp_t *tcp, mblk_t *mp); -static void tcp_connect_ipv4(tcp_t *tcp, mblk_t *mp, ipaddr_t *dstaddrp, - in_port_t dstport, uint_t srcid); -static void tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp, +static void tcp_tpi_connect(tcp_t *tcp, mblk_t *mp); +static int tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, + in_port_t dstport, uint_t srcid, cred_t *cr, pid_t pid); +static int tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp, 
in_port_t dstport, uint32_t flowinfo, uint_t srcid, - uint32_t scope_id); + uint32_t scope_id, cred_t *cr, pid_t pid); static int tcp_clean_death(tcp_t *tcp, int err, uint8_t tag); static void tcp_def_q_set(tcp_t *tcp, mblk_t *mp); static void tcp_disconnect(tcp_t *tcp, mblk_t *mp); @@ -803,11 +815,9 @@ static int tcp_header_init_ipv6(tcp_t *tcp); int tcp_init(tcp_t *tcp, queue_t *q); static int tcp_init_values(tcp_t *tcp); static mblk_t *tcp_ip_advise_mblk(void *addr, int addr_len, ipic_t **ipic); -static mblk_t *tcp_ip_bind_mp(tcp_t *tcp, t_scalar_t bind_prim, - t_scalar_t addr_length); static void tcp_ip_ire_mark_advice(tcp_t *tcp); static void tcp_ip_notify(tcp_t *tcp); -static mblk_t *tcp_ire_mp(mblk_t *mp); +static mblk_t *tcp_ire_mp(mblk_t **mpp); static void tcp_iss_init(tcp_t *tcp); static void tcp_keepalive_killer(void *arg); static int tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt); @@ -816,8 +826,8 @@ static int tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, int *do_disconnectp, int *t_errorp, int *sys_errorp); static boolean_t tcp_allow_connopt_set(int level, int name); int tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr); -int tcp_opt_get(queue_t *q, int level, int name, uchar_t *ptr); -int tcp_opt_set(queue_t *q, uint_t optset_context, int level, +int tcp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr); +int tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk); @@ -842,7 +852,8 @@ static void tcp_reinit_values(tcp_t *tcp); static void tcp_report_item(mblk_t *mp, tcp_t *tcp, int hashval, tcp_t *thisstream, cred_t *cr); -static uint_t tcp_rcv_drain(queue_t *q, tcp_t *tcp); +static uint_t tcp_rwnd_reopen(tcp_t *tcp); +static uint_t tcp_rcv_drain(tcp_t *tcp); static void tcp_sack_rxmit(tcp_t *tcp, uint_t *flags); static boolean_t tcp_send_rst_chk(tcp_stack_t *); static void tcp_ss_rexmit(tcp_t 
*tcp); @@ -868,7 +879,8 @@ static in_port_t tcp_update_next_port(in_port_t port, const tcp_t *tcp, boolean_t random); static in_port_t tcp_get_next_priv_port(const tcp_t *); static void tcp_wput_sock(queue_t *q, mblk_t *mp); -void tcp_wput_accept(queue_t *q, mblk_t *mp); +static void tcp_wput_fallback(queue_t *q, mblk_t *mp); +void tcp_tpi_accept(queue_t *q, mblk_t *mp); static void tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent); static void tcp_wput_flush(tcp_t *tcp, mblk_t *mp); static void tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp); @@ -901,9 +913,7 @@ static boolean_t tcp_check_policy(tcp_t *, mblk_t *, ipha_t *, ip6_t *, boolean_t, boolean_t); static void tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, boolean_t ipsec_mctl); -static mblk_t *tcp_setsockopt_mp(int level, int cmd, - char *opt, int optlen); -static int tcp_build_hdrs(queue_t *, tcp_t *); +static int tcp_build_hdrs(tcp_t *); static void tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq, uint32_t seg_ack, int seg_len, tcph_t *tcph); @@ -943,7 +953,7 @@ static int tcp_squeue_switch(int); static int tcp_open(queue_t *, dev_t *, int, int, cred_t *, boolean_t); static int tcp_openv4(queue_t *, dev_t *, int, int, cred_t *); static int tcp_openv6(queue_t *, dev_t *, int, int, cred_t *); -static int tcp_close(queue_t *, int); +static int tcp_tpi_close(queue_t *, int); static int tcpclose_accept(queue_t *); static void tcp_squeue_add(squeue_t *); @@ -958,6 +968,19 @@ extern void tcp_kssl_input(tcp_t *, mblk_t *); void tcp_eager_kill(void *arg, mblk_t *mp, void *arg2); void tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2); +static int tcp_accept(sock_lower_handle_t, sock_lower_handle_t, + sock_upper_handle_t, cred_t *); +static int tcp_listen(sock_lower_handle_t, int, cred_t *); +static int tcp_post_ip_bind(tcp_t *, mblk_t *, int); +static int tcp_do_listen(conn_t *, int, cred_t *); +static int tcp_do_connect(conn_t *, const struct sockaddr *, socklen_t, + cred_t *, pid_t); 
+static int tcp_do_bind(conn_t *, struct sockaddr *, socklen_t, cred_t *, + boolean_t); +static int tcp_do_unbind(conn_t *); +static int tcp_bind_check(conn_t *, struct sockaddr *, socklen_t, cred_t *, + boolean_t); + /* * Routines related to the TCP_IOC_ABORT_CONN ioctl command. * @@ -1001,11 +1024,11 @@ static struct module_info tcp_winfo = { * We have separate open functions for the /dev/tcp and /dev/tcp6 devices. */ struct qinit tcp_rinitv4 = { - NULL, (pfi_t)tcp_rsrv, tcp_openv4, tcp_close, NULL, &tcp_rinfo + NULL, (pfi_t)tcp_rsrv, tcp_openv4, tcp_tpi_close, NULL, &tcp_rinfo }; struct qinit tcp_rinitv6 = { - NULL, (pfi_t)tcp_rsrv, tcp_openv6, tcp_close, NULL, &tcp_rinfo + NULL, (pfi_t)tcp_rsrv, tcp_openv6, tcp_tpi_close, NULL, &tcp_rinfo }; struct qinit tcp_winit = { @@ -1017,6 +1040,11 @@ struct qinit tcp_sock_winit = { (pfi_t)tcp_wput_sock, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo }; +/* TCP entry point during fallback */ +struct qinit tcp_fallback_sock_winit = { + (pfi_t)tcp_wput_fallback, NULL, NULL, NULL, NULL, &tcp_winfo +}; + /* * Entry points for TCP as a acceptor STREAM opened by sockfs when doing * an accept. Avoid allocating data structures since eager has already @@ -1027,7 +1055,7 @@ struct qinit tcp_acceptor_rinit = { }; struct qinit tcp_acceptor_winit = { - (pfi_t)tcp_wput_accept, NULL, NULL, NULL, NULL, &tcp_winfo + (pfi_t)tcp_tpi_accept, NULL, NULL, NULL, NULL, &tcp_winfo }; /* @@ -1036,7 +1064,7 @@ struct qinit tcp_acceptor_winit = { * have a separate one for tcp_openv6. */ struct qinit tcp_loopback_rinit = { - (pfi_t)0, (pfi_t)tcp_rsrv, tcp_openv4, tcp_close, (pfi_t)0, + (pfi_t)0, (pfi_t)tcp_rsrv, tcp_openv4, tcp_tpi_close, (pfi_t)0, &tcp_rinfo, NULL, tcp_fuse_rrw, tcp_fuse_rinfop, STRUIOT_STANDARD }; @@ -1050,6 +1078,8 @@ struct streamtab tcpinfov6 = { &tcp_rinitv6, &tcp_winit }; +sock_downcalls_t sock_tcp_downcalls; + /* * Have to ensure that tcp_g_q_close is not done by an * interrupt thread. 
@@ -1907,6 +1937,7 @@ tcp_time_wait_collector(void *arg) CALLOUT_FLAG_ROUNDUP); mutex_exit(&tcp_time_wait->tcp_time_wait_lock); } + /* * Reply to a clients T_CONN_RES TPI message. This function * is used only for TLI/XTI listener. Sockfs sends T_CONN_RES @@ -1914,7 +1945,7 @@ tcp_time_wait_collector(void *arg) * Read the block comment on top of tcp_conn_request(). */ static void -tcp_accept(tcp_t *listener, mblk_t *mp) +tcp_tli_accept(tcp_t *listener, mblk_t *mp) { tcp_t *acceptor; tcp_t *eager; @@ -1923,6 +1954,7 @@ tcp_accept(tcp_t *listener, mblk_t *mp) t_uscalar_t acceptor_id; t_scalar_t seqnum; mblk_t *opt_mp = NULL; /* T_OPTMGMT_REQ messages */ + struct tcp_options *tcpopt; mblk_t *ok_mp; mblk_t *mp1; tcp_stack_t *tcps = listener->tcp_tcps; @@ -2070,7 +2102,8 @@ tcp_accept(tcp_t *listener, mblk_t *mp) ASSERT(eager->tcp_connp->conn_ref >= 1); /* Pre allocate the stroptions mblk also */ - opt_mp = allocb(sizeof (struct stroptions), BPRI_HI); + opt_mp = allocb(MAX(sizeof (struct tcp_options), + sizeof (struct T_conn_res)), BPRI_HI); if (opt_mp == NULL) { CONN_DEC_REF(acceptor->tcp_connp); CONN_DEC_REF(eager->tcp_connp); @@ -2078,29 +2111,20 @@ tcp_accept(tcp_t *listener, mblk_t *mp) return; } DB_TYPE(opt_mp) = M_SETOPTS; - opt_mp->b_wptr += sizeof (struct stroptions); + opt_mp->b_wptr += sizeof (struct tcp_options); + tcpopt = (struct tcp_options *)opt_mp->b_rptr; + tcpopt->to_flags = 0; /* * Prepare for inheriting IPV6_BOUND_IF and IPV6_RECVPKTINFO - * from listener to acceptor. The message is chained on opt_mp - * which will be sent onto eager's squeue. + * from listener to acceptor. 
*/ if (listener->tcp_bound_if != 0) { - /* allocate optmgmt req */ - mp1 = tcp_setsockopt_mp(IPPROTO_IPV6, - IPV6_BOUND_IF, (char *)&listener->tcp_bound_if, - sizeof (int)); - if (mp1 != NULL) - linkb(opt_mp, mp1); + tcpopt->to_flags |= TCPOPT_BOUNDIF; + tcpopt->to_boundif = listener->tcp_bound_if; } if (listener->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) { - uint_t on = 1; - - /* allocate optmgmt req */ - mp1 = tcp_setsockopt_mp(IPPROTO_IPV6, - IPV6_RECVPKTINFO, (char *)&on, sizeof (on)); - if (mp1 != NULL) - linkb(opt_mp, mp1); + tcpopt->to_flags |= TCPOPT_RECVPKTINFO; } /* Re-use mp1 to hold a copy of mp, in case reallocb fails */ @@ -2341,6 +2365,7 @@ tcp_accept(tcp_t *listener, mblk_t *mp) finish: ASSERT(acceptor->tcp_detached); ASSERT(tcps->tcps_g_q != NULL); + ASSERT(!IPCL_IS_NONSTR(acceptor->tcp_connp)); acceptor->tcp_rq = tcps->tcps_g_q; acceptor->tcp_wq = WR(tcps->tcps_g_q); (void) tcp_clean_death(acceptor, 0, 2); @@ -2995,39 +3020,24 @@ error: return (0); } -/* - * tcp_bind is called (holding the writer lock) by tcp_wput_proto to process a - * O_T_BIND_REQ/T_BIND_REQ message. 
- */ static void -tcp_bind(tcp_t *tcp, mblk_t *mp) +tcp_tpi_bind(tcp_t *tcp, mblk_t *mp) { + int error; + conn_t *connp = tcp->tcp_connp; + struct sockaddr *sa; + mblk_t *mp1; + struct T_bind_req *tbr; + int backlog; + socklen_t len; sin_t *sin; sin6_t *sin6; - mblk_t *mp1; - in_port_t requested_port; - in_port_t allocated_port; - struct T_bind_req *tbr; - boolean_t bind_to_req_port_only; - boolean_t backlog_update = B_FALSE; - boolean_t user_specified; - in6_addr_t v6addr; - ipaddr_t v4addr; - uint_t origipversion; - int err; - queue_t *q = tcp->tcp_wq; - conn_t *connp = tcp->tcp_connp; - mlp_type_t addrtype, mlptype; - zone_t *zone; - cred_t *cr; - in_port_t mlp_port; - tcp_stack_t *tcps = tcp->tcp_tcps; ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { if (tcp->tcp_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, - "tcp_bind: bad req, len %u", + "tcp_tpi_bind: bad req, len %u", (uint_t)(mp->b_wptr - mp->b_rptr)); } tcp_err_ack(tcp, mp, TPROTO, 0); @@ -3041,442 +3051,80 @@ tcp_bind(tcp_t *tcp, mblk_t *mp) } mp = mp1; tbr = (struct T_bind_req *)mp->b_rptr; - if (tcp->tcp_state >= TCPS_BOUND) { - if ((tcp->tcp_state == TCPS_BOUND || - tcp->tcp_state == TCPS_LISTEN) && - tcp->tcp_conn_req_max != tbr->CONIND_number && - tbr->CONIND_number > 0) { - /* - * Handle listen() increasing CONIND_number. - * This is more "liberal" then what the TPI spec - * requires but is needed to avoid a t_unbind - * when handling listen() since the port number - * might be "stolen" between the unbind and bind. 
- */ - backlog_update = B_TRUE; - goto do_bind; - } - if (tcp->tcp_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, - "tcp_bind: bad state, %d", tcp->tcp_state); - } - tcp_err_ack(tcp, mp, TOUTSTATE, 0); - return; - } - origipversion = tcp->tcp_ipversion; - switch (tbr->ADDR_length) { - case 0: /* request for a generic port */ + backlog = tbr->CONIND_number; + len = tbr->ADDR_length; + + switch (len) { + case 0: /* request for a generic port */ tbr->ADDR_offset = sizeof (struct T_bind_req); if (tcp->tcp_family == AF_INET) { tbr->ADDR_length = sizeof (sin_t); sin = (sin_t *)&tbr[1]; *sin = sin_null; sin->sin_family = AF_INET; + sa = (struct sockaddr *)sin; + len = sizeof (sin_t); mp->b_wptr = (uchar_t *)&sin[1]; - tcp->tcp_ipversion = IPV4_VERSION; - IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &v6addr); } else { ASSERT(tcp->tcp_family == AF_INET6); tbr->ADDR_length = sizeof (sin6_t); sin6 = (sin6_t *)&tbr[1]; *sin6 = sin6_null; sin6->sin6_family = AF_INET6; + sa = (struct sockaddr *)sin6; + len = sizeof (sin6_t); mp->b_wptr = (uchar_t *)&sin6[1]; - tcp->tcp_ipversion = IPV6_VERSION; - V6_SET_ZERO(v6addr); } - requested_port = 0; break; - case sizeof (sin_t): /* Complete IPv4 address */ - sin = (sin_t *)mi_offset_param(mp, tbr->ADDR_offset, + case sizeof (sin_t): /* Complete IPv4 address */ + sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset, sizeof (sin_t)); - if (sin == NULL || !OK_32PTR((char *)sin)) { - if (tcp->tcp_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, - SL_ERROR|SL_TRACE, - "tcp_bind: bad address parameter, " - "offset %d, len %d", - tbr->ADDR_offset, tbr->ADDR_length); - } - tcp_err_ack(tcp, mp, TPROTO, 0); - return; - } - /* - * With sockets sockfs will accept bogus sin_family in - * bind() and replace it with the family used in the socket - * call. 
- */ - if (sin->sin_family != AF_INET || - tcp->tcp_family != AF_INET) { - tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT); - return; - } - requested_port = ntohs(sin->sin_port); - tcp->tcp_ipversion = IPV4_VERSION; - v4addr = sin->sin_addr.s_addr; - IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr); break; case sizeof (sin6_t): /* Complete IPv6 address */ - sin6 = (sin6_t *)mi_offset_param(mp, + sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset, sizeof (sin6_t)); - if (sin6 == NULL || !OK_32PTR((char *)sin6)) { - if (tcp->tcp_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, - SL_ERROR|SL_TRACE, - "tcp_bind: bad IPv6 address parameter, " - "offset %d, len %d", tbr->ADDR_offset, - tbr->ADDR_length); - } - tcp_err_ack(tcp, mp, TSYSERR, EINVAL); - return; - } - if (sin6->sin6_family != AF_INET6 || - tcp->tcp_family != AF_INET6) { - tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT); - return; - } - requested_port = ntohs(sin6->sin6_port); - tcp->tcp_ipversion = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) ? - IPV4_VERSION : IPV6_VERSION; - v6addr = sin6->sin6_addr; break; default: if (tcp->tcp_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, - "tcp_bind: bad address length, %d", + "tcp_tpi_bind: bad address length, %d", tbr->ADDR_length); } tcp_err_ack(tcp, mp, TBADADDR, 0); return; } - tcp->tcp_bound_source_v6 = v6addr; - - /* Check for change in ipversion */ - if (origipversion != tcp->tcp_ipversion) { - ASSERT(tcp->tcp_family == AF_INET6); - err = tcp->tcp_ipversion == IPV6_VERSION ? - tcp_header_init_ipv6(tcp) : tcp_header_init_ipv4(tcp); - if (err) { - tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); - return; - } - } - - /* - * Initialize family specific fields. Copy of the src addr. - * in tcp_t is needed for the lookup funcs. 
- */ - if (tcp->tcp_ipversion == IPV6_VERSION) { - tcp->tcp_ip6h->ip6_src = v6addr; - } else { - IN6_V4MAPPED_TO_IPADDR(&v6addr, tcp->tcp_ipha->ipha_src); - } - tcp->tcp_ip_src_v6 = v6addr; - /* - * For O_T_BIND_REQ: - * Verify that the target port/addr is available, or choose - * another. - * For T_BIND_REQ: - * Verify that the target port/addr is available or fail. - * In both cases when it succeeds the tcp is inserted in the - * bind hash table. This ensures that the operation is atomic - * under the lock on the hash bucket. - */ - bind_to_req_port_only = requested_port != 0 && - tbr->PRIM_type != O_T_BIND_REQ; - /* - * Get a valid port (within the anonymous range and should not - * be a privileged one) to use if the user has not given a port. - * If multiple threads are here, they may all start with - * with the same initial port. But, it should be fine as long as - * tcp_bindi will ensure that no two threads will be assigned - * the same port. - * - * NOTE: XXX If a privileged process asks for an anonymous port, we - * still check for ports only in the range > tcp_smallest_non_priv_port, - * unless TCP_ANONPRIVBIND option is set. - */ - mlptype = mlptSingle; - mlp_port = requested_port; - if (requested_port == 0) { - requested_port = tcp->tcp_anon_priv_bind ? - tcp_get_next_priv_port(tcp) : - tcp_update_next_port(tcps->tcps_next_port_to_try, - tcp, B_TRUE); - if (requested_port == 0) { - tcp_err_ack(tcp, mp, TNOADDR, 0); - return; - } - user_specified = B_FALSE; - - /* - * If the user went through one of the RPC interfaces to create - * this socket and RPC is MLP in this zone, then give him an - * anonymous MLP. 
- */ - cr = DB_CREDDEF(mp, tcp->tcp_cred); - if (connp->conn_anon_mlp && is_system_labeled()) { - zone = crgetzone(cr); - addrtype = tsol_mlp_addr_type(zone->zone_id, - IPV6_VERSION, &v6addr, - tcps->tcps_netstack->netstack_ip); - if (addrtype == mlptSingle) { - tcp_err_ack(tcp, mp, TNOADDR, 0); - return; - } - mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP, - PMAPPORT, addrtype); - mlp_port = PMAPPORT; - } - } else { - int i; - boolean_t priv = B_FALSE; - - /* - * If the requested_port is in the well-known privileged range, - * verify that the stream was opened by a privileged user. - * Note: No locks are held when inspecting tcp_g_*epriv_ports - * but instead the code relies on: - * - the fact that the address of the array and its size never - * changes - * - the atomic assignment of the elements of the array - */ - cr = DB_CREDDEF(mp, tcp->tcp_cred); - if (requested_port < tcps->tcps_smallest_nonpriv_port) { - priv = B_TRUE; - } else { - for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) { - if (requested_port == - tcps->tcps_g_epriv_ports[i]) { - priv = B_TRUE; - break; - } - } - } - if (priv) { - if (secpolicy_net_privaddr(cr, requested_port, - IPPROTO_TCP) != 0) { - if (tcp->tcp_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, - SL_ERROR|SL_TRACE, - "tcp_bind: no priv for port %d", - requested_port); - } - tcp_err_ack(tcp, mp, TACCES, 0); - return; - } - } - user_specified = B_TRUE; - - if (is_system_labeled()) { - zone = crgetzone(cr); - addrtype = tsol_mlp_addr_type(zone->zone_id, - IPV6_VERSION, &v6addr, - tcps->tcps_netstack->netstack_ip); - if (addrtype == mlptSingle) { - tcp_err_ack(tcp, mp, TNOADDR, 0); - return; - } - mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP, - requested_port, addrtype); - } - } - - if (mlptype != mlptSingle) { - if (secpolicy_net_bindmlp(cr) != 0) { - if (tcp->tcp_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, - SL_ERROR|SL_TRACE, - "tcp_bind: no priv for multilevel port %d", - requested_port); - } - tcp_err_ack(tcp, mp, TACCES, 0); - 
return; - } - - /* - * If we're specifically binding a shared IP address and the - * port is MLP on shared addresses, then check to see if this - * zone actually owns the MLP. Reject if not. - */ - if (mlptype == mlptShared && addrtype == mlptShared) { - /* - * No need to handle exclusive-stack zones since - * ALL_ZONES only applies to the shared stack. - */ - zoneid_t mlpzone; - - mlpzone = tsol_mlp_findzone(IPPROTO_TCP, - htons(mlp_port)); - if (connp->conn_zoneid != mlpzone) { - if (tcp->tcp_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, - SL_ERROR|SL_TRACE, - "tcp_bind: attempt to bind port " - "%d on shared addr in zone %d " - "(should be %d)", - mlp_port, connp->conn_zoneid, - mlpzone); - } - tcp_err_ack(tcp, mp, TACCES, 0); - return; - } - } - - if (!user_specified) { - err = tsol_mlp_anon(zone, mlptype, connp->conn_ulp, - requested_port, B_TRUE); - if (err != 0) { - if (tcp->tcp_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, - SL_ERROR|SL_TRACE, - "tcp_bind: cannot establish anon " - "MLP for port %d", - requested_port); - } - tcp_err_ack(tcp, mp, TSYSERR, err); - return; - } - connp->conn_anon_port = B_TRUE; - } - connp->conn_mlp_type = mlptype; - } - - allocated_port = tcp_bindi(tcp, requested_port, &v6addr, - tcp->tcp_reuseaddr, B_FALSE, bind_to_req_port_only, user_specified); - - if (allocated_port == 0) { - connp->conn_mlp_type = mlptSingle; - if (connp->conn_anon_port) { - connp->conn_anon_port = B_FALSE; - (void) tsol_mlp_anon(zone, mlptype, connp->conn_ulp, - requested_port, B_FALSE); - } - if (bind_to_req_port_only) { - if (tcp->tcp_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, - SL_ERROR|SL_TRACE, - "tcp_bind: requested addr busy"); - } - tcp_err_ack(tcp, mp, TADDRBUSY, 0); - } else { - /* If we are out of ports, fail the bind. 
*/ - if (tcp->tcp_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, - SL_ERROR|SL_TRACE, - "tcp_bind: out of ports?"); - } - tcp_err_ack(tcp, mp, TNOADDR, 0); - } - return; - } - ASSERT(tcp->tcp_state == TCPS_BOUND); -do_bind: - if (!backlog_update) { - if (tcp->tcp_family == AF_INET) - sin->sin_port = htons(allocated_port); - else - sin6->sin6_port = htons(allocated_port); - } - if (tcp->tcp_family == AF_INET) { - if (tbr->CONIND_number != 0) { - mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type, - sizeof (sin_t)); - } else { - /* Just verify the local IP address */ - mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type, IP_ADDR_LEN); - } - } else { - if (tbr->CONIND_number != 0) { - mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type, - sizeof (sin6_t)); + error = tcp_bind_check(connp, sa, len, DB_CRED(mp), + tbr->PRIM_type != O_T_BIND_REQ); + if (error == 0) { + if (tcp->tcp_family == AF_INET) { + sin = (sin_t *)sa; + sin->sin_port = tcp->tcp_lport; } else { - /* Just verify the local IP address */ - mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type, - IPV6_ADDR_LEN); - } - } - if (mp1 == NULL) { - if (connp->conn_anon_port) { - connp->conn_anon_port = B_FALSE; - (void) tsol_mlp_anon(zone, mlptype, connp->conn_ulp, - requested_port, B_FALSE); + sin6 = (sin6_t *)sa; + sin6->sin6_port = tcp->tcp_lport; } - connp->conn_mlp_type = mlptSingle; - tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); - return; - } - - tbr->PRIM_type = T_BIND_ACK; - mp->b_datap->db_type = M_PCPROTO; - /* Chain in the reply mp for tcp_rput() */ - mp1->b_cont = mp; - mp = mp1; - - tcp->tcp_conn_req_max = tbr->CONIND_number; - if (tcp->tcp_conn_req_max) { - if (tcp->tcp_conn_req_max < tcps->tcps_conn_req_min) - tcp->tcp_conn_req_max = tcps->tcps_conn_req_min; - if (tcp->tcp_conn_req_max > tcps->tcps_conn_req_max_q) - tcp->tcp_conn_req_max = tcps->tcps_conn_req_max_q; - /* - * If this is a listener, do not reset the eager list - * and other stuffs. Note that we don't check if the - * existing eager list meets the new tcp_conn_req_max - * requirement. 
- */ - if (tcp->tcp_state != TCPS_LISTEN) { - tcp->tcp_state = TCPS_LISTEN; - /* Initialize the chain. Don't need the eager_lock */ - tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp; - tcp->tcp_eager_next_drop_q0 = tcp; - tcp->tcp_eager_prev_drop_q0 = tcp; - tcp->tcp_second_ctimer_threshold = - tcps->tcps_ip_abort_linterval; + if (backlog > 0) { + error = tcp_do_listen(connp, backlog, DB_CRED(mp)); } } - - /* - * We can call ip_bind directly which returns a T_BIND_ACK mp. The - * processing continues in tcp_rput_other(). - * - * We need to make sure that the conn_recv is set to a non-null - * value before we insert the conn into the classifier table. - * This is to avoid a race with an incoming packet which does an - * ipcl_classify(). - */ - connp->conn_recv = tcp_conn_request; - if (tcp->tcp_family == AF_INET6) { - ASSERT(tcp->tcp_connp->conn_af_isv6); - mp = ip_bind_v6(q, mp, tcp->tcp_connp, &tcp->tcp_sticky_ipp); - } else { - ASSERT(!tcp->tcp_connp->conn_af_isv6); - mp = ip_bind_v4(q, mp, tcp->tcp_connp); - } - /* - * If the bind cannot complete immediately - * IP will arrange to call tcp_rput_other - * when the bind completes. - */ - if (mp != NULL) { - tcp_rput_other(tcp, mp); +done: + if (error > 0) { + tcp_err_ack(tcp, mp, TSYSERR, error); + } else if (error < 0) { + tcp_err_ack(tcp, mp, -error, 0); } else { - /* - * Bind will be resumed later. Need to ensure - * that conn doesn't disappear when that happens. - * This will be decremented in ip_resume_tcp_bind(). 
- */ - CONN_INC_REF(tcp->tcp_connp); + mp->b_datap->db_type = M_PCPROTO; + tbr->PRIM_type = T_BIND_ACK; + putnext(tcp->tcp_rq, mp); } } - /* * If the "bind_to_req_port_only" parameter is set, if the requested port * number is available, return it, If not return 0 @@ -3560,12 +3208,14 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, mutex_enter(&tbf->tf_lock); for (ltcp = tbf->tf_tcp; ltcp != NULL; ltcp = ltcp->tcp_bind_hash) { + if (lport == ltcp->tcp_lport) + break; + } + + for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) { boolean_t not_socket; boolean_t exclbind; - if (lport != ltcp->tcp_lport) - continue; - lconnp = ltcp->tcp_connp; /* @@ -3817,6 +3467,7 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) { mblk_t *mp; queue_t *q; + conn_t *connp = tcp->tcp_connp; tcp_stack_t *tcps = tcp->tcp_tcps; sodirect_t *sodp; @@ -3857,7 +3508,7 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) */ tcp_closei_local(tcp); if (!tcp->tcp_tconnind_started) { - CONN_DEC_REF(tcp->tcp_connp); + CONN_DEC_REF(connp); } else { tcp->tcp_state = TCPS_BOUND; } @@ -3879,7 +3530,10 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) q = tcp->tcp_rq; /* Trash all inbound data */ - flushq(q, FLUSHALL); + if (!IPCL_IS_NONSTR(connp)) { + ASSERT(q != NULL); + flushq(q, FLUSHALL); + } /* * If we are at least part way open and there is error @@ -3900,16 +3554,22 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, "tcp_clean_death: discon err %d", err); } - mp = mi_tpi_discon_ind(NULL, err, 0); - if (mp != NULL) { - putnext(q, mp); + if (IPCL_IS_NONSTR(connp)) { + /* Direct socket, use upcall */ + (*connp->conn_upcalls->su_disconnected)( + connp->conn_upper_handle, tcp->tcp_connid, err); } else { - if (tcp->tcp_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, - SL_ERROR|SL_TRACE, - "tcp_clean_death, sending M_ERROR"); + mp = mi_tpi_discon_ind(NULL, err, 0); + if (mp != NULL) { + putnext(q, mp); + } else { + if 
(tcp->tcp_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, + SL_ERROR|SL_TRACE, + "tcp_clean_death, sending M_ERROR"); + } + (void) putnextctl1(q, M_ERROR, EPROTO); } - (void) putnextctl1(q, M_ERROR, EPROTO); } if (tcp->tcp_state <= TCPS_SYN_RCVD) { /* SYN_SENT or SYN_RCVD */ @@ -3921,6 +3581,9 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) } tcp_reinit(tcp); + if (IPCL_IS_NONSTR(connp)) + (void) tcp_do_unbind(connp); + return (-1); } @@ -3954,7 +3617,6 @@ tcp_stop_lingering(tcp_t *tcp) */ tcp_timers_stop(tcp); - tcp->tcp_detached = B_TRUE; ASSERT(tcps->tcps_g_q != NULL); tcp->tcp_rq = tcps->tcps_g_q; @@ -3984,8 +3646,10 @@ finish: mutex_enter(&tcp->tcp_closelock); tcp->tcp_detached = B_TRUE; ASSERT(tcps->tcps_g_q != NULL); + tcp->tcp_rq = tcps->tcps_g_q; tcp->tcp_wq = WR(tcps->tcps_g_q); + tcp->tcp_closed = 1; cv_signal(&tcp->tcp_closecv); mutex_exit(&tcp->tcp_closelock); @@ -4005,21 +3669,17 @@ tcp_close_linger_timeout(void *arg) tcp_stop_lingering(tcp); } -static int -tcp_close(queue_t *q, int flags) +static void +tcp_close_common(conn_t *connp, int flags) { - conn_t *connp = Q_TO_CONN(q); tcp_t *tcp = connp->conn_tcp; mblk_t *mp = &tcp->tcp_closemp; boolean_t conn_ioctl_cleanup_reqd = B_FALSE; mblk_t *bp; - ASSERT(WR(q)->q_next == NULL); ASSERT(connp->conn_ref >= 2); /* - * We are being closed as /dev/tcp or /dev/tcp6. - * * Mark the conn as closing. ill_pending_mp_add will not * add any mp to the pending mp list, after this conn has * started closing. 
Same for sq_pending_mp_add @@ -4106,11 +3766,35 @@ tcp_close(queue_t *q, int flags) if (conn_ioctl_cleanup_reqd) conn_ioctl_cleanup(connp); + tcp->tcp_cpid = -1; +} + +static int +tcp_tpi_close(queue_t *q, int flags) +{ + conn_t *connp; + + ASSERT(WR(q)->q_next == NULL); + + if (flags & SO_FALLBACK) { + /* + * stream is being closed while in fallback + * simply free the resources that were allocated + */ + inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr)); + qprocsoff(q); + goto done; + } + + connp = Q_TO_CONN(q); + /* + * We are being closed as /dev/tcp or /dev/tcp6. + */ + tcp_close_common(connp, flags); + qprocsoff(q); inet_minor_free(connp->conn_minor_arena, connp->conn_dev); - tcp->tcp_cpid = -1; - /* * Drop IP's reference on the conn. This is the last reference * on the connp if the state was less than established. If the @@ -4124,6 +3808,7 @@ tcp_close(queue_t *q, int flags) * packets in squeue for the timewait state. */ CONN_DEC_REF(connp); +done: q->q_ptr = WR(q)->q_ptr = NULL; return (0); } @@ -4615,11 +4300,13 @@ tcp_free(tcp_t *tcp) } if (tcp->tcp_fused_sigurg_mp != NULL) { + ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp)); freeb(tcp->tcp_fused_sigurg_mp); tcp->tcp_fused_sigurg_mp = NULL; } if (tcp->tcp_ordrel_mp != NULL) { + ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp)); freeb(tcp->tcp_ordrel_mp); tcp->tcp_ordrel_mp = NULL; } @@ -4761,10 +4448,19 @@ tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, mblk_t *idmp, DB_CPID(mp) = DB_CPID(idmp); } - if (defermp == NULL) - putnext(tcp->tcp_rq, mp); - else + if (defermp == NULL) { + conn_t *connp = tcp->tcp_connp; + if (IPCL_IS_NONSTR(connp)) { + (*connp->conn_upcalls->su_connected) + (connp->conn_upper_handle, tcp->tcp_connid, cr, + DB_CPID(mp)); + freemsg(mp); + } else { + putnext(tcp->tcp_rq, mp); + } + } else { *defermp = mp; + } if (tcp->tcp_conn.tcp_opts_conn_req != NULL) tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); @@ -4946,10 +4642,13 @@ tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, /* 
Inherit information from the "parent" */ tcp->tcp_ipversion = ltcp->tcp_ipversion; tcp->tcp_family = ltcp->tcp_family; + tcp->tcp_wq = ltcp->tcp_wq; tcp->tcp_rq = ltcp->tcp_rq; + tcp->tcp_mss = tcps->tcps_mss_def_ipv6; tcp->tcp_detached = B_TRUE; + SOCK_CONNID_INIT(tcp->tcp_connid); if ((err = tcp_init_values(tcp)) != 0) { freemsg(tpi_mp); return (err); @@ -5100,6 +4799,12 @@ tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, tcp->tcp_kssl_pending = B_TRUE; } + /* Inherit the listener's non-STREAMS flag */ + if (IPCL_IS_NONSTR(lconnp)) { + connp->conn_flags |= IPCL_NONSTR; + connp->conn_upcalls = lconnp->conn_upcalls; + } + return (0); } @@ -5159,6 +4864,7 @@ tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha, tcp->tcp_rq = ltcp->tcp_rq; tcp->tcp_mss = tcps->tcps_mss_def_ipv4; tcp->tcp_detached = B_TRUE; + SOCK_CONNID_INIT(tcp->tcp_connid); if ((err = tcp_init_values(tcp)) != 0) { freemsg(tpi_mp); return (err); @@ -5219,6 +4925,12 @@ tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha, tcp->tcp_kssl_pending = B_TRUE; } + /* Inherit the listener's non-STREAMS flag */ + if (IPCL_IS_NONSTR(lconnp)) { + connp->conn_flags |= IPCL_NONSTR; + connp->conn_upcalls = lconnp->conn_upcalls; + } + return (0); } @@ -5474,7 +5186,7 @@ tcp_update_label(tcp_t *tcp, const cred_t *cr) if (tsol_update_sticky(&tcp->tcp_sticky_ipp, &tcp->tcp_label_len, optbuf) != 0) return (B_FALSE); - if (tcp_build_hdrs(tcp->tcp_rq, tcp) != 0) + if (tcp_build_hdrs(tcp) != 0) return (B_FALSE); } @@ -5732,12 +5444,13 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) eager = econnp->conn_tcp; /* - * Pre-allocate the T_ordrel_ind mblk so that at close time, we - * will always have that to send up. Otherwise, we need to do + * Pre-allocate the T_ordrel_ind mblk for TPI socket so that at close + * time, we will always have that to send up. Otherwise, we need to do * special handling in case the allocation fails at that time. 
*/ ASSERT(eager->tcp_ordrel_mp == NULL); - if ((eager->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL) + if (!IPCL_IS_NONSTR(econnp) && + (eager->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL) goto error3; /* Inherit various TCP parameters from the listener */ @@ -5839,7 +5552,7 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) * There should be no ire in the mp as we are being called after * receiving the SYN. */ - ASSERT(tcp_ire_mp(mp) == NULL); + ASSERT(tcp_ire_mp(&mp) == NULL); /* * Adapt our mss, ttl, ... according to information provided in IRE. @@ -5871,7 +5584,7 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) * we should not inherit receive window size from listener. */ eager->tcp_rwnd = MSS_ROUNDUP( - (eager->tcp_rwnd == 0 ? tcp->tcp_rq->q_hiwat : + (eager->tcp_rwnd == 0 ? tcp->tcp_recv_hiwater: eager->tcp_rwnd), eager->tcp_mss); if (eager->tcp_snd_ws_ok) tcp_set_ws_value(eager); @@ -5899,6 +5612,8 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) * */ /* Set the TCP options */ + eager->tcp_recv_hiwater = tcp->tcp_recv_hiwater; + eager->tcp_recv_lowater = tcp->tcp_recv_lowater; eager->tcp_xmit_hiwater = tcp->tcp_xmit_hiwater; eager->tcp_dgram_errind = tcp->tcp_dgram_errind; eager->tcp_oobinline = tcp->tcp_oobinline; @@ -5906,6 +5621,7 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) eager->tcp_broadcast = tcp->tcp_broadcast; eager->tcp_useloopback = tcp->tcp_useloopback; eager->tcp_dontroute = tcp->tcp_dontroute; + eager->tcp_debug = tcp->tcp_debug; eager->tcp_linger = tcp->tcp_linger; eager->tcp_lingertime = tcp->tcp_lingertime; if (tcp->tcp_ka_enabled) @@ -5979,6 +5695,7 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) goto error; } DB_CPID(mp1) = tcp->tcp_cpid; + mblk_setcred(mp1, tcp->tcp_cred); eager->tcp_cpid = tcp->tcp_cpid; eager->tcp_open_time = lbolt64; @@ -6168,9 +5885,9 @@ done: * Successful connect request processing begins when our client passes * a T_CONN_REQ message into tcp_wput() and ends when tcp_rput() passes * 
our T_OK_ACK reply message upstream. The control flow looks like this: - * upstream -> tcp_wput() -> tcp_wput_proto() -> tcp_connect() -> IP - * upstream <- tcp_rput() <- IP - * After various error checks are completed, tcp_connect() lays + * upstream -> tcp_wput() -> tcp_wput_proto() -> tcp_tpi_connect() -> IP + * upstream <- tcp_rput() <- IP + * After various error checks are completed, tcp_tpi_connect() lays * the target address and port into the composite header template, * preallocates the T_OK_ACK reply message, construct a full 12 byte bind * request followed by an IRE request, and passes the three mblk message @@ -6185,15 +5902,14 @@ done: * above. */ static void -tcp_connect(tcp_t *tcp, mblk_t *mp) +tcp_tpi_connect(tcp_t *tcp, mblk_t *mp) { sin_t *sin; - sin6_t *sin6; queue_t *q = tcp->tcp_wq; struct T_conn_req *tcr; - ipaddr_t *dstaddrp; - in_port_t dstport; - uint_t srcid; + struct sockaddr *sa; + socklen_t len; + int error; tcr = (struct T_conn_req *)mp->b_rptr; @@ -6287,46 +6003,24 @@ tcp_connect(tcp_t *tcp, mblk_t *mp) /* FALLTHRU */ case sizeof (sin_t): - sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset, + sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset, sizeof (sin_t)); - if (sin == NULL || !OK_32PTR((char *)sin)) { - tcp_err_ack(tcp, mp, TSYSERR, EINVAL); - return; - } - if (tcp->tcp_family != AF_INET || - sin->sin_family != AF_INET) { - tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT); - return; - } - if (sin->sin_port == 0) { - tcp_err_ack(tcp, mp, TBADADDR, 0); - return; - } - if (tcp->tcp_connp && tcp->tcp_connp->conn_ipv6_v6only) { - tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT); - return; - } - + len = sizeof (sin_t); break; case sizeof (sin6_t): - sin6 = (sin6_t *)mi_offset_param(mp, tcr->DEST_offset, + sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset, sizeof (sin6_t)); - if (sin6 == NULL || !OK_32PTR((char *)sin6)) { - tcp_err_ack(tcp, mp, TSYSERR, EINVAL); - return; - } - if (tcp->tcp_family != AF_INET6 || - 
sin6->sin6_family != AF_INET6) { - tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT); - return; - } - if (sin6->sin6_port == 0) { - tcp_err_ack(tcp, mp, TBADADDR, 0); - return; - } + len = sizeof (sin6_t); break; } + + error = proto_verify_ip_addr(tcp->tcp_family, sa, len); + if (error != 0) { + tcp_err_ack(tcp, mp, TSYSERR, error); + return; + } + /* * TODO: If someone in TCPS_TIME_WAIT has this dst/port we * should key on their sequence number and cut them loose. @@ -6394,80 +6088,17 @@ tcp_connect(tcp_t *tcp, mblk_t *mp) } } - /* - * If we're connecting to an IPv4-mapped IPv6 address, we need to - * make sure that the template IP header in the tcp structure is an - * IPv4 header, and that the tcp_ipversion is IPV4_VERSION. We - * need to this before we call tcp_bindi() so that the port lookup - * code will look for ports in the correct port space (IPv4 and - * IPv6 have separate port spaces). - */ - if (tcp->tcp_family == AF_INET6 && tcp->tcp_ipversion == IPV6_VERSION && - IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { - int err = 0; - - err = tcp_header_init_ipv4(tcp); - if (err != 0) { - mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM); - goto connect_failed; - } - if (tcp->tcp_lport != 0) - *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport; - } - - if (tcp->tcp_issocket) { - /* - * TCP is _D_SODIRECT and sockfs is directly above so save - * the shared sonode sodirect_t pointer (if any) to enable - * TCP sodirect. 
- */ - tcp->tcp_sodirect = SOD_QTOSODP(tcp->tcp_rq); + /* call the non-TPI version */ + error = tcp_do_connect(tcp->tcp_connp, sa, len, DB_CRED(mp), + DB_CPID(mp)); + if (error < 0) { + mp = mi_tpi_err_ack_alloc(mp, -error, 0); + } else if (error > 0) { + mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error); + } else { + mp = mi_tpi_ok_ack_alloc(mp); } - switch (tcp->tcp_state) { - case TCPS_IDLE: - /* - * We support quick connect, refer to comments in - * tcp_connect_*() - */ - /* FALLTHRU */ - case TCPS_BOUND: - case TCPS_LISTEN: - if (tcp->tcp_family == AF_INET6) { - if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { - tcp_connect_ipv6(tcp, mp, - &sin6->sin6_addr, - sin6->sin6_port, sin6->sin6_flowinfo, - sin6->__sin6_src_id, sin6->sin6_scope_id); - return; - } - /* - * Destination adress is mapped IPv6 address. - * Source bound address should be unspecified or - * IPv6 mapped address as well. - */ - if (!IN6_IS_ADDR_UNSPECIFIED( - &tcp->tcp_bound_source_v6) && - !IN6_IS_ADDR_V4MAPPED(&tcp->tcp_bound_source_v6)) { - mp = mi_tpi_err_ack_alloc(mp, TSYSERR, - EADDRNOTAVAIL); - break; - } - dstaddrp = &V4_PART_OF_V6((sin6->sin6_addr)); - dstport = sin6->sin6_port; - srcid = sin6->__sin6_src_id; - } else { - dstaddrp = &sin->sin_addr.s_addr; - dstport = sin->sin_port; - srcid = 0; - } - - tcp_connect_ipv4(tcp, mp, dstaddrp, dstport, srcid); - return; - default: - mp = mi_tpi_err_ack_alloc(mp, TOUTSTATE, 0); - break; - } /* * Note: Code below is the "failure" case */ @@ -6479,23 +6110,22 @@ connect_failed: tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, TSYSERR, ENOMEM); } - if (tcp->tcp_conn.tcp_opts_conn_req != NULL) - tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); } /* * Handle connect to IPv4 destinations, including connections for AF_INET6 * sockets connecting to IPv4 mapped IPv6 destinations. 
*/ -static void -tcp_connect_ipv4(tcp_t *tcp, mblk_t *mp, ipaddr_t *dstaddrp, in_port_t dstport, - uint_t srcid) +static int +tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, in_port_t dstport, + uint_t srcid, cred_t *cr, pid_t pid) { tcph_t *tcph; - mblk_t *mp1; + mblk_t *mp; ipaddr_t dstaddr = *dstaddrp; int32_t oldstate; uint16_t lport; + int error = 0; tcp_stack_t *tcps = tcp->tcp_tcps; ASSERT(tcp->tcp_ipversion == IPV4_VERSION); @@ -6538,7 +6168,7 @@ tcp_connect_ipv4(tcp_t *tcp, mblk_t *mp, ipaddr_t *dstaddrp, in_port_t dstport, */ if (dstaddr == tcp->tcp_ipha->ipha_src && dstport == tcp->tcp_lport) { - mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0); + error = -TBADADDR; goto failed; } @@ -6583,91 +6213,77 @@ tcp_connect_ipv4(tcp_t *tcp, mblk_t *mp, ipaddr_t *dstaddrp, in_port_t dstport, lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE, B_FALSE, B_FALSE); if (lport == 0) { - mp = mi_tpi_err_ack_alloc(mp, TNOADDR, 0); + error = -TNOADDR; goto failed; } } tcp->tcp_state = TCPS_SYN_SENT; - /* - * TODO: allow data with connect requests - * by unlinking M_DATA trailers here and - * linking them in behind the T_OK_ACK mblk. - * The tcp_rput() bind ack handler would then - * feed them to tcp_wput_data() rather than call - * tcp_timer(). - */ - mp = mi_tpi_ok_ack_alloc(mp); - if (!mp) { + mp = allocb(sizeof (ire_t), BPRI_HI); + if (mp == NULL) { tcp->tcp_state = oldstate; + error = ENOMEM; goto failed; } - if (tcp->tcp_family == AF_INET) { - mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ, - sizeof (ipa_conn_t)); - } else { - mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ, - sizeof (ipa6_conn_t)); + mp->b_wptr += sizeof (ire_t); + mp->b_datap->db_type = IRE_DB_REQ_TYPE; + tcp->tcp_hard_binding = 1; + if (cr == NULL) { + cr = tcp->tcp_cred; + pid = tcp->tcp_cpid; } - if (mp1) { - /* - * We need to make sure that the conn_recv is set to a non-null - * value before we insert the conn_t into the classifier table. 
- * This is to avoid a race with an incoming packet which does - * an ipcl_classify(). - */ - tcp->tcp_connp->conn_recv = tcp_input; + mblk_setcred(mp, cr); + DB_CPID(mp) = pid; - /* Hang onto the T_OK_ACK for later. */ - linkb(mp1, mp); - mblk_setcred(mp1, tcp->tcp_cred); - if (tcp->tcp_family == AF_INET) - mp1 = ip_bind_v4(tcp->tcp_wq, mp1, tcp->tcp_connp); - else { - mp1 = ip_bind_v6(tcp->tcp_wq, mp1, tcp->tcp_connp, - &tcp->tcp_sticky_ipp); + /* + * We need to make sure that the conn_recv is set to a non-null + * value before we insert the conn_t into the classifier table. + * This is to avoid a race with an incoming packet which does + * an ipcl_classify(). + */ + tcp->tcp_connp->conn_recv = tcp_input; + + if (tcp->tcp_family == AF_INET) { + error = ip_proto_bind_connected_v4(tcp->tcp_connp, &mp, + IPPROTO_TCP, &tcp->tcp_ipha->ipha_src, tcp->tcp_lport, + tcp->tcp_remote, tcp->tcp_fport, B_TRUE, B_TRUE); + } else { + in6_addr_t v6src; + if (tcp->tcp_ipversion == IPV4_VERSION) { + IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, &v6src); + } else { + v6src = tcp->tcp_ip6h->ip6_src; } - BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens); - tcp->tcp_active_open = 1; - /* - * If the bind cannot complete immediately - * IP will arrange to call tcp_rput_other - * when the bind completes. 
- */ - if (mp1 != NULL) - tcp_rput_other(tcp, mp1); - return; + error = ip_proto_bind_connected_v6(tcp->tcp_connp, &mp, + IPPROTO_TCP, &v6src, tcp->tcp_lport, &tcp->tcp_remote_v6, + &tcp->tcp_sticky_ipp, tcp->tcp_fport, B_TRUE, B_TRUE); } - /* Error case */ - tcp->tcp_state = oldstate; - mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM); + BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens); + tcp->tcp_active_open = 1; + return (tcp_post_ip_bind(tcp, mp, error)); failed: /* return error ack and blow away saved option results if any */ - if (mp != NULL) - putnext(tcp->tcp_rq, mp); - else { - tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, - TSYSERR, ENOMEM); - } if (tcp->tcp_conn.tcp_opts_conn_req != NULL) tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); - + return (error); } /* * Handle connect to IPv6 destinations. */ -static void -tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp, - in_port_t dstport, uint32_t flowinfo, uint_t srcid, uint32_t scope_id) +static int +tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp, in_port_t dstport, + uint32_t flowinfo, uint_t srcid, uint32_t scope_id, cred_t *cr, pid_t pid) { tcph_t *tcph; - mblk_t *mp1; + mblk_t *mp; ip6_rthdr_t *rth; int32_t oldstate; uint16_t lport; tcp_stack_t *tcps = tcp->tcp_tcps; + int error = 0; + conn_t *connp = tcp->tcp_connp; ASSERT(tcp->tcp_family == AF_INET6); @@ -6678,8 +6294,7 @@ tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp, * IPv4-mapped IPv6 address. 
*/ if (tcp->tcp_ipversion != IPV6_VERSION) { - mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0); - goto failed; + return (-TBADADDR); } /* @@ -6694,7 +6309,7 @@ tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp, /* Handle __sin6_src_id if socket not bound to an IP address */ if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip6h->ip6_src)) { ip_srcid_find_id(srcid, &tcp->tcp_ip6h->ip6_src, - tcp->tcp_connp->conn_zoneid, tcps->tcps_netstack); + connp->conn_zoneid, tcps->tcps_netstack); tcp->tcp_ip_src_v6 = tcp->tcp_ip6h->ip6_src; } @@ -6724,7 +6339,7 @@ tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp, ipp->ipp_fields |= IPPF_SCOPE_ID; if (ipp->ipp_fields & IPPF_HAS_IP6I) ip2dbg(("tcp_connect_v6: SCOPE_ID set\n")); - reterr = tcp_build_hdrs(tcp->tcp_rq, tcp); + reterr = tcp_build_hdrs(tcp); if (reterr != 0) goto failed; ip1dbg(("tcp_connect_ipv6: tcp_bld_hdrs returned\n")); @@ -6741,7 +6356,7 @@ tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp, */ if (IN6_ARE_ADDR_EQUAL(dstaddrp, &tcp->tcp_ip6h->ip6_src) && (dstport == tcp->tcp_lport)) { - mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0); + error = -TBADADDR; goto failed; } @@ -6751,7 +6366,6 @@ tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp, (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) | (flowinfo & ~IPV6_VERS_AND_FLOW_MASK); - /* * Massage a routing header (if present) putting the first hop * in ip6_dst. Compute a starting value for the checksum which @@ -6791,26 +6405,26 @@ tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp, lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE, B_FALSE, B_FALSE); if (lport == 0) { - mp = mi_tpi_err_ack_alloc(mp, TNOADDR, 0); + error = -TNOADDR; goto failed; } } tcp->tcp_state = TCPS_SYN_SENT; - /* - * TODO: allow data with connect requests - * by unlinking M_DATA trailers here and - * linking them in behind the T_OK_ACK mblk. 
- * The tcp_rput() bind ack handler would then - * feed them to tcp_wput_data() rather than call - * tcp_timer(). - */ - mp = mi_tpi_ok_ack_alloc(mp); - if (!mp) { - tcp->tcp_state = oldstate; - goto failed; - } - mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ, sizeof (ipa6_conn_t)); - if (mp1) { + + mp = allocb(sizeof (ire_t), BPRI_HI); + if (mp != NULL) { + in6_addr_t v6src; + + mp->b_wptr += sizeof (ire_t); + mp->b_datap->db_type = IRE_DB_REQ_TYPE; + if (cr == NULL) { + cr = tcp->tcp_cred; + pid = tcp->tcp_cpid; + } + mblk_setcred(mp, cr); + DB_CPID(mp) = pid; + tcp->tcp_hard_binding = 1; + /* * We need to make sure that the conn_recv is set to a non-null * value before we insert the conn_t into the classifier table. @@ -6819,32 +6433,28 @@ tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp, */ tcp->tcp_connp->conn_recv = tcp_input; - /* Hang onto the T_OK_ACK for later. */ - linkb(mp1, mp); - mblk_setcred(mp1, tcp->tcp_cred); - mp1 = ip_bind_v6(tcp->tcp_wq, mp1, tcp->tcp_connp, - &tcp->tcp_sticky_ipp); + if (tcp->tcp_ipversion == IPV4_VERSION) { + IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, &v6src); + } else { + v6src = tcp->tcp_ip6h->ip6_src; + } + error = ip_proto_bind_connected_v6(connp, &mp, IPPROTO_TCP, + &v6src, tcp->tcp_lport, &tcp->tcp_remote_v6, + &tcp->tcp_sticky_ipp, tcp->tcp_fport, B_TRUE, B_TRUE); BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens); tcp->tcp_active_open = 1; - /* ip_bind_v6() may return ACK or ERROR */ - if (mp1 != NULL) - tcp_rput_other(tcp, mp1); - return; + + return (tcp_post_ip_bind(tcp, mp, error)); } /* Error case */ tcp->tcp_state = oldstate; - mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM); + error = ENOMEM; failed: /* return error ack and blow away saved option results if any */ - if (mp != NULL) - putnext(tcp->tcp_rq, mp); - else { - tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, - TSYSERR, ENOMEM); - } if (tcp->tcp_conn.tcp_opts_conn_req != NULL) tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); + return (error); } /* @@ -6870,72 
+6480,61 @@ tcp_def_q_set(tcp_t *tcp, mblk_t *mp) mutex_exit(&tcps->tcps_g_q_lock); iocp->ioc_error = EALREADY; } else { - mblk_t *mp1; + int error = 0; + conn_t *connp = tcp->tcp_connp; + ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ, 0); - if (mp1 == NULL) { - mutex_exit(&tcps->tcps_g_q_lock); - iocp->ioc_error = ENOMEM; - } else { - tcps->tcps_g_q = tcp->tcp_rq; - mutex_exit(&tcps->tcps_g_q_lock); - iocp->ioc_error = 0; - iocp->ioc_rval = 0; - /* - * We are passing tcp_sticky_ipp as NULL - * as it is not useful for tcp_default queue - * - * Set conn_recv just in case. - */ - tcp->tcp_connp->conn_recv = tcp_conn_request; + tcps->tcps_g_q = tcp->tcp_rq; + mutex_exit(&tcps->tcps_g_q_lock); + iocp->ioc_error = 0; + iocp->ioc_rval = 0; + /* + * We are passing tcp_sticky_ipp as NULL + * as it is not useful for tcp_default queue + * + * Set conn_recv just in case. + */ + tcp->tcp_connp->conn_recv = tcp_conn_request; - mp1 = ip_bind_v6(q, mp1, tcp->tcp_connp, NULL); - if (mp1 != NULL) - tcp_rput_other(tcp, mp1); + ASSERT(connp->conn_af_isv6); + connp->conn_ulp = IPPROTO_TCP; + + if (ipst->ips_ipcl_proto_fanout_v6[IPPROTO_TCP].connf_head != + NULL || connp->conn_mac_exempt) { + error = -TBADADDR; + } else { + connp->conn_srcv6 = ipv6_all_zeros; + ipcl_proto_insert_v6(connp, IPPROTO_TCP); } + + (void) tcp_post_ip_bind(tcp, NULL, error); } qreply(q, mp); } -/* - * Our client hereby directs us to reject the connection request - * that tcp_conn_request() marked with 'seqnum'. Rejection consists - * of sending the appropriate RST, not an ICMP error. 
- */ -static void -tcp_disconnect(tcp_t *tcp, mblk_t *mp) +static int +tcp_disconnect_common(tcp_t *tcp, t_scalar_t seqnum) { tcp_t *ltcp = NULL; - t_scalar_t seqnum; conn_t *connp; tcp_stack_t *tcps = tcp->tcp_tcps; - ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); - if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_discon_req)) { - tcp_err_ack(tcp, mp, TPROTO, 0); - return; - } - /* * Right now, upper modules pass down a T_DISCON_REQ to TCP, * when the stream is in BOUND state. Do not send a reset, * since the destination IP address is not valid, and it can * be the initialized value of all zeros (broadcast address). * - * If TCP has sent down a bind request to IP and has not - * received the reply, reject the request. Otherwise, TCP - * will be confused. + * XXX There won't be any pending bind request to IP. */ - if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_hard_binding) { + if (tcp->tcp_state <= TCPS_BOUND) { if (tcp->tcp_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_disconnect: bad state, %d", tcp->tcp_state); } - tcp_err_ack(tcp, mp, TOUTSTATE, 0); - return; + return (TOUTSTATE); } - seqnum = ((struct T_discon_req *)mp->b_rptr)->SEQ_number; if (seqnum == -1 || tcp->tcp_conn_req_max == 0) { @@ -7009,25 +6608,42 @@ tcp_disconnect(tcp_t *tcp, mblk_t *mp) tcp_reinit(tcp); - if (old_state >= TCPS_ESTABLISHED) { + return (0); + } else if (!tcp_eager_blowoff(tcp, seqnum)) { + return (TBADSEQ); + } + return (0); +} + +/* + * Our client hereby directs us to reject the connection request + * that tcp_conn_request() marked with 'seqnum'. Rejection consists + * of sending the appropriate RST, not an ICMP error. 
+ */ +static void +tcp_disconnect(tcp_t *tcp, mblk_t *mp) +{ + t_scalar_t seqnum; + int error; + + ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); + if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_discon_req)) { + tcp_err_ack(tcp, mp, TPROTO, 0); + return; + } + seqnum = ((struct T_discon_req *)mp->b_rptr)->SEQ_number; + error = tcp_disconnect_common(tcp, seqnum); + if (error != 0) + tcp_err_ack(tcp, mp, error, 0); + else { + if (tcp->tcp_state >= TCPS_ESTABLISHED) { /* Send M_FLUSH according to TPI */ (void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW); } mp = mi_tpi_ok_ack_alloc(mp); if (mp) putnext(tcp->tcp_rq, mp); - return; - } else if (!tcp_eager_blowoff(tcp, seqnum)) { - tcp_err_ack(tcp, mp, TBADSEQ, 0); - return; } - if (tcp->tcp_state >= TCPS_ESTABLISHED) { - /* Send M_FLUSH according to TPI */ - (void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW); - } - mp = mi_tpi_ok_ack_alloc(mp); - if (mp) - putnext(tcp->tcp_rq, mp); } /* @@ -7566,6 +7182,24 @@ tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp) /* TODO: Default ETSDU is 1. Is that correct for tcp? */ } +static void +tcp_do_capability_ack(tcp_t *tcp, struct T_capability_ack *tcap, + t_uscalar_t cap_bits1) +{ + tcap->CAP_bits1 = 0; + + if (cap_bits1 & TC1_INFO) { + tcp_copy_info(&tcap->INFO_ack, tcp); + tcap->CAP_bits1 |= TC1_INFO; + } + + if (cap_bits1 & TC1_ACCEPTOR_ID) { + tcap->ACCEPTOR_id = tcp->tcp_acceptor_id; + tcap->CAP_bits1 |= TC1_ACCEPTOR_ID; + } + +} + /* * This routine responds to T_CAPABILITY_REQ messages. It is called by * tcp_wput. 
Much of the T_CAPABILITY_ACK information is copied from @@ -7591,17 +7225,7 @@ tcp_capability_req(tcp_t *tcp, mblk_t *mp) return; tcap = (struct T_capability_ack *)mp->b_rptr; - tcap->CAP_bits1 = 0; - - if (cap_bits1 & TC1_INFO) { - tcp_copy_info(&tcap->INFO_ack, tcp); - tcap->CAP_bits1 |= TC1_INFO; - } - - if (cap_bits1 & TC1_ACCEPTOR_ID) { - tcap->ACCEPTOR_id = tcp->tcp_acceptor_id; - tcap->CAP_bits1 |= TC1_ACCEPTOR_ID; - } + tcp_do_capability_ack(tcp, tcap, cap_bits1); putnext(tcp->tcp_rq, mp); } @@ -7822,10 +7446,12 @@ tcp_reinit(tcp_t *tcp) tcp->tcp_urp_mark_mp = NULL; } if (tcp->tcp_fused_sigurg_mp != NULL) { + ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp)); freeb(tcp->tcp_fused_sigurg_mp); tcp->tcp_fused_sigurg_mp = NULL; } if (tcp->tcp_ordrel_mp != NULL) { + ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp)); freeb(tcp->tcp_ordrel_mp); tcp->tcp_ordrel_mp = NULL; } @@ -7925,7 +7551,10 @@ tcp_reinit(tcp_t *tcp) tcp->tcp_ip_src_v6 = tcp->tcp_bound_source_v6; ASSERT(tcp->tcp_ptpbhn != NULL); - tcp->tcp_rq->q_hiwat = tcps->tcps_recv_hiwat; + if (!IPCL_IS_NONSTR(tcp->tcp_connp)) + tcp->tcp_rq->q_hiwat = tcps->tcps_recv_hiwat; + tcp->tcp_recv_hiwater = tcps->tcps_recv_hiwat; + tcp->tcp_recv_lowater = tcp_rinfo.mi_lowat; tcp->tcp_rwnd = tcps->tcps_recv_hiwat; tcp->tcp_mss = tcp->tcp_ipversion != IPV4_VERSION ? tcps->tcps_mss_def_ipv6 : tcps->tcps_mss_def_ipv4; @@ -7952,6 +7581,7 @@ tcp_reinit_values(tcp) #define PRESERVE(x) ((x) = (x)) #endif /* lint */ + PRESERVE(tcp->tcp_bind_hash_port); PRESERVE(tcp->tcp_bind_hash); PRESERVE(tcp->tcp_ptpbhn); PRESERVE(tcp->tcp_acceptor_hash); @@ -8239,6 +7869,8 @@ tcp_reinit_values(tcp) DONTCARE(tcp->tcmp_stk[0]); #endif + PRESERVE(tcp->tcp_connid); + #undef DONTCARE #undef PRESERVE @@ -9072,156 +8704,6 @@ noticmpv6: } /* - * IP recognizes seven kinds of bind requests: - * - * - A zero-length address binds only to the protocol number. 
- * - * - A 4-byte address is treated as a request to - * validate that the address is a valid local IPv4 - * address, appropriate for an application to bind to. - * IP does the verification, but does not make any note - * of the address at this time. - * - * - A 16-byte address contains is treated as a request - * to validate a local IPv6 address, as the 4-byte - * address case above. - * - * - A 16-byte sockaddr_in to validate the local IPv4 address and also - * use it for the inbound fanout of packets. - * - * - A 24-byte sockaddr_in6 to validate the local IPv6 address and also - * use it for the inbound fanout of packets. - * - * - A 12-byte address (ipa_conn_t) containing complete IPv4 fanout - * information consisting of local and remote addresses - * and ports. In this case, the addresses are both - * validated as appropriate for this operation, and, if - * so, the information is retained for use in the - * inbound fanout. - * - * - A 36-byte address address (ipa6_conn_t) containing complete IPv6 - * fanout information, like the 12-byte case above. - * - * IP will also fill in the IRE request mblk with information - * regarding our peer. In all cases, we notify IP of our protocol - * type by appending a single protocol byte to the bind request. 
- */ -static mblk_t * -tcp_ip_bind_mp(tcp_t *tcp, t_scalar_t bind_prim, t_scalar_t addr_length) -{ - char *cp; - mblk_t *mp; - struct T_bind_req *tbr; - ipa_conn_t *ac; - ipa6_conn_t *ac6; - sin_t *sin; - sin6_t *sin6; - - ASSERT(bind_prim == O_T_BIND_REQ || bind_prim == T_BIND_REQ); - ASSERT((tcp->tcp_family == AF_INET && - tcp->tcp_ipversion == IPV4_VERSION) || - (tcp->tcp_family == AF_INET6 && - (tcp->tcp_ipversion == IPV4_VERSION || - tcp->tcp_ipversion == IPV6_VERSION))); - - mp = allocb(sizeof (*tbr) + addr_length + 1, BPRI_HI); - if (!mp) - return (mp); - mp->b_datap->db_type = M_PROTO; - tbr = (struct T_bind_req *)mp->b_rptr; - tbr->PRIM_type = bind_prim; - tbr->ADDR_offset = sizeof (*tbr); - tbr->CONIND_number = 0; - tbr->ADDR_length = addr_length; - cp = (char *)&tbr[1]; - switch (addr_length) { - case sizeof (ipa_conn_t): - ASSERT(tcp->tcp_family == AF_INET); - ASSERT(tcp->tcp_ipversion == IPV4_VERSION); - - mp->b_cont = allocb(sizeof (ire_t), BPRI_HI); - if (mp->b_cont == NULL) { - freemsg(mp); - return (NULL); - } - mp->b_cont->b_wptr += sizeof (ire_t); - mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE; - - /* cp known to be 32 bit aligned */ - ac = (ipa_conn_t *)cp; - ac->ac_laddr = tcp->tcp_ipha->ipha_src; - ac->ac_faddr = tcp->tcp_remote; - ac->ac_fport = tcp->tcp_fport; - ac->ac_lport = tcp->tcp_lport; - tcp->tcp_hard_binding = 1; - break; - - case sizeof (ipa6_conn_t): - ASSERT(tcp->tcp_family == AF_INET6); - - mp->b_cont = allocb(sizeof (ire_t), BPRI_HI); - if (mp->b_cont == NULL) { - freemsg(mp); - return (NULL); - } - mp->b_cont->b_wptr += sizeof (ire_t); - mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE; - - /* cp known to be 32 bit aligned */ - ac6 = (ipa6_conn_t *)cp; - if (tcp->tcp_ipversion == IPV4_VERSION) { - IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, - &ac6->ac6_laddr); - } else { - ac6->ac6_laddr = tcp->tcp_ip6h->ip6_src; - } - ac6->ac6_faddr = tcp->tcp_remote_v6; - ac6->ac6_fport = tcp->tcp_fport; - ac6->ac6_lport = tcp->tcp_lport; 
- tcp->tcp_hard_binding = 1; - break; - - case sizeof (sin_t): - /* - * NOTE: IPV6_ADDR_LEN also has same size. - * Use family to discriminate. - */ - if (tcp->tcp_family == AF_INET) { - sin = (sin_t *)cp; - - *sin = sin_null; - sin->sin_family = AF_INET; - sin->sin_addr.s_addr = tcp->tcp_bound_source; - sin->sin_port = tcp->tcp_lport; - break; - } else { - *(in6_addr_t *)cp = tcp->tcp_bound_source_v6; - } - break; - - case sizeof (sin6_t): - ASSERT(tcp->tcp_family == AF_INET6); - sin6 = (sin6_t *)cp; - - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - sin6->sin6_addr = tcp->tcp_bound_source_v6; - sin6->sin6_port = tcp->tcp_lport; - break; - - case IP_ADDR_LEN: - ASSERT(tcp->tcp_ipversion == IPV4_VERSION); - *(uint32_t *)cp = tcp->tcp_ipha->ipha_src; - break; - - } - /* Add protocol number to end */ - cp[addr_length] = (char)IPPROTO_TCP; - mp->b_wptr = (uchar_t *)&cp[addr_length + 1]; - return (mp); -} - -/* * Notify IP that we are having trouble with this connection. IP should * blow the IRE away and start over. 
*/ @@ -9268,25 +8750,29 @@ tcp_ip_notify(tcp_t *tcp) /* Unlink and return any mblk that looks like it contains an ire */ static mblk_t * -tcp_ire_mp(mblk_t *mp) +tcp_ire_mp(mblk_t **mpp) { - mblk_t *prev_mp; + mblk_t *mp = *mpp; + mblk_t *prev_mp = NULL; for (;;) { - prev_mp = mp; - mp = mp->b_cont; - if (mp == NULL) - break; switch (DB_TYPE(mp)) { case IRE_DB_TYPE: case IRE_DB_REQ_TYPE: - if (prev_mp != NULL) + if (mp == *mpp) { + *mpp = mp->b_cont; + } else { prev_mp->b_cont = mp->b_cont; + } mp->b_cont = NULL; return (mp); default: break; } + prev_mp = mp; + mp = mp->b_cont; + if (mp == NULL) + break; } return (mp); } @@ -9408,10 +8894,10 @@ tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk) queue_t *q = tcp->tcp_rq; int32_t mss = tcp->tcp_mss; int maxpsz; + conn_t *connp = tcp->tcp_connp; if (TCP_IS_DETACHED(tcp)) return (mss); - if (tcp->tcp_fused) { maxpsz = tcp_fuse_maxpsz_set(tcp); mss = INFPSZ; @@ -9435,6 +8921,7 @@ tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk) * head to break down larger than SMSS writes into SMSS- * size mblks, up to tcp_maxpsz_multiplier mblks at a time. */ + /* XXX tune this with ndd tcp_maxpsz_multiplier */ maxpsz = tcp->tcp_maxpsz * mss; if (maxpsz > tcp->tcp_xmit_hiwater/2) { maxpsz = tcp->tcp_xmit_hiwater/2; @@ -9442,12 +8929,15 @@ tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk) maxpsz = MSS_ROUNDUP(maxpsz, mss); } } - (void) setmaxps(q, maxpsz); - tcp->tcp_wq->q_maxpsz = maxpsz; - if (set_maxblk) - (void) mi_set_sth_maxblk(q, mss); + (void) proto_set_maxpsz(q, connp, maxpsz); + if (!(IPCL_IS_NONSTR(connp))) { + /* XXX do it in set_maxpsz()? 
*/ + tcp->tcp_wq->q_maxpsz = maxpsz; + } + if (set_maxblk) + (void) proto_set_tx_maxblk(q, connp, mss); return (mss); } @@ -9687,116 +9177,74 @@ tcp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) return (tcp_open(q, devp, flag, sflag, credp, B_TRUE)); } -static int -tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, - boolean_t isv6) +static conn_t * +tcp_create_common(queue_t *q, cred_t *credp, boolean_t isv6, + boolean_t issocket, int *errorp) { tcp_t *tcp = NULL; conn_t *connp; int err; - vmem_t *minor_arena = NULL; - dev_t conn_dev; zoneid_t zoneid; - tcp_stack_t *tcps = NULL; + tcp_stack_t *tcps; + squeue_t *sqp; - if (q->q_ptr != NULL) - return (0); + ASSERT(errorp != NULL); + /* + * Find the proper zoneid and netstack. + */ + /* + * Special case for install: miniroot needs to be able to + * access files via NFS as though it were always in the + * global zone. + */ + if (credp == kcred && nfs_global_client_only != 0) { + zoneid = GLOBAL_ZONEID; + tcps = netstack_find_by_stackid(GLOBAL_NETSTACKID)-> + netstack_tcp; + ASSERT(tcps != NULL); + } else { + netstack_t *ns; - if (sflag == MODOPEN) - return (EINVAL); + ns = netstack_find_by_cred(credp); + ASSERT(ns != NULL); + tcps = ns->netstack_tcp; + ASSERT(tcps != NULL); - if (!(flag & SO_ACCEPTOR)) { /* - * Special case for install: miniroot needs to be able to - * access files via NFS as though it were always in the - * global zone. + * For exclusive stacks we set the zoneid to zero + * to make TCP operate as if in the global zone. 
*/ - if (credp == kcred && nfs_global_client_only != 0) { + if (tcps->tcps_netstack->netstack_stackid != + GLOBAL_NETSTACKID) zoneid = GLOBAL_ZONEID; - tcps = netstack_find_by_stackid(GLOBAL_NETSTACKID)-> - netstack_tcp; - ASSERT(tcps != NULL); - } else { - netstack_t *ns; - - ns = netstack_find_by_cred(credp); - ASSERT(ns != NULL); - tcps = ns->netstack_tcp; - ASSERT(tcps != NULL); - - /* - * For exclusive stacks we set the zoneid to zero - * to make TCP operate as if in the global zone. - */ - if (tcps->tcps_netstack->netstack_stackid != - GLOBAL_NETSTACKID) - zoneid = GLOBAL_ZONEID; - else - zoneid = crgetzoneid(credp); - } - /* - * For stackid zero this is done from strplumb.c, but - * non-zero stackids are handled here. - */ - if (tcps->tcps_g_q == NULL && - tcps->tcps_netstack->netstack_stackid != - GLOBAL_NETSTACKID) { - tcp_g_q_setup(tcps); - } - } - - if ((ip_minor_arena_la != NULL) && (flag & SO_SOCKSTR) && - ((conn_dev = inet_minor_alloc(ip_minor_arena_la)) != 0)) { - minor_arena = ip_minor_arena_la; - } else { - /* - * Either minor numbers in the large arena were exhausted - * or a non socket application is doing the open. - * Try to allocate from the small arena. - */ - if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) { - if (tcps != NULL) - netstack_rele(tcps->tcps_netstack); - return (EBUSY); - } - minor_arena = ip_minor_arena_sa; + else + zoneid = crgetzoneid(credp); } - ASSERT(minor_arena != NULL); - - *devp = makedevice(getemajor(*devp), (minor_t)conn_dev); - - if (flag & SO_ACCEPTOR) { - /* No netstack_find_by_cred, hence no netstack_rele needed */ - ASSERT(tcps == NULL); - q->q_qinfo = &tcp_acceptor_rinit; - /* - * the conn_dev and minor_arena will be subsequently used by - * tcp_wput_accept() and tcpclose_accept() to figure out the - * minor device number for this connection from the q_ptr. 
- */ - RD(q)->q_ptr = (void *)conn_dev; - WR(q)->q_qinfo = &tcp_acceptor_winit; - WR(q)->q_ptr = (void *)minor_arena; - qprocson(q); - return (0); + /* + * For stackid zero this is done from strplumb.c, but + * non-zero stackids are handled here. + */ + if (tcps->tcps_g_q == NULL && + tcps->tcps_netstack->netstack_stackid != + GLOBAL_NETSTACKID) { + tcp_g_q_setup(tcps); } - connp = (conn_t *)tcp_get_conn(IP_SQUEUE_GET(lbolt), tcps); + sqp = IP_SQUEUE_GET((uint_t)gethrtime()); + connp = (conn_t *)tcp_get_conn(sqp, tcps); /* * Both tcp_get_conn and netstack_find_by_cred incremented refcnt, * so we drop it by one. */ netstack_rele(tcps->tcps_netstack); if (connp == NULL) { - inet_minor_free(minor_arena, conn_dev); - q->q_ptr = NULL; - return (ENOSR); + *errorp = ENOSR; + return (NULL); } - connp->conn_sqp = IP_SQUEUE_GET(lbolt); + connp->conn_sqp = sqp; connp->conn_initial_sqp = connp->conn_sqp; tcp = connp->conn_tcp; - q->q_ptr = WR(q)->q_ptr = connp; if (isv6) { connp->conn_flags |= (IPCL_TCP6|IPCL_ISV6); connp->conn_send = ip_output_v6; @@ -9838,45 +9286,135 @@ tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, if (getpflags(NET_MAC_AWARE, credp) != 0) connp->conn_mac_exempt = B_TRUE; - connp->conn_dev = conn_dev; - connp->conn_minor_arena = minor_arena; + connp->conn_dev = NULL; + if (issocket) { + connp->conn_flags |= IPCL_SOCKET; + tcp->tcp_issocket = 1; + } - ASSERT(q->q_qinfo == &tcp_rinitv4 || q->q_qinfo == &tcp_rinitv6); - ASSERT(WR(q)->q_qinfo == &tcp_winit); + tcp->tcp_recv_hiwater = tcps->tcps_recv_hiwat; + tcp->tcp_rwnd = tcps->tcps_recv_hiwat; + tcp->tcp_recv_lowater = tcp_rinfo.mi_lowat; - if (flag & SO_SOCKSTR) { + /* Non-zero default values */ + connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; + + if (q == NULL) { /* - * No need to insert a socket in tcp acceptor hash. - * If it was a socket acceptor stream, we dealt with - * it above. A socket listener can never accept a - * connection and doesn't need acceptor_id. 
+ * Create a helper stream for non-STREAMS socket. */ - connp->conn_flags |= IPCL_SOCKET; - tcp->tcp_issocket = 1; - WR(q)->q_qinfo = &tcp_sock_winit; + err = ip_create_helper_stream(connp, tcps->tcps_ldi_ident); + if (err != 0) { + ip1dbg(("tcp_create: create of IP helper stream " + "failed\n")); + CONN_DEC_REF(connp); + *errorp = err; + return (NULL); + } + q = connp->conn_rq; } else { -#ifdef _ILP32 - tcp->tcp_acceptor_id = (t_uscalar_t)RD(q); -#else - tcp->tcp_acceptor_id = conn_dev; -#endif /* _ILP32 */ - tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp); + RD(q)->q_hiwat = tcps->tcps_recv_hiwat; } + SOCK_CONNID_INIT(tcp->tcp_connid); err = tcp_init(tcp, q); if (err != 0) { - inet_minor_free(connp->conn_minor_arena, connp->conn_dev); - tcp_acceptor_hash_remove(tcp); CONN_DEC_REF(connp); + *errorp = err; + return (NULL); + } + + return (connp); +} + +static int +tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, + boolean_t isv6) +{ + tcp_t *tcp = NULL; + conn_t *connp = NULL; + int err; + vmem_t *minor_arena = NULL; + dev_t conn_dev; + boolean_t issocket; + + if (q->q_ptr != NULL) + return (0); + + if (sflag == MODOPEN) + return (EINVAL); + + if ((ip_minor_arena_la != NULL) && (flag & SO_SOCKSTR) && + ((conn_dev = inet_minor_alloc(ip_minor_arena_la)) != 0)) { + minor_arena = ip_minor_arena_la; + } else { + /* + * Either minor numbers in the large arena were exhausted + * or a non socket application is doing the open. + * Try to allocate from the small arena. 
+ */ + if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) { + return (EBUSY); + } + minor_arena = ip_minor_arena_sa; + } + + ASSERT(minor_arena != NULL); + + *devp = makedevice(getmajor(*devp), (minor_t)conn_dev); + + if (flag & SO_FALLBACK) { + /* + * Non streams socket needs a stream to fallback to + */ + RD(q)->q_ptr = (void *)conn_dev; + WR(q)->q_qinfo = &tcp_fallback_sock_winit; + WR(q)->q_ptr = (void *)minor_arena; + qprocson(q); + return (0); + } else if (flag & SO_ACCEPTOR) { + q->q_qinfo = &tcp_acceptor_rinit; + /* + * the conn_dev and minor_arena will be subsequently used by + * tcp_wput_accept() and tcpclose_accept() to figure out the + * minor device number for this connection from the q_ptr. + */ + RD(q)->q_ptr = (void *)conn_dev; + WR(q)->q_qinfo = &tcp_acceptor_winit; + WR(q)->q_ptr = (void *)minor_arena; + qprocson(q); + return (0); + } + + issocket = flag & SO_SOCKSTR; + connp = tcp_create_common(q, credp, isv6, issocket, &err); + + if (connp == NULL) { + inet_minor_free(minor_arena, conn_dev); q->q_ptr = WR(q)->q_ptr = NULL; return (err); } - RD(q)->q_hiwat = tcps->tcps_recv_hiwat; - tcp->tcp_rwnd = tcps->tcps_recv_hiwat; + q->q_ptr = WR(q)->q_ptr = connp; + + connp->conn_dev = conn_dev; + connp->conn_minor_arena = minor_arena; + + ASSERT(q->q_qinfo == &tcp_rinitv4 || q->q_qinfo == &tcp_rinitv6); + ASSERT(WR(q)->q_qinfo == &tcp_winit); + + if (issocket) { + WR(q)->q_qinfo = &tcp_sock_winit; + } else { + tcp = connp->conn_tcp; +#ifdef _ILP32 + tcp->tcp_acceptor_id = (t_uscalar_t)RD(q); +#else + tcp->tcp_acceptor_id = conn_dev; +#endif /* _ILP32 */ + tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp); + } - /* Non-zero default values */ - connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; /* * Put the ref for TCP. Ref for IP was already put * by ipcl_conn_create. 
Also Make the conn_t globally @@ -9922,7 +9460,7 @@ tcp_allow_connopt_set(int level, int name) } /* - * This routine gets default values of certain options whose default + * this routine gets default values of certain options whose default * values are maintained by protocol specific code */ /* ARGSUSED */ @@ -9975,15 +9513,10 @@ tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr) return (sizeof (int)); } - -/* - * TCP routine to get the values of options. - */ -int -tcp_opt_get(queue_t *q, int level, int name, uchar_t *ptr) +static int +tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) { int *i1 = (int *)ptr; - conn_t *connp = Q_TO_CONN(q); tcp_t *tcp = connp->conn_tcp; ip6_pkt_t *ipp = &tcp->tcp_sticky_ipp; @@ -10028,7 +9561,7 @@ tcp_opt_get(queue_t *q, int level, int name, uchar_t *ptr) *i1 = tcp->tcp_xmit_hiwater; break; case SO_RCVBUF: - *i1 = RD(q)->q_hiwat; + *i1 = tcp->tcp_recv_hiwater; break; case SO_SND_COPYAVOID: *i1 = tcp->tcp_snd_zcopy_on ? @@ -10052,6 +9585,8 @@ tcp_opt_get(queue_t *q, int level, int name, uchar_t *ptr) case SO_DOMAIN: *i1 = tcp->tcp_family; break; + case SO_ACCEPTCONN: + *i1 = (tcp->tcp_state == TCPS_LISTEN); default: return (-1); } @@ -10293,22 +9828,84 @@ tcp_opt_get(queue_t *q, int level, int name, uchar_t *ptr) } /* + * TCP routine to get the values of options. 
+ */ +int +tcp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr) +{ + return (tcp_opt_get(Q_TO_CONN(q), level, name, ptr)); +} + +/* returns UNIX error, the optlen is a value-result arg */ +int +tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, + void *optvalp, socklen_t *optlen, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + squeue_t *sqp = connp->conn_sqp; + int error; + t_uscalar_t max_optbuf_len; + void *optvalp_buf; + int len; + + error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len, + tcp_opt_obj.odb_opt_des_arr, + tcp_opt_obj.odb_opt_arr_cnt, + tcp_opt_obj.odb_topmost_tpiprovider, + B_FALSE, B_TRUE, cr); + if (error != 0) { + if (error < 0) { + error = proto_tlitosyserr(-error); + } + return (error); + } + + optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP); + + error = squeue_synch_enter(sqp, connp, 0); + if (error == ENOMEM) { + return (ENOMEM); + } + + len = tcp_opt_get(connp, level, option_name, optvalp_buf); + squeue_synch_exit(sqp, connp); + + if (len < 0) { + /* + * Pass on to IP + */ + kmem_free(optvalp_buf, max_optbuf_len); + return (ip_get_options(connp, level, option_name, + optvalp, optlen, cr)); + } else { + /* + * update optlen and copy option value + */ + t_uscalar_t size = MIN(len, *optlen); + bcopy(optvalp_buf, optvalp, size); + bcopy(&size, optlen, sizeof (size)); + + kmem_free(optvalp_buf, max_optbuf_len); + return (0); + } +} + +/* * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements. * Parameters are assumed to be verified by the caller. 
*/ /* ARGSUSED */ int -tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name, +tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, - void *thisdg_attrs, cred_t *cr, mblk_t *mblk) + void *thisdg_attrs, cred_t *cr) { - conn_t *connp = Q_TO_CONN(q); tcp_t *tcp = connp->conn_tcp; int *i1 = (int *)invalp; boolean_t onoff = (*i1 == 0) ? 0 : 1; boolean_t checkonly; int reterr; - tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; + tcp_stack_t *tcps = tcp->tcp_tcps; switch (optset_context) { case SETFN_OPTCOM_CHECKONLY: @@ -10371,7 +9968,6 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name, * of passed in length is done. It is assumed *_optcom_req() * routines do the right thing. */ - switch (level) { case SOL_SOCKET: switch (name) { @@ -10408,7 +10004,7 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name, break; case SO_KEEPALIVE: if (checkonly) { - /* T_CHECK case */ + /* check only case */ break; } @@ -10462,8 +10058,11 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name, } break; case SO_OOBINLINE: - if (!checkonly) + if (!checkonly) { tcp->tcp_oobinline = onoff; + if (IPCL_IS_NONSTR(tcp->tcp_connp)) + proto_set_rx_oob_opt(connp, onoff); + } break; case SO_DGRAM_ERRIND: if (!checkonly) @@ -10740,7 +10339,6 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name, /* * Only sticky options; no ancillary data */ - ASSERT(thisdg_attrs == NULL); ipp = &tcp->tcp_sticky_ipp; switch (name) { @@ -10764,22 +10362,15 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name, (uint8_t)*i1; ipp->ipp_fields |= IPPF_UNICAST_HOPS; } - reterr = tcp_build_hdrs(q, tcp); + reterr = tcp_build_hdrs(tcp); if (reterr != 0) return (reterr); } break; case IPV6_BOUND_IF: if (!checkonly) { - int error = 0; - tcp->tcp_bound_if = *i1; - error = ip_opt_set_ill(tcp->tcp_connp, *i1, - B_TRUE, checkonly, level, name, mblk); - if (error 
!= 0) { - *outlenp = 0; - return (error); - } + PASS_OPT_TO_IP(connp); } break; /* @@ -10795,6 +10386,7 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name, ~TCP_IPV6_RECVPKTINFO; /* Force it to be sent up with the next msg */ tcp->tcp_recvifindex = 0; + PASS_OPT_TO_IP(connp); } break; case IPV6_RECVTCLASS: @@ -10805,6 +10397,7 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name, else tcp->tcp_ipv6_recvancillary &= ~TCP_IPV6_RECVTCLASS; + PASS_OPT_TO_IP(connp); } break; case IPV6_RECVHOPLIMIT: @@ -10817,6 +10410,7 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name, ~TCP_IPV6_RECVHOPLIMIT; /* Force it to be sent up with the next msg */ tcp->tcp_recvhops = 0xffffffffU; + PASS_OPT_TO_IP(connp); } break; case IPV6_RECVHOPOPTS: @@ -10827,6 +10421,7 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name, else tcp->tcp_ipv6_recvancillary &= ~TCP_IPV6_RECVHOPOPTS; + PASS_OPT_TO_IP(connp); } break; case IPV6_RECVDSTOPTS: @@ -10837,6 +10432,7 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name, else tcp->tcp_ipv6_recvancillary &= ~TCP_IPV6_RECVDSTOPTS; + PASS_OPT_TO_IP(connp); } break; case _OLD_IPV6_RECVDSTOPTS: @@ -10857,6 +10453,7 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name, else tcp->tcp_ipv6_recvancillary &= ~TCP_IPV6_RECVRTHDR; + PASS_OPT_TO_IP(connp); } break; case IPV6_RECVRTHDRDSTOPTS: @@ -10867,6 +10464,7 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name, else tcp->tcp_ipv6_recvancillary &= ~TCP_IPV6_RECVRTDSTOPTS; + PASS_OPT_TO_IP(connp); } break; case IPV6_PKTINFO: @@ -10890,11 +10488,11 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name, if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr)) return (EINVAL); /* - * ip6_set_pktinfo() validates the source - * address and interface index. + * IP will validate the source address and + * interface index. 
*/ - reterr = ip6_set_pktinfo(cr, tcp->tcp_connp, - pkti, mblk); + reterr = ip_set_options(tcp->tcp_connp, level, + name, invalp, inlen, cr); if (reterr != 0) return (reterr); ipp->ipp_ifindex = pkti->ipi6_ifindex; @@ -10908,9 +10506,10 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name, else ipp->ipp_fields &= ~IPPF_ADDR; } - reterr = tcp_build_hdrs(q, tcp); + reterr = tcp_build_hdrs(tcp); if (reterr != 0) return (reterr); + PASS_OPT_TO_IP(connp); break; case IPV6_TCLASS: if (inlen != 0 && inlen != sizeof (int)) @@ -10931,7 +10530,7 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name, } ipp->ipp_fields |= IPPF_TCLASS; } - reterr = tcp_build_hdrs(q, tcp); + reterr = tcp_build_hdrs(tcp); if (reterr != 0) return (reterr); break; @@ -10962,9 +10561,10 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name, else ipp->ipp_fields &= ~IPPF_NEXTHOP; } - reterr = tcp_build_hdrs(q, tcp); + reterr = tcp_build_hdrs(tcp); if (reterr != 0) return (reterr); + PASS_OPT_TO_IP(connp); break; case IPV6_HOPOPTS: { ip6_hbh_t *hopts = (ip6_hbh_t *)invalp; @@ -10989,7 +10589,7 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name, ipp->ipp_fields &= ~IPPF_HOPOPTS; else ipp->ipp_fields |= IPPF_HOPOPTS; - reterr = tcp_build_hdrs(q, tcp); + reterr = tcp_build_hdrs(tcp); if (reterr != 0) return (reterr); break; @@ -11017,7 +10617,7 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name, ipp->ipp_fields &= ~IPPF_RTDSTOPTS; else ipp->ipp_fields |= IPPF_RTDSTOPTS; - reterr = tcp_build_hdrs(q, tcp); + reterr = tcp_build_hdrs(tcp); if (reterr != 0) return (reterr); break; @@ -11045,7 +10645,7 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name, ipp->ipp_fields &= ~IPPF_DSTOPTS; else ipp->ipp_fields |= IPPF_DSTOPTS; - reterr = tcp_build_hdrs(q, tcp); + reterr = tcp_build_hdrs(tcp); if (reterr != 0) return (reterr); break; @@ -11073,14 +10673,15 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int 
level, int name, ipp->ipp_fields &= ~IPPF_RTHDR; else ipp->ipp_fields |= IPPF_RTHDR; - reterr = tcp_build_hdrs(q, tcp); + reterr = tcp_build_hdrs(tcp); if (reterr != 0) return (reterr); break; } case IPV6_V6ONLY: - if (!checkonly) + if (!checkonly) { tcp->tcp_connp->conn_ipv6_v6only = onoff; + } break; case IPV6_USE_MIN_MTU: if (inlen != sizeof (int)) @@ -11140,6 +10741,80 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name, return (0); } +/* ARGSUSED */ +int +tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name, + uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, + void *thisdg_attrs, cred_t *cr, mblk_t *mblk) +{ + conn_t *connp = Q_TO_CONN(q); + + return (tcp_opt_set(connp, optset_context, level, name, inlen, invalp, + outlenp, outvalp, thisdg_attrs, cr)); +} + +int +tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, + const void *optvalp, socklen_t optlen, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + squeue_t *sqp = connp->conn_sqp; + int error; + + /* + * Entering the squeue synchronously can result in a context switch, + * which can cause a rather sever performance degradation. So we try to + * handle whatever options we can without entering the squeue. + */ + if (level == IPPROTO_TCP) { + switch (option_name) { + case TCP_NODELAY: + if (optlen != sizeof (int32_t)) + return (EINVAL); + mutex_enter(&connp->conn_tcp->tcp_non_sq_lock); + connp->conn_tcp->tcp_naglim = *(int *)optvalp ? 
1 : + connp->conn_tcp->tcp_mss; + mutex_exit(&connp->conn_tcp->tcp_non_sq_lock); + return (0); + default: + break; + } + } + + error = squeue_synch_enter(sqp, connp, 0); + if (error == ENOMEM) { + return (ENOMEM); + } + + error = proto_opt_check(level, option_name, optlen, NULL, + tcp_opt_obj.odb_opt_des_arr, + tcp_opt_obj.odb_opt_arr_cnt, + tcp_opt_obj.odb_topmost_tpiprovider, + B_TRUE, B_FALSE, cr); + + if (error != 0) { + if (error < 0) { + error = proto_tlitosyserr(-error); + } + squeue_synch_exit(sqp, connp); + return (error); + } + + error = tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name, + optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp, + NULL, cr); + squeue_synch_exit(sqp, connp); + + if (error < 0) { + /* + * Pass on to ip + */ + error = ip_set_options(connp, level, option_name, optvalp, + optlen, cr); + } + return (error); +} + /* * Update tcp_sticky_hdrs based on tcp_sticky_ipp. * The headers include ip6i_t (if needed), ip6_t, any sticky extension @@ -11148,7 +10823,7 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name, * Returns failure if can't allocate memory. 
*/ static int -tcp_build_hdrs(queue_t *q, tcp_t *tcp) +tcp_build_hdrs(tcp_t *tcp) { char *hdrs; uint_t hdrs_len; @@ -11157,6 +10832,7 @@ tcp_build_hdrs(queue_t *q, tcp_t *tcp) ip6_pkt_t *ipp = &tcp->tcp_sticky_ipp; in6_addr_t src, dst; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; /* * save the existing tcp header and source/dest IP addresses @@ -11241,7 +10917,8 @@ tcp_build_hdrs(queue_t *q, tcp_t *tcp) } /* Try to get everything in a single mblk */ - (void) mi_set_sth_wroff(RD(q), hdrs_len + tcps->tcps_wroff_xtra); + (void) proto_set_tx_wroff(tcp->tcp_rq, connp, + hdrs_len + tcps->tcps_wroff_xtra); return (0); } @@ -11368,6 +11045,7 @@ tcp_opt_set_header(tcp_t *tcp, boolean_t checkonly, uchar_t *ptr, uint_t len) uint8_t *ip_optp; tcph_t *new_tcph; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; if ((len > TCP_MAX_IP_OPTIONS_LENGTH) || (len & 0x3)) return (EINVAL); @@ -11408,7 +11086,7 @@ tcp_opt_set_header(tcp_t *tcp, boolean_t checkonly, uchar_t *ptr, uint_t len) tcp->tcp_hdr_len = len + tcph_len; if (!TCP_IS_DETACHED(tcp)) { /* Always allocate room for all options. */ - (void) mi_set_sth_wroff(tcp->tcp_rq, + (void) proto_set_tx_wroff(tcp->tcp_rq, connp, TCP_MAX_COMBINED_HEADER_LENGTH + tcps->tcps_wroff_xtra); } return (0); @@ -11721,26 +11399,55 @@ tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp) tcp->tcp_reass_tail = mp; } +static uint_t +tcp_rwnd_reopen(tcp_t *tcp) +{ + uint_t ret = 0; + uint_t thwin; + + /* Learn the latest rwnd information that we sent to the other side. */ + thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win)) + << tcp->tcp_rcv_ws; + /* This is peer's calculated send window (our receive window). */ + thwin -= tcp->tcp_rnxt - tcp->tcp_rack; + /* + * Increase the receive window to max. But we need to do receiver + * SWS avoidance. This means that we need to check the increase of + * of receive window is at least 1 MSS. 
+ */ + if (tcp->tcp_recv_hiwater - thwin >= tcp->tcp_mss) { + /* + * If the window that the other side knows is less than max + * deferred acks segments, send an update immediately. + */ + if (thwin < tcp->tcp_rack_cur_max * tcp->tcp_mss) { + BUMP_MIB(&tcp->tcp_tcps->tcps_mib, tcpOutWinUpdate); + ret = TH_ACK_NEEDED; + } + tcp->tcp_rwnd = tcp->tcp_recv_hiwater; + } + return (ret); +} + /* * Send up all messages queued on tcp_rcv_list. */ static uint_t -tcp_rcv_drain(queue_t *q, tcp_t *tcp) +tcp_rcv_drain(tcp_t *tcp) { mblk_t *mp; uint_t ret = 0; - uint_t thwin; #ifdef DEBUG uint_t cnt = 0; #endif - tcp_stack_t *tcps = tcp->tcp_tcps; + queue_t *q = tcp->tcp_rq; /* Can't drain on an eager connection */ if (tcp->tcp_listener != NULL) return (ret); - /* Can't be sodirect enabled */ - ASSERT(SOD_NOT_ENABLED(tcp)); + /* Can't be a non-STREAMS connection or sodirect enabled */ + ASSERT((!IPCL_IS_NONSTR(tcp->tcp_connp)) && SOD_NOT_ENABLED(tcp)); /* No need for the push timer now. */ if (tcp->tcp_push_tid != 0) { @@ -11758,7 +11465,8 @@ tcp_rcv_drain(queue_t *q, tcp_t *tcp) * some work. */ if ((tcp->tcp_fused || tcp->tcp_fused_sigurg)) { - ASSERT(tcp->tcp_fused_sigurg_mp != NULL); + ASSERT(IPCL_IS_NONSTR(tcp->tcp_connp) || + tcp->tcp_fused_sigurg_mp != NULL); if (tcp_fuse_rcv_drain(q, tcp, tcp->tcp_fused ? NULL : &tcp->tcp_fused_sigurg_mp)) return (ret); @@ -11779,32 +11487,16 @@ tcp_rcv_drain(queue_t *q, tcp_t *tcp) } putnext(q, mp); } +#ifdef DEBUG ASSERT(cnt == tcp->tcp_rcv_cnt); +#endif tcp->tcp_rcv_last_head = NULL; tcp->tcp_rcv_last_tail = NULL; tcp->tcp_rcv_cnt = 0; - /* Learn the latest rwnd information that we sent to the other side. */ - thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win)) - << tcp->tcp_rcv_ws; - /* This is peer's calculated send window (our receive window). */ - thwin -= tcp->tcp_rnxt - tcp->tcp_rack; - /* - * Increase the receive window to max. But we need to do receiver - * SWS avoidance. 
This means that we need to check the increase of - * of receive window is at least 1 MSS. - */ - if (canputnext(q) && (q->q_hiwat - thwin >= tcp->tcp_mss)) { - /* - * If the window that the other side knows is less than max - * deferred acks segments, send an update immediately. - */ - if (thwin < tcp->tcp_rack_cur_max * tcp->tcp_mss) { - BUMP_MIB(&tcps->tcps_mib, tcpOutWinUpdate); - ret = TH_ACK_NEEDED; - } - tcp->tcp_rwnd = q->q_hiwat; - } + if (canputnext(q)) + return (tcp_rwnd_reopen(tcp)); + return (ret); } @@ -12993,8 +12685,27 @@ tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2) tcp->tcp_remote)] = tcp->tcp_remote; } mutex_exit(&listener->tcp_eager_lock); - if (need_send_conn_ind) - putnext(listener->tcp_rq, mp); + if (need_send_conn_ind) { + if (IPCL_IS_NONSTR(lconnp)) { + ASSERT(tcp->tcp_listener == listener); + ASSERT(tcp->tcp_saved_listener == listener); + if ((*lconnp->conn_upcalls->su_newconn) + (lconnp->conn_upper_handle, + (sock_lower_handle_t)tcp->tcp_connp, + &sock_tcp_downcalls, DB_CRED(mp), DB_CPID(mp), + &tcp->tcp_connp->conn_upcalls) != NULL) { + /* + * Keep the message around + * in case of fallback + */ + tcp->tcp_conn.tcp_eager_conn_ind = mp; + } else { + freemsg(mp); + } + } else { + putnext(listener->tcp_rq, mp); + } + } } mblk_t * @@ -13223,6 +12934,7 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) rptr = mp->b_rptr; } ASSERT(DB_TYPE(mp) == M_DATA); + ASSERT(mp->b_next == NULL); tcph = (tcph_t *)&rptr[ip_hdr_len]; seg_seq = ABE32_TO_U32(tcph->th_seq); @@ -13339,8 +13051,8 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) * The following changes our rwnd to be a multiple of the * MIN(peer MSS, our MSS) for performance reason. */ - (void) tcp_rwnd_set(tcp, MSS_ROUNDUP(tcp->tcp_rq->q_hiwat, - tcp->tcp_mss)); + (void) tcp_rwnd_set(tcp, + MSS_ROUNDUP(tcp->tcp_recv_hiwater, tcp->tcp_mss)); /* Is the other end ECN capable? 
*/ if (tcp->tcp_ecn_ok) { @@ -13361,12 +13073,13 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) if (!TCP_IS_DETACHED(tcp)) { /* Allocate room for SACK options if needed. */ if (tcp->tcp_snd_sack_ok) { - (void) mi_set_sth_wroff(tcp->tcp_rq, - tcp->tcp_hdr_len + TCPOPT_MAX_SACK_LEN + + (void) proto_set_tx_wroff(tcp->tcp_rq, connp, + tcp->tcp_hdr_len + + TCPOPT_MAX_SACK_LEN + (tcp->tcp_loopback ? 0 : tcps->tcps_wroff_xtra)); } else { - (void) mi_set_sth_wroff(tcp->tcp_rq, + (void) proto_set_tx_wroff(tcp->tcp_rq, connp, tcp->tcp_hdr_len + (tcp->tcp_loopback ? 0 : tcps->tcps_wroff_xtra)); @@ -13466,8 +13179,18 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) BUMP_LOCAL(tcp->tcp_obsegs); BUMP_MIB(&tcps->tcps_mib, tcpOutAck); - /* Send up T_CONN_CON */ - putnext(tcp->tcp_rq, mp1); + if (!IPCL_IS_NONSTR(connp)) { + /* Send up T_CONN_CON */ + putnext(tcp->tcp_rq, mp1); + } else { + (*connp->conn_upcalls-> + su_connected) + (connp->conn_upper_handle, + tcp->tcp_connid, + DB_CRED(mp1), + DB_CPID(mp1)); + freemsg(mp1); + } freemsg(mp); return; @@ -13481,7 +13204,15 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) */ TCP_STAT(tcps, tcp_fusion_unfusable); tcp->tcp_unfusable = B_TRUE; - putnext(tcp->tcp_rq, mp1); + if (!IPCL_IS_NONSTR(connp)) { + putnext(tcp->tcp_rq, mp1); + } else { + (*connp->conn_upcalls->su_connected) + (connp->conn_upper_handle, + tcp->tcp_connid, DB_CRED(mp1), + DB_CPID(mp1)); + freemsg(mp1); + } } /* @@ -13835,31 +13566,40 @@ try_again:; if ((flags & TH_URG) && (!tcp->tcp_urp_last_valid || SEQ_GT(urp + seg_seq, tcp->tcp_urp_last))) { - mp1 = allocb(0, BPRI_MED); - if (mp1 == NULL) { - freemsg(mp); - return; - } - if (!TCP_IS_DETACHED(tcp) && - !putnextctl1(tcp->tcp_rq, M_PCSIG, - SIGURG)) { - /* Try again on the rexmit. 
*/ - freemsg(mp1); - freemsg(mp); - return; + if (IPCL_IS_NONSTR(connp)) { + if (!TCP_IS_DETACHED(tcp)) { + (*connp->conn_upcalls-> + su_signal_oob) + (connp->conn_upper_handle, + urp); + } + } else { + mp1 = allocb(0, BPRI_MED); + if (mp1 == NULL) { + freemsg(mp); + return; + } + if (!TCP_IS_DETACHED(tcp) && + !putnextctl1(tcp->tcp_rq, + M_PCSIG, SIGURG)) { + /* Try again on the rexmit. */ + freemsg(mp1); + freemsg(mp); + return; + } + /* + * If the next byte would be the mark + * then mark with MARKNEXT else mark + * with NOTMARKNEXT. + */ + if (gap == 0 && urp == 0) + mp1->b_flag |= MSGMARKNEXT; + else + mp1->b_flag |= MSGNOTMARKNEXT; + freemsg(tcp->tcp_urp_mark_mp); + tcp->tcp_urp_mark_mp = mp1; + flags |= TH_SEND_URP_MARK; } - /* - * If the next byte would be the mark - * then mark with MARKNEXT else mark - * with NOTMARKNEXT. - */ - if (gap == 0 && urp == 0) - mp1->b_flag |= MSGMARKNEXT; - else - mp1->b_flag |= MSGNOTMARKNEXT; - freemsg(tcp->tcp_urp_mark_mp); - tcp->tcp_urp_mark_mp = mp1; - flags |= TH_SEND_URP_MARK; tcp->tcp_urp_last_valid = B_TRUE; tcp->tcp_urp_last = urp + seg_seq; } @@ -14070,50 +13810,60 @@ ok:; if (flags & TH_URG && urp >= 0) { if (!tcp->tcp_urp_last_valid || SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) { - /* - * If we haven't generated the signal yet for this - * urgent pointer value, do it now. Also, send up a - * zero-length M_DATA indicating whether or not this is - * the mark. The latter is not needed when a - * T_EXDATA_IND is sent up. However, if there are - * allocation failures this code relies on the sender - * retransmitting and the socket code for determining - * the mark should not block waiting for the peer to - * transmit. Thus, for simplicity we always send up the - * mark indication. - */ - mp1 = allocb(0, BPRI_MED); - if (mp1 == NULL) { - freemsg(mp); - return; - } - if (!TCP_IS_DETACHED(tcp) && - !putnextctl1(tcp->tcp_rq, M_PCSIG, SIGURG)) { - /* Try again on the rexmit. 
*/ - freemsg(mp1); - freemsg(mp); - return; - } - /* - * Mark with NOTMARKNEXT for now. - * The code below will change this to MARKNEXT - * if we are at the mark. - * - * If there are allocation failures (e.g. in dupmsg - * below) the next time tcp_rput_data sees the urgent - * segment it will send up the MSG*MARKNEXT message. - */ - mp1->b_flag |= MSGNOTMARKNEXT; - freemsg(tcp->tcp_urp_mark_mp); - tcp->tcp_urp_mark_mp = mp1; - flags |= TH_SEND_URP_MARK; + if (IPCL_IS_NONSTR(connp)) { + if (!TCP_IS_DETACHED(tcp)) { + (*connp->conn_upcalls->su_signal_oob) + (connp->conn_upper_handle, urp); + } + } else { + /* + * If we haven't generated the signal yet for + * this urgent pointer value, do it now. Also, + * send up a zero-length M_DATA indicating + * whether or not this is the mark. The latter + * is not needed when a T_EXDATA_IND is sent up. + * However, if there are allocation failures + * this code relies on the sender retransmitting + * and the socket code for determining the mark + * should not block waiting for the peer to + * transmit. Thus, for simplicity we always + * send up the mark indication. + */ + mp1 = allocb(0, BPRI_MED); + if (mp1 == NULL) { + freemsg(mp); + return; + } + if (!TCP_IS_DETACHED(tcp) && + !putnextctl1(tcp->tcp_rq, M_PCSIG, + SIGURG)) { + /* Try again on the rexmit. */ + freemsg(mp1); + freemsg(mp); + return; + } + /* + * Mark with NOTMARKNEXT for now. + * The code below will change this to MARKNEXT + * if we are at the mark. + * + * If there are allocation failures (e.g. in + * dupmsg below) the next time tcp_rput_data + * sees the urgent segment it will send up the + * MSGMARKNEXT message. 
+ */ + mp1->b_flag |= MSGNOTMARKNEXT; + freemsg(tcp->tcp_urp_mark_mp); + tcp->tcp_urp_mark_mp = mp1; + flags |= TH_SEND_URP_MARK; #ifdef DEBUG - (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, - "tcp_rput: sent M_PCSIG 2 seq %x urp %x " - "last %x, %s", - seg_seq, urp, tcp->tcp_urp_last, - tcp_display(tcp, NULL, DISP_PORT_ONLY)); + (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, + "tcp_rput: sent M_PCSIG 2 seq %x urp %x " + "last %x, %s", + seg_seq, urp, tcp->tcp_urp_last, + tcp_display(tcp, NULL, DISP_PORT_ONLY)); #endif /* DEBUG */ + } tcp->tcp_urp_last_valid = B_TRUE; tcp->tcp_urp_last = urp + seg_seq; } else if (tcp->tcp_urp_mark_mp != NULL) { @@ -14218,7 +13968,15 @@ ok:; * This segment contains only the urgent byte. We * have to allocate the T_exdata_ind, if we can. */ - if (!tcp->tcp_urp_mp) { + if (IPCL_IS_NONSTR(connp)) { + int error; + + (*connp->conn_upcalls->su_recv) + (connp->conn_upper_handle, mp, seg_len, + MSG_OOB, &error, NULL); + mp = NULL; + goto update_ack; + } else if (!tcp->tcp_urp_mp) { struct T_exdata_ind *tei; mp1 = allocb(sizeof (struct T_exdata_ind), BPRI_MED); @@ -14299,15 +14057,16 @@ ok:; seg_len, flags, tcp_display(tcp, NULL, DISP_PORT_ONLY)); #endif /* DEBUG */ - } else { - /* Data left until we hit mark */ + } #ifdef DEBUG + else { + /* Data left until we hit mark */ (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, "tcp_rput: URP %d bytes left, %s", urp - seg_len, tcp_display(tcp, NULL, DISP_PORT_ONLY)); -#endif /* DEBUG */ } +#endif /* DEBUG */ } process_ack: @@ -15194,6 +14953,7 @@ est: mp = mp->b_cont; freeb(mp1); } +update_ack: tcph = tcp->tcp_tcph; tcp->tcp_rack_cnt++; { @@ -15239,6 +14999,9 @@ est: tcp->tcp_rnxt += seg_len; U32_TO_ABE32(tcp->tcp_rnxt, tcph->th_ack); + if (mp == NULL) + goto xmit_check; + /* Update SACK list */ if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { tcp_sack_remove(tcp->tcp_sack_list, tcp->tcp_rnxt, @@ -15297,17 +15060,28 @@ est: if (!(sodp->sod_state & SOD_ENABLED) || (tcp->tcp_kssl_ctx != NULL && DB_TYPE(mp) 
== M_DATA)) { - mutex_exit(sodp->sod_lockp); sodp = NULL; } + mutex_exit(sodp->sod_lockp); } if (mp->b_datap->db_type != M_DATA || (flags & TH_MARKNEXT_NEEDED)) { - if (sodp != NULL) { - if (sodp->sod_uioa.uioa_state & UIOA_ENABLED) { - sodp->sod_uioa.uioa_state &= UIOA_CLR; - sodp->sod_uioa.uioa_state |= UIOA_FINI; + if (IPCL_IS_NONSTR(connp)) { + int error; + + if ((*connp->conn_upcalls->su_recv) + (connp->conn_upper_handle, mp, + seg_len, 0, &error, NULL) <= 0) { + if (error == ENOSPC) { + tcp->tcp_rwnd -= seg_len; + } else if (error == EOPNOTSUPP) { + tcp_rcv_enqueue(tcp, mp, + seg_len); + } } + } else if (sodp != NULL) { + mutex_enter(sodp->sod_lockp); + SOD_UIOAFINI(sodp); if (!SOD_QEMPTY(sodp) && (sodp->sod_state & SOD_WAKE_NOT)) { flags |= tcp_rcv_sod_wakeup(tcp, sodp); @@ -15316,7 +15090,7 @@ est: mutex_exit(sodp->sod_lockp); } } else if (tcp->tcp_rcv_list != NULL) { - flags |= tcp_rcv_drain(tcp->tcp_rq, tcp); + flags |= tcp_rcv_drain(tcp); } ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); @@ -15338,23 +15112,44 @@ est: DTRACE_PROBE1(kssl_mblk__ksslinput_data1, mblk_t *, mp); tcp_kssl_input(tcp, mp); - } else { + } else if (!IPCL_IS_NONSTR(connp)) { + /* Already handled non-STREAMS case. */ putnext(tcp->tcp_rq, mp); if (!canputnext(tcp->tcp_rq)) tcp->tcp_rwnd -= seg_len; } } else if ((tcp->tcp_kssl_ctx != NULL) && (DB_TYPE(mp) == M_DATA)) { - /* Do SSL processing first */ - DTRACE_PROBE1(kssl_mblk__ksslinput_data2, - mblk_t *, mp); + /* Does this need SSL processing first? 
*/ + DTRACE_PROBE1(kssl_mblk__ksslinput_data2, mblk_t *, mp); tcp_kssl_input(tcp, mp); + } else if (IPCL_IS_NONSTR(connp)) { + /* Non-STREAMS socket */ + boolean_t push = flags & (TH_PUSH|TH_FIN); + int error; + + if ((*connp->conn_upcalls->su_recv)( + connp->conn_upper_handle, + mp, seg_len, 0, &error, &push) <= 0) { + if (error == ENOSPC) { + tcp->tcp_rwnd -= seg_len; + } else if (error == EOPNOTSUPP) { + tcp_rcv_enqueue(tcp, mp, seg_len); + } + } else if (push) { + /* + * PUSH bit set and sockfs is not + * flow controlled + */ + flags |= tcp_rwnd_reopen(tcp); + } } else if (sodp != NULL) { /* * Sodirect so all mblk_t's are queued on the * socket directly, check for wakeup of blocked * reader (if any), and last if flow-controled. */ + mutex_enter(sodp->sod_lockp); flags |= tcp_rcv_sod_enqueue(tcp, sodp, mp, seg_len); if ((sodp->sod_state & SOD_WAKE_NEED) || (flags & (TH_PUSH|TH_FIN))) { @@ -15368,7 +15163,7 @@ est: mutex_exit(sodp->sod_lockp); } } else if ((flags & (TH_PUSH|TH_FIN)) || - tcp->tcp_rcv_cnt + seg_len >= tcp->tcp_rq->q_hiwat >> 3) { + tcp->tcp_rcv_cnt + seg_len >= tcp->tcp_recv_hiwater >> 3) { if (tcp->tcp_rcv_list != NULL) { /* * Enqueue the new segment first and then @@ -15379,12 +15174,12 @@ est: * This way can remove the else part later * on. * - * We don't this to avoid one more call to + * We don't do this to avoid one more call to * canputnext() as tcp_rcv_drain() needs to * call canputnext(). */ tcp_rcv_enqueue(tcp, mp, seg_len); - flags |= tcp_rcv_drain(tcp->tcp_rq, tcp); + flags |= tcp_rcv_drain(tcp); } else { putnext(tcp->tcp_rq, mp); if (!canputnext(tcp->tcp_rq)) @@ -15394,6 +15189,8 @@ est: /* * Enqueue all packets when processing an mblk * from the co queue and also enqueue normal packets. + * For packets which belong to SSL stream do SSL + * processing first. */ tcp_rcv_enqueue(tcp, mp, seg_len); } @@ -15409,7 +15206,8 @@ est: * such that the Q is empty now even though data was added * above. 
*/ - if (((sodp != NULL && !SOD_QEMPTY(sodp) && + if (!IPCL_IS_NONSTR(connp) && + ((sodp != NULL && !SOD_QEMPTY(sodp) && (sodp->sod_state & SOD_WAKE_NOT)) || (sodp == NULL && tcp->tcp_rcv_list != NULL)) && tcp->tcp_push_tid == 0) { @@ -15495,6 +15293,7 @@ xmit_check: ack_check: if (flags & TH_SEND_URP_MARK) { ASSERT(tcp->tcp_urp_mark_mp); + ASSERT(!IPCL_IS_NONSTR(connp)); /* * Send up any queued data and then send the mark message */ @@ -15514,7 +15313,7 @@ ack_check: flags |= tcp_rcv_sod_wakeup(tcp, sodp); /* sod_wakeup() does the mutex_exit() */ } else if (tcp->tcp_rcv_list != NULL) { - flags |= tcp_rcv_drain(tcp->tcp_rq, tcp); + flags |= tcp_rcv_drain(tcp); ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); @@ -15568,6 +15367,14 @@ ack_check: ASSERT(tcp->tcp_listener == NULL); + if (IPCL_IS_NONSTR(connp)) { + ASSERT(tcp->tcp_ordrel_mp == NULL); + tcp->tcp_ordrel_done = B_TRUE; + (*connp->conn_upcalls->su_opctl) + (connp->conn_upper_handle, SOCK_OPCTL_SHUT_RECV, 0); + goto done; + } + SOD_PTR_ENTER(tcp, sodp); if (sodp != NULL) { if (sodp->sod_uioa.uioa_state & UIOA_ENABLED) { @@ -15588,7 +15395,7 @@ ack_check: /* * Push any mblk(s) enqueued from co processing. */ - flags |= tcp_rcv_drain(tcp->tcp_rq, tcp); + flags |= tcp_rcv_drain(tcp); ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); @@ -15934,7 +15741,7 @@ tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp) * thus we clear out all addresses and ports. 
*/ static void -tcp_bind_failed(tcp_t *tcp, mblk_t *mp, int error) +tcp_tpi_bind_failed(tcp_t *tcp, mblk_t *mp, int error) { queue_t *q = tcp->tcp_rq; tcph_t *tcph; @@ -15980,7 +15787,7 @@ tcp_bind_failed(tcp_t *tcp, mblk_t *mp, int error) tea->ERROR_prim = T_CONN_REQ; break; default: - panic("tcp_bind_failed: unexpected TPI type"); + panic("tcp_tpi_bind_failed: unexpected TPI type"); /*NOTREACHED*/ } @@ -16015,17 +15822,9 @@ tcp_bind_failed(tcp_t *tcp, mblk_t *mp, int error) void tcp_rput_other(tcp_t *tcp, mblk_t *mp) { - mblk_t *mp1; uchar_t *rptr = mp->b_rptr; queue_t *q = tcp->tcp_rq; struct T_error_ack *tea; - uint32_t mss; - mblk_t *syn_mp; - mblk_t *mdti; - mblk_t *lsoi; - int retval; - mblk_t *ire_mp; - tcp_stack_t *tcps = tcp->tcp_tcps; switch (mp->b_datap->db_type) { case M_PROTO: @@ -16037,190 +15836,11 @@ tcp_rput_other(tcp_t *tcp, mblk_t *mp) switch (tea->PRIM_type) { case T_BIND_ACK: /* - * Adapt Multidata information, if any. The - * following tcp_mdt_update routine will free - * the message. - */ - if ((mdti = tcp_mdt_info_mp(mp)) != NULL) { - tcp_mdt_update(tcp, &((ip_mdt_info_t *)mdti-> - b_rptr)->mdt_capab, B_TRUE); - freemsg(mdti); - } - - /* - * Check to update LSO information with tcp, and - * tcp_lso_update routine will free the message. - */ - if ((lsoi = tcp_lso_info_mp(mp)) != NULL) { - tcp_lso_update(tcp, &((ip_lso_info_t *)lsoi-> - b_rptr)->lso_capab); - freemsg(lsoi); - } - - /* Get the IRE, if we had requested for it */ - ire_mp = tcp_ire_mp(mp); - - if (tcp->tcp_hard_binding) { - tcp->tcp_hard_binding = B_FALSE; - tcp->tcp_hard_bound = B_TRUE; - CL_INET_CONNECT(tcp); - } else { - if (ire_mp != NULL) - freeb(ire_mp); - goto after_syn_sent; - } - - retval = tcp_adapt_ire(tcp, ire_mp); - if (ire_mp != NULL) - freeb(ire_mp); - if (retval == 0) { - tcp_bind_failed(tcp, mp, - (int)((tcp->tcp_state >= TCPS_SYN_SENT) ? - ENETUNREACH : EADDRNOTAVAIL)); - return; - } - /* - * Don't let an endpoint connect to itself. 
- * Also checked in tcp_connect() but that - * check can't handle the case when the - * local IP address is INADDR_ANY. - */ - if (tcp->tcp_ipversion == IPV4_VERSION) { - if ((tcp->tcp_ipha->ipha_dst == - tcp->tcp_ipha->ipha_src) && - (BE16_EQL(tcp->tcp_tcph->th_lport, - tcp->tcp_tcph->th_fport))) { - tcp_bind_failed(tcp, mp, EADDRNOTAVAIL); - return; - } - } else { - if (IN6_ARE_ADDR_EQUAL( - &tcp->tcp_ip6h->ip6_dst, - &tcp->tcp_ip6h->ip6_src) && - (BE16_EQL(tcp->tcp_tcph->th_lport, - tcp->tcp_tcph->th_fport))) { - tcp_bind_failed(tcp, mp, EADDRNOTAVAIL); - return; - } - } - ASSERT(tcp->tcp_state == TCPS_SYN_SENT); - /* - * This should not be possible! Just for - * defensive coding... - */ - if (tcp->tcp_state != TCPS_SYN_SENT) - goto after_syn_sent; - - if (is_system_labeled() && - !tcp_update_label(tcp, CONN_CRED(tcp->tcp_connp))) { - tcp_bind_failed(tcp, mp, EHOSTUNREACH); - return; - } - - ASSERT(q == tcp->tcp_rq); - /* - * tcp_adapt_ire() does not adjust - * for TCP/IP header length. - */ - mss = tcp->tcp_mss - tcp->tcp_hdr_len; - - /* - * Just make sure our rwnd is at - * least tcp_recv_hiwat_mss * MSS - * large, and round up to the nearest - * MSS. - * - * We do the round up here because - * we need to get the interface - * MTU first before we can do the - * round up. - */ - tcp->tcp_rwnd = MAX(MSS_ROUNDUP(tcp->tcp_rwnd, mss), - tcps->tcps_recv_hiwat_minmss * mss); - q->q_hiwat = tcp->tcp_rwnd; - tcp_set_ws_value(tcp); - U32_TO_ABE16((tcp->tcp_rwnd >> tcp->tcp_rcv_ws), - tcp->tcp_tcph->th_win); - if (tcp->tcp_rcv_ws > 0 || tcps->tcps_wscale_always) - tcp->tcp_snd_ws_ok = B_TRUE; - - /* - * Set tcp_snd_ts_ok to true - * so that tcp_xmit_mp will - * include the timestamp - * option in the SYN segment. - */ - if (tcps->tcps_tstamp_always || - (tcp->tcp_rcv_ws && tcps->tcps_tstamp_if_wscale)) { - tcp->tcp_snd_ts_ok = B_TRUE; - } - - /* - * tcp_snd_sack_ok can be set in - * tcp_adapt_ire() if the sack metric - * is set. So check it here also. 
- */ - if (tcps->tcps_sack_permitted == 2 || - tcp->tcp_snd_sack_ok) { - if (tcp->tcp_sack_info == NULL) { - tcp->tcp_sack_info = - kmem_cache_alloc( - tcp_sack_info_cache, - KM_SLEEP); - } - tcp->tcp_snd_sack_ok = B_TRUE; - } - - /* - * Should we use ECN? Note that the current - * default value (SunOS 5.9) of tcp_ecn_permitted - * is 1. The reason for doing this is that there - * are equipments out there that will drop ECN - * enabled IP packets. Setting it to 1 avoids - * compatibility problems. - */ - if (tcps->tcps_ecn_permitted == 2) - tcp->tcp_ecn_ok = B_TRUE; - - TCP_TIMER_RESTART(tcp, tcp->tcp_rto); - syn_mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, - tcp->tcp_iss, B_FALSE, NULL, B_FALSE); - if (syn_mp) { - cred_t *cr; - pid_t pid; - - /* - * Obtain the credential from the - * thread calling connect(); the credential - * lives on in the second mblk which - * originated from T_CONN_REQ and is echoed - * with the T_BIND_ACK from ip. If none - * can be found, default to the creator - * of the socket. - */ - if (mp->b_cont == NULL || - (cr = DB_CRED(mp->b_cont)) == NULL) { - cr = tcp->tcp_cred; - pid = tcp->tcp_cpid; - } else { - pid = DB_CPID(mp->b_cont); - } - mblk_setcred(syn_mp, cr); - DB_CPID(syn_mp) = pid; - tcp_send_data(tcp, tcp->tcp_wq, syn_mp); - } - after_syn_sent: - /* - * A trailer mblk indicates a waiting client upstream. - * We complete here the processing begun in - * either tcp_bind() or tcp_connect() by passing - * upstream the reply message they supplied. + * AF_INET socket should not be here. 
*/ - mp1 = mp; - mp = mp->b_cont; - freeb(mp1); - if (mp) - break; + ASSERT(tcp->tcp_family != AF_INET && + tcp->tcp_family != AF_INET6); + (void) tcp_post_ip_bind(tcp, mp->b_cont, 0); return; case T_ERROR_ACK: if (tcp->tcp_debug) { @@ -16233,25 +15853,11 @@ tcp_rput_other(tcp_t *tcp, mblk_t *mp) switch (tea->ERROR_prim) { case O_T_BIND_REQ: case T_BIND_REQ: - tcp_bind_failed(tcp, mp, + ASSERT(tcp->tcp_family != AF_INET); + tcp_tpi_bind_failed(tcp, mp, (int)((tcp->tcp_state >= TCPS_SYN_SENT) ? ENETUNREACH : EADDRNOTAVAIL)); return; - case T_UNBIND_REQ: - tcp->tcp_hard_binding = B_FALSE; - tcp->tcp_hard_bound = B_FALSE; - if (mp->b_cont) { - freemsg(mp->b_cont); - mp->b_cont = NULL; - } - if (tcp->tcp_unbind_pending) - tcp->tcp_unbind_pending = 0; - else { - /* From tcp_ip_unbind() - free */ - freemsg(mp); - return; - } - break; case T_SVR4_OPTMGMT_REQ: if (tcp->tcp_drop_opt_ack_cnt > 0) { /* T_OPTMGMT_REQ generated by TCP */ @@ -16285,6 +15891,7 @@ tcp_rput_other(tcp_t *tcp, mblk_t *mp) } break; default: + ASSERT(tea->ERROR_prim != T_UNBIND_REQ); break; } break; @@ -16302,6 +15909,7 @@ tcp_rput_other(tcp_t *tcp, mblk_t *mp) * bind. Otherwise accept could possibly run and free * this tcp struct. */ + ASSERT(q != NULL); putnext(q, mp); } @@ -16345,7 +15953,7 @@ tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2) */ TCP_FUSE_SYNCSTR_PLUG_DRAIN(tcp); if (tcp->tcp_rcv_list != NULL) - (void) tcp_rcv_drain(tcp->tcp_rq, tcp); + (void) tcp_rcv_drain(tcp); if (peer_tcp > tcp) { mutex_enter(&peer_tcp->tcp_non_sq_lock); @@ -16487,8 +16095,20 @@ tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd) * purposes in tcp_fuse_output(). 
*/ sth_hiwat = tcp_fuse_set_rcv_hiwat(tcp, rwnd); - if (!tcp_detached) - (void) mi_set_sth_hiwat(tcp->tcp_rq, sth_hiwat); + if (!tcp_detached) { + (void) proto_set_rx_hiwat(tcp->tcp_rq, tcp->tcp_connp, + sth_hiwat); + if (IPCL_IS_NONSTR(tcp->tcp_connp)) { + conn_t *connp = tcp->tcp_connp; + struct sock_proto_props sopp; + + sopp.sopp_flags = SOCKOPT_RCVTHRESH; + sopp.sopp_rcvthresh = sth_hiwat >> 3; + + (*connp->conn_upcalls->su_set_proto_props) + (connp->conn_upper_handle, &sopp); + } + } /* * In the fusion case, the maxpsz stream head value of @@ -16500,10 +16120,11 @@ tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd) return (rwnd); } - if (tcp_detached) + if (tcp_detached) { old_max_rwnd = tcp->tcp_rwnd; - else - old_max_rwnd = tcp->tcp_rq->q_hiwat; + } else { + old_max_rwnd = tcp->tcp_recv_hiwater; + } /* * Insist on a receive window that is at least @@ -16570,17 +16191,20 @@ tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd) if (tcp_detached) return (rwnd); /* - * We set the maximum receive window into rq->q_hiwat. + * We set the maximum receive window into rq->q_hiwat if it is + * a STREAMS socket. * This is not actually used for flow control. */ - tcp->tcp_rq->q_hiwat = rwnd; + if (!IPCL_IS_NONSTR(tcp->tcp_connp)) + tcp->tcp_rq->q_hiwat = rwnd; + tcp->tcp_recv_hiwater = rwnd; /* - * Set the Stream head high water mark. This doesn't have to be + * Set the STREAM head high water mark. This doesn't have to be * here, since we are simply using default values, but we would * prefer to choose these values algorithmically, with a likely * relationship to rwnd. 
*/ - (void) mi_set_sth_hiwat(tcp->tcp_rq, + (void) proto_set_rx_hiwat(tcp->tcp_rq, tcp->tcp_connp, MAX(rwnd, tcps->tcps_sth_rcv_hiwat)); return (rwnd); } @@ -16939,7 +16563,7 @@ tcp_snmp_state(tcp_t *tcp) static char tcp_report_header[] = "TCP " MI_COL_HDRPAD_STR - "zone dest snxt suna " + "zone dest snxt suna " "swnd rnxt rack rwnd rto mss w sw rw t " "recent [lport,fport] state"; @@ -17127,7 +16751,7 @@ static int tcp_bind_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) { tf_t *tbf; - tcp_t *tcp; + tcp_t *tcp, *ltcp; int i; zoneid_t zoneid; tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; @@ -17153,15 +16777,18 @@ tcp_bind_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) for (i = 0; i < TCP_BIND_FANOUT_SIZE; i++) { tbf = &tcps->tcps_bind_fanout[i]; mutex_enter(&tbf->tf_lock); - for (tcp = tbf->tf_tcp; tcp != NULL; - tcp = tcp->tcp_bind_hash) { - if (zoneid != GLOBAL_ZONEID && - zoneid != tcp->tcp_connp->conn_zoneid) - continue; - CONN_INC_REF(tcp->tcp_connp); - tcp_report_item(mp->b_cont, tcp, i, - Q_TO_TCP(q), cr); - CONN_DEC_REF(tcp->tcp_connp); + for (ltcp = tbf->tf_tcp; ltcp != NULL; + ltcp = ltcp->tcp_bind_hash) { + for (tcp = ltcp; tcp != NULL; + tcp = tcp->tcp_bind_hash_port) { + if (zoneid != GLOBAL_ZONEID && + zoneid != tcp->tcp_connp->conn_zoneid) + continue; + CONN_INC_REF(tcp->tcp_connp); + tcp_report_item(mp->b_cont, tcp, i, + Q_TO_TCP(q), cr); + CONN_DEC_REF(tcp->tcp_connp); + } } mutex_exit(&tbf->tf_lock); } @@ -17201,7 +16828,7 @@ tcp_listen_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) (void) mi_mpprintf(mp, " TCP " MI_COL_HDRPAD_STR - "zone IP addr port seqnum backlog (q0/q/max)"); + "zone IP addr port seqnum backlog (q0/q/max)"); ipst = tcps->tcps_netstack->netstack_ip; @@ -17717,19 +17344,18 @@ tcp_timer(void *arg) } -/* tcp_unbind is called by tcp_wput_proto to handle T_UNBIND_REQ messages. 
*/ -static void -tcp_unbind(tcp_t *tcp, mblk_t *mp) +static int +tcp_do_unbind(conn_t *connp) { - conn_t *connp; + tcp_t *tcp = connp->conn_tcp; + int error = 0; switch (tcp->tcp_state) { case TCPS_BOUND: case TCPS_LISTEN: break; default: - tcp_err_ack(tcp, mp, TOUTSTATE, 0); - return; + return (-TOUTSTATE); } /* @@ -17752,14 +17378,32 @@ tcp_unbind(tcp_t *tcp, mblk_t *mp) tcp_bind_hash_remove(tcp); tcp->tcp_state = TCPS_IDLE; tcp->tcp_mdt = B_FALSE; - /* Send M_FLUSH according to TPI */ - (void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW); + connp = tcp->tcp_connp; connp->conn_mdt_ok = B_FALSE; ipcl_hash_remove(connp); bzero(&connp->conn_ports, sizeof (connp->conn_ports)); - mp = mi_tpi_ok_ack_alloc(mp); - putnext(tcp->tcp_rq, mp); + + return (error); +} + +/* tcp_unbind is called by tcp_wput_proto to handle T_UNBIND_REQ messages. */ +static void +tcp_tpi_unbind(tcp_t *tcp, mblk_t *mp) +{ + int error = tcp_do_unbind(tcp->tcp_connp); + + if (error > 0) { + tcp_err_ack(tcp, mp, TSYSERR, error); + } else if (error < 0) { + tcp_err_ack(tcp, mp, -error, 0); + } else { + /* Send M_FLUSH according to TPI */ + (void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW); + + mp = mi_tpi_ok_ack_alloc(mp); + putnext(tcp->tcp_rq, mp); + } } /* @@ -18025,9 +17669,9 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) /* find out how much we can send */ /* BEGIN CSTYLED */ /* - * un-acked usable + * un-acked usable * |--------------|-----------------| - * tcp_suna tcp_snxt tcp_suna+tcp_swnd + * tcp_suna tcp_snxt tcp_suna+tcp_swnd */ /* END CSTYLED */ @@ -18229,10 +17873,6 @@ slow: tcp_wput_data(tcp, NULL, B_FALSE); } -/* - * The function called through squeue to get behind eager's perimeter to - * finish the accept processing. 
- */ /* ARGSUSED */ void tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) @@ -18240,17 +17880,33 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) conn_t *connp = (conn_t *)arg; tcp_t *tcp = connp->conn_tcp; queue_t *q = tcp->tcp_rq; - mblk_t *mp1; - mblk_t *stropt_mp = mp; - struct stroptions *stropt; - uint_t thwin; - tcp_stack_t *tcps = tcp->tcp_tcps; + struct tcp_options *tcpopt; + tcp_stack_t *tcps = tcp->tcp_tcps; + + /* socket options */ + uint_t sopp_flags; + ssize_t sopp_rxhiwat; + ssize_t sopp_maxblk; + ushort_t sopp_wroff; + ushort_t sopp_tail; + ushort_t sopp_copyopt; + + tcpopt = (struct tcp_options *)mp->b_rptr; /* * Drop the eager's ref on the listener, that was placed when * this eager began life in tcp_conn_request. */ CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp); + if (IPCL_IS_NONSTR(connp)) { + /* Safe to free conn_ind message */ + freemsg(tcp->tcp_conn.tcp_eager_conn_ind); + tcp->tcp_conn.tcp_eager_conn_ind = NULL; + + /* The listener tells us which upper handle to use */ + ASSERT(tcpopt->to_flags & TCPOPT_UPPERHANDLE); + connp->conn_upper_handle = tcpopt->to_handle; + } tcp->tcp_detached = B_FALSE; @@ -18267,37 +17923,47 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) */ ASSERT(tcp->tcp_listener == NULL); if (tcp->tcp_issocket || tcp->tcp_send_discon_ind) { - struct T_discon_ind *tdi; - - (void) putnextctl1(q, M_FLUSH, FLUSHRW); - /* - * Let us reuse the incoming mblk to avoid memory - * allocation failure problems. We know that the - * size of the incoming mblk i.e. stroptions is greater - * than sizeof T_discon_ind. So the reallocb below - * can't fail. 
- */ - freemsg(mp->b_cont); - mp->b_cont = NULL; - ASSERT(DB_REF(mp) == 1); - mp = reallocb(mp, sizeof (struct T_discon_ind), - B_FALSE); - ASSERT(mp != NULL); - DB_TYPE(mp) = M_PROTO; - ((union T_primitives *)mp->b_rptr)->type = T_DISCON_IND; - tdi = (struct T_discon_ind *)mp->b_rptr; - if (tcp->tcp_issocket) { - tdi->DISCON_reason = ECONNREFUSED; - tdi->SEQ_number = 0; + if (IPCL_IS_NONSTR(connp)) { + ASSERT(tcp->tcp_issocket); + (*connp->conn_upcalls->su_disconnected)( + connp->conn_upper_handle, tcp->tcp_connid, + ECONNREFUSED); + freemsg(mp); } else { - tdi->DISCON_reason = ENOPROTOOPT; - tdi->SEQ_number = - tcp->tcp_conn_req_seqnum; + struct T_discon_ind *tdi; + + (void) putnextctl1(q, M_FLUSH, FLUSHRW); + /* + * Let us reuse the incoming mblk to avoid + * memory allocation failure problems. We know + * that the size of the incoming mblk i.e. + * stroptions is greater than sizeof + * T_discon_ind. So the reallocb below can't + * fail. + */ + freemsg(mp->b_cont); + mp->b_cont = NULL; + ASSERT(DB_REF(mp) == 1); + mp = reallocb(mp, sizeof (struct T_discon_ind), + B_FALSE); + ASSERT(mp != NULL); + DB_TYPE(mp) = M_PROTO; + ((union T_primitives *)mp->b_rptr)->type = + T_DISCON_IND; + tdi = (struct T_discon_ind *)mp->b_rptr; + if (tcp->tcp_issocket) { + tdi->DISCON_reason = ECONNREFUSED; + tdi->SEQ_number = 0; + } else { + tdi->DISCON_reason = ENOPROTOOPT; + tdi->SEQ_number = + tcp->tcp_conn_req_seqnum; + } + mp->b_wptr = mp->b_rptr + + sizeof (struct T_discon_ind); + putnext(q, mp); + return; } - mp->b_wptr = mp->b_rptr + sizeof (struct T_discon_ind); - putnext(q, mp); - } else { - freemsg(mp); } if (tcp->tcp_hard_binding) { tcp->tcp_hard_binding = B_FALSE; @@ -18306,19 +17972,21 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) return; } - mp1 = stropt_mp->b_cont; - stropt_mp->b_cont = NULL; - ASSERT(DB_TYPE(stropt_mp) == M_SETOPTS); - stropt = (struct stroptions *)stropt_mp->b_rptr; + if (tcpopt->to_flags & TCPOPT_BOUNDIF) { + int boundif = 
tcpopt->to_boundif; + uint_t len = sizeof (int); - while (mp1 != NULL) { - mp = mp1; - mp1 = mp1->b_cont; - mp->b_cont = NULL; - tcp->tcp_drop_opt_ack_cnt++; - CALL_IP_WPUT(connp, tcp->tcp_wq, mp); + (void) tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, IPPROTO_IPV6, + IPV6_BOUND_IF, len, (uchar_t *)&boundif, &len, + (uchar_t *)&boundif, NULL, tcp->tcp_cred); + } + if (tcpopt->to_flags & TCPOPT_RECVPKTINFO) { + uint_t on = 1; + uint_t len = sizeof (uint_t); + (void) tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, IPPROTO_IPV6, + IPV6_RECVPKTINFO, len, (uchar_t *)&on, &len, + (uchar_t *)&on, NULL, tcp->tcp_cred); } - mp = NULL; /* * For a loopback connection with tcp_direct_sockfs on, note that @@ -18331,42 +17999,50 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) * Set the max window size (tcp_rq->q_hiwat) of the acceptor * properly. This is the first time we know of the acceptor' * queue. So we do it here. + * + * XXX */ if (tcp->tcp_rcv_list == NULL) { /* * Recv queue is empty, tcp_rwnd should not have changed. * That means it should be equal to the listener's tcp_rwnd. */ - tcp->tcp_rq->q_hiwat = tcp->tcp_rwnd; + if (!IPCL_IS_NONSTR(connp)) + tcp->tcp_rq->q_hiwat = tcp->tcp_rwnd; + tcp->tcp_recv_hiwater = tcp->tcp_rwnd; } else { #ifdef DEBUG - uint_t cnt = 0; + mblk_t *tmp; + mblk_t *mp1; + uint_t cnt = 0; mp1 = tcp->tcp_rcv_list; - while ((mp = mp1) != NULL) { - mp1 = mp->b_next; - cnt += msgdsize(mp); + while ((tmp = mp1) != NULL) { + mp1 = tmp->b_next; + cnt += msgdsize(tmp); } ASSERT(cnt != 0 && tcp->tcp_rcv_cnt == cnt); #endif /* There is some data, add them back to get the max. */ - tcp->tcp_rq->q_hiwat = tcp->tcp_rwnd + tcp->tcp_rcv_cnt; + if (!IPCL_IS_NONSTR(connp)) + tcp->tcp_rq->q_hiwat = tcp->tcp_rwnd + tcp->tcp_rcv_cnt; + tcp->tcp_recv_hiwater = tcp->tcp_rwnd + tcp->tcp_rcv_cnt; } /* * This is the first time we run on the correct * queue after tcp_accept. So fix all the q parameters * here. 
*/ - stropt->so_flags = SO_HIWAT | SO_MAXBLK | SO_WROFF; - stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE); + sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_MAXBLK | SOCKOPT_WROFF; + sopp_maxblk = tcp_maxpsz_set(tcp, B_FALSE); /* * Record the stream head's high water mark for this endpoint; * this is used for flow-control purposes. */ - stropt->so_hiwat = tcp->tcp_fused ? - tcp_fuse_set_rcv_hiwat(tcp, q->q_hiwat) : - MAX(q->q_hiwat, tcps->tcps_sth_rcv_hiwat); + sopp_rxhiwat = tcp->tcp_fused ? + tcp_fuse_set_rcv_hiwat(tcp, tcp->tcp_recv_hiwater) : + MAX(tcp->tcp_recv_hiwater, tcps->tcps_sth_rcv_hiwat); /* * Determine what write offset value to use depending on SACK and @@ -18382,17 +18058,17 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) * since it would reduce the amount of work done by kmem. * Non-fused tcp loopback case is handled separately below. */ - stropt->so_wroff = 0; + sopp_wroff = 0; /* * Update the peer's transmit parameters according to * our recently calculated high water mark value. */ (void) tcp_maxpsz_set(tcp->tcp_loopback_peer, B_TRUE); } else if (tcp->tcp_snd_sack_ok) { - stropt->so_wroff = tcp->tcp_hdr_len + TCPOPT_MAX_SACK_LEN + + sopp_wroff = tcp->tcp_hdr_len + TCPOPT_MAX_SACK_LEN + (tcp->tcp_loopback ? 0 : tcps->tcps_wroff_xtra); } else { - stropt->so_wroff = tcp->tcp_hdr_len + (tcp->tcp_loopback ? 0 : + sopp_wroff = tcp->tcp_hdr_len + (tcp->tcp_loopback ? 0 : tcps->tcps_wroff_xtra); } @@ -18408,20 +18084,62 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) * costs. 
*/ if (tcp->tcp_kssl_ctx != NULL) { - stropt->so_wroff += SSL3_WROFFSET; + sopp_wroff += SSL3_WROFFSET; - stropt->so_flags |= SO_TAIL; - stropt->so_tail = SSL3_MAX_TAIL_LEN; + sopp_flags |= SOCKOPT_TAIL; + sopp_tail = SSL3_MAX_TAIL_LEN; - stropt->so_flags |= SO_COPYOPT; - stropt->so_copyopt = ZCVMUNSAFE; + sopp_flags |= SOCKOPT_ZCOPY; + sopp_copyopt = ZCVMUNSAFE; - stropt->so_maxblk = SSL3_MAX_RECORD_LEN; + sopp_maxblk = SSL3_MAX_RECORD_LEN; } /* Send the options up */ - putnext(q, stropt_mp); + if (IPCL_IS_NONSTR(connp)) { + struct sock_proto_props sopp; + + sopp.sopp_flags = sopp_flags; + sopp.sopp_wroff = sopp_wroff; + sopp.sopp_maxblk = sopp_maxblk; + sopp.sopp_rxhiwat = sopp_rxhiwat; + if (sopp_flags & SOCKOPT_TAIL) { + ASSERT(tcp->tcp_kssl_ctx != NULL); + ASSERT(sopp_flags & SOCKOPT_ZCOPY); + sopp.sopp_tail = sopp_tail; + sopp.sopp_zcopyflag = sopp_copyopt; + } + (*connp->conn_upcalls->su_set_proto_props) + (connp->conn_upper_handle, &sopp); + } else { + struct stroptions *stropt; + mblk_t *stropt_mp = allocb(sizeof (struct stroptions), BPRI_HI); + if (stropt_mp == NULL) { + tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); + return; + } + DB_TYPE(stropt_mp) = M_SETOPTS; + stropt = (struct stroptions *)stropt_mp->b_rptr; + stropt_mp->b_wptr += sizeof (struct stroptions); + stropt = (struct stroptions *)stropt_mp->b_rptr; + stropt->so_flags |= SO_HIWAT | SO_WROFF | SO_MAXBLK; + stropt->so_hiwat = sopp_rxhiwat; + stropt->so_wroff = sopp_wroff; + stropt->so_maxblk = sopp_maxblk; + + if (sopp_flags & SOCKOPT_TAIL) { + ASSERT(tcp->tcp_kssl_ctx != NULL); + + stropt->so_flags |= SO_TAIL | SO_COPYOPT; + stropt->so_tail = sopp_tail; + stropt->so_copyopt = sopp_copyopt; + } + + /* Send the options up */ + putnext(q, stropt_mp); + } + freemsg(mp); /* * Pass up any data and/or a fin that has been received. * @@ -18432,43 +18150,77 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) * code, the rwnd may never open up again! 
*/ if (tcp->tcp_rcv_list != NULL) { - /* We drain directly in case of fused tcp loopback */ - sodirect_t *sodp; - - if (!tcp->tcp_fused && canputnext(q)) { - tcp->tcp_rwnd = q->q_hiwat; - thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win)) - << tcp->tcp_rcv_ws; - thwin -= tcp->tcp_rnxt - tcp->tcp_rack; - if (tcp->tcp_state >= TCPS_ESTABLISHED && - (q->q_hiwat - thwin >= tcp->tcp_mss)) { - tcp_xmit_ctl(NULL, - tcp, (tcp->tcp_swnd == 0) ? - tcp->tcp_suna : tcp->tcp_snxt, - tcp->tcp_rnxt, TH_ACK); - BUMP_MIB(&tcps->tcps_mib, tcpOutWinUpdate); + if (IPCL_IS_NONSTR(connp)) { + mblk_t *mp; + int space_left; + int error; + boolean_t push = B_TRUE; + + if (!tcp->tcp_fused && (*connp->conn_upcalls->su_recv) + (connp->conn_upper_handle, NULL, 0, 0, &error, + &push) >= 0) { + tcp->tcp_rwnd = tcp->tcp_recv_hiwater; + if (tcp->tcp_state >= TCPS_ESTABLISHED && + tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) { + tcp_xmit_ctl(NULL, + tcp, (tcp->tcp_swnd == 0) ? + tcp->tcp_suna : tcp->tcp_snxt, + tcp->tcp_rnxt, TH_ACK); + } } - - } - - SOD_PTR_ENTER(tcp, sodp); - if (sodp != NULL) { - /* Sodirect, move from rcv_list */ - ASSERT(!tcp->tcp_fused); while ((mp = tcp->tcp_rcv_list) != NULL) { + push = B_TRUE; tcp->tcp_rcv_list = mp->b_next; mp->b_next = NULL; - (void) tcp_rcv_sod_enqueue(tcp, sodp, mp, - msgdsize(mp)); + space_left = (*connp->conn_upcalls->su_recv) + (connp->conn_upper_handle, mp, msgdsize(mp), + 0, &error, &push); + if (space_left < 0) { + /* + * At this point the eager is not + * visible to anyone, so fallback + * can not happen. 
+ */ + ASSERT(error != EOPNOTSUPP); + } } tcp->tcp_rcv_last_head = NULL; tcp->tcp_rcv_last_tail = NULL; tcp->tcp_rcv_cnt = 0; - (void) tcp_rcv_sod_wakeup(tcp, sodp); - /* sod_wakeup() did the mutex_exit() */ } else { - /* Not sodirect, drain */ - (void) tcp_rcv_drain(q, tcp); + /* We drain directly in case of fused tcp loopback */ + sodirect_t *sodp; + + if (!tcp->tcp_fused && canputnext(q)) { + tcp->tcp_rwnd = q->q_hiwat; + if (tcp->tcp_state >= TCPS_ESTABLISHED && + tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) { + tcp_xmit_ctl(NULL, + tcp, (tcp->tcp_swnd == 0) ? + tcp->tcp_suna : tcp->tcp_snxt, + tcp->tcp_rnxt, TH_ACK); + } + } + + SOD_PTR_ENTER(tcp, sodp); + if (sodp != NULL) { + /* Sodirect, move from rcv_list */ + ASSERT(!tcp->tcp_fused); + while ((mp = tcp->tcp_rcv_list) != NULL) { + tcp->tcp_rcv_list = mp->b_next; + mp->b_next = NULL; + (void) tcp_rcv_sod_enqueue(tcp, sodp, + mp, msgdsize(mp)); + } + tcp->tcp_rcv_last_head = NULL; + tcp->tcp_rcv_last_tail = NULL; + tcp->tcp_rcv_cnt = 0; + (void) tcp_rcv_sod_wakeup(tcp, sodp); + /* sod_wakeup() did the mutex_exit() */ + } else { + /* Not sodirect, drain */ + (void) tcp_rcv_drain(tcp); + } } /* @@ -18502,18 +18254,27 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) } ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) { - mp = tcp->tcp_ordrel_mp; - tcp->tcp_ordrel_mp = NULL; tcp->tcp_ordrel_done = B_TRUE; - putnext(q, mp); + if (IPCL_IS_NONSTR(connp)) { + ASSERT(tcp->tcp_ordrel_mp == NULL); + (*connp->conn_upcalls->su_opctl)( + connp->conn_upper_handle, + SOCK_OPCTL_SHUT_RECV, 0); + } else { + mp = tcp->tcp_ordrel_mp; + tcp->tcp_ordrel_mp = NULL; + putnext(q, mp); + } } if (tcp->tcp_hard_binding) { tcp->tcp_hard_binding = B_FALSE; tcp->tcp_hard_bound = B_TRUE; } - /* We can enable synchronous streams now */ - if (tcp->tcp_fused) { + /* We can enable synchronous streams for STREAMS tcp endpoint now */ + if (tcp->tcp_fused && !IPCL_IS_NONSTR(connp) && + 
tcp->tcp_loopback_peer != NULL && + !IPCL_IS_NONSTR(tcp->tcp_loopback_peer->tcp_connp)) { tcp_fuse_syncstr_enable_pair(tcp); } @@ -18547,6 +18308,8 @@ tcp_send_pending(void *arg, mblk_t *mp, void *arg2) { conn_t *connp = (conn_t *)arg; tcp_t *listener = connp->conn_tcp; + struct T_conn_ind *conn_ind; + tcp_t *tcp; if (listener->tcp_state == TCPS_CLOSED || TCP_IS_DETACHED(listener)) { @@ -18554,8 +18317,6 @@ tcp_send_pending(void *arg, mblk_t *mp, void *arg2) * If listener has closed, it would have caused a * a cleanup/blowoff to happen for the eager. */ - tcp_t *tcp; - struct T_conn_ind *conn_ind; conn_ind = (struct T_conn_ind *)mp->b_rptr; bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp, @@ -18571,7 +18332,218 @@ tcp_send_pending(void *arg, mblk_t *mp, void *arg2) freemsg(mp); return; } - putnext(listener->tcp_rq, mp); + if (IPCL_IS_NONSTR(connp)) { + conn_ind = (struct T_conn_ind *)mp->b_rptr; + bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp, + conn_ind->OPT_length); + + if ((*connp->conn_upcalls->su_newconn) + (connp->conn_upper_handle, + (sock_lower_handle_t)tcp->tcp_connp, + &sock_tcp_downcalls, DB_CRED(mp), DB_CPID(mp), + &tcp->tcp_connp->conn_upcalls) != NULL) { + /* Keep the message around in case of fallback */ + tcp->tcp_conn.tcp_eager_conn_ind = mp; + } else { + freemsg(mp); + } + } else { + putnext(listener->tcp_rq, mp); + } +} + +/* ARGSUSED */ +static int +tcp_accept_common(conn_t *lconnp, conn_t *econnp, + sock_upper_handle_t sock_handle, cred_t *cr) +{ + tcp_t *listener, *eager; + mblk_t *opt_mp; + struct tcp_options *tcpopt; + + listener = lconnp->conn_tcp; + ASSERT(listener->tcp_state == TCPS_LISTEN); + eager = econnp->conn_tcp; + ASSERT(eager->tcp_listener != NULL); + + ASSERT(eager->tcp_rq != NULL); + + /* If tcp_fused and sodirect enabled disable it */ + if (eager->tcp_fused && eager->tcp_sodirect != NULL) { + /* Fused, disable sodirect */ + mutex_enter(eager->tcp_sodirect->sod_lockp); + SOD_DISABLE(eager->tcp_sodirect); + 
mutex_exit(eager->tcp_sodirect->sod_lockp); + eager->tcp_sodirect = NULL; + } + + opt_mp = allocb(sizeof (struct tcp_options), BPRI_HI); + if (opt_mp == NULL) { + return (-TPROTO); + } + bzero((char *)opt_mp->b_rptr, sizeof (struct tcp_options)); + eager->tcp_issocket = B_TRUE; + + econnp->conn_upcalls = lconnp->conn_upcalls; + econnp->conn_zoneid = listener->tcp_connp->conn_zoneid; + econnp->conn_allzones = listener->tcp_connp->conn_allzones; + ASSERT(econnp->conn_netstack == + listener->tcp_connp->conn_netstack); + ASSERT(eager->tcp_tcps == listener->tcp_tcps); + + /* Put the ref for IP */ + CONN_INC_REF(econnp); + + /* + * We should have minimum of 3 references on the conn + * at this point. One each for TCP and IP and one for + * the T_conn_ind that was sent up when the 3-way handshake + * completed. In the normal case we would also have another + * reference (making a total of 4) for the conn being in the + * classifier hash list. However the eager could have received + * an RST subsequently and tcp_closei_local could have removed + * the eager from the classifier hash list, hence we can't + * assert that reference. + */ + ASSERT(econnp->conn_ref >= 3); + + opt_mp->b_datap->db_type = M_SETOPTS; + opt_mp->b_wptr += sizeof (struct tcp_options); + + /* + * Prepare for inheriting IPV6_BOUND_IF and IPV6_RECVPKTINFO + * from listener to acceptor. In case of non-STREAMS sockets, + * we also need to pass the upper handle along. 
+ */ + tcpopt = (struct tcp_options *)opt_mp->b_rptr; + tcpopt->to_flags = 0; + + if (IPCL_IS_NONSTR(econnp)) { + ASSERT(sock_handle != NULL); + tcpopt->to_flags |= TCPOPT_UPPERHANDLE; + tcpopt->to_handle = sock_handle; + } + if (listener->tcp_bound_if != 0) { + tcpopt->to_flags |= TCPOPT_BOUNDIF; + tcpopt->to_boundif = listener->tcp_bound_if; + } + if (listener->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) { + tcpopt->to_flags |= TCPOPT_RECVPKTINFO; + } + + mutex_enter(&listener->tcp_eager_lock); + if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) { + + tcp_t *tail; + tcp_t *tcp; + mblk_t *mp1; + + tcp = listener->tcp_eager_prev_q0; + /* + * listener->tcp_eager_prev_q0 points to the TAIL of the + * deferred T_conn_ind queue. We need to get to the head + * of the queue in order to send up T_conn_ind the same + * order as how the 3WHS is completed. + */ + while (tcp != listener) { + if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0 && + !tcp->tcp_kssl_pending) + break; + else + tcp = tcp->tcp_eager_prev_q0; + } + /* None of the pending eagers can be sent up now */ + if (tcp == listener) + goto no_more_eagers; + + mp1 = tcp->tcp_conn.tcp_eager_conn_ind; + tcp->tcp_conn.tcp_eager_conn_ind = NULL; + /* Move from q0 to q */ + ASSERT(listener->tcp_conn_req_cnt_q0 > 0); + listener->tcp_conn_req_cnt_q0--; + listener->tcp_conn_req_cnt_q++; + tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = + tcp->tcp_eager_prev_q0; + tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = + tcp->tcp_eager_next_q0; + tcp->tcp_eager_prev_q0 = NULL; + tcp->tcp_eager_next_q0 = NULL; + tcp->tcp_conn_def_q0 = B_FALSE; + + /* Make sure the tcp isn't in the list of droppables */ + ASSERT(tcp->tcp_eager_next_drop_q0 == NULL && + tcp->tcp_eager_prev_drop_q0 == NULL); + + /* + * Insert at end of the queue because sockfs sends + * down T_CONN_RES in chronological order. Leaving + * the older conn indications at front of the queue + * helps reducing search time. 
+ */ + tail = listener->tcp_eager_last_q; + if (tail != NULL) { + tail->tcp_eager_next_q = tcp; + } else { + listener->tcp_eager_next_q = tcp; + } + listener->tcp_eager_last_q = tcp; + tcp->tcp_eager_next_q = NULL; + + /* Need to get inside the listener perimeter */ + CONN_INC_REF(listener->tcp_connp); + SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, mp1, + tcp_send_pending, listener->tcp_connp, SQ_FILL, + SQTAG_TCP_SEND_PENDING); + } +no_more_eagers: + tcp_eager_unlink(eager); + mutex_exit(&listener->tcp_eager_lock); + + /* + * At this point, the eager is detached from the listener + * but we still have an extra refs on eager (apart from the + * usual tcp references). The ref was placed in tcp_rput_data + * before sending the conn_ind in tcp_send_conn_ind. + * The ref will be dropped in tcp_accept_finish(). + */ + SQUEUE_ENTER_ONE(econnp->conn_sqp, opt_mp, tcp_accept_finish, + econnp, SQ_NODRAIN, SQTAG_TCP_ACCEPT_FINISH_Q0); + return (0); +} + +int +tcp_accept(sock_lower_handle_t lproto_handle, + sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle, + cred_t *cr) +{ + conn_t *lconnp, *econnp; + tcp_t *listener, *eager; + tcp_stack_t *tcps; + + lconnp = (conn_t *)lproto_handle; + listener = lconnp->conn_tcp; + ASSERT(listener->tcp_state == TCPS_LISTEN); + econnp = (conn_t *)eproto_handle; + eager = econnp->conn_tcp; + ASSERT(eager->tcp_listener != NULL); + tcps = eager->tcp_tcps; + + ASSERT(IPCL_IS_NONSTR(econnp)); + /* + * Create helper stream if it is a non-TPI TCP connection. 
+ */ + if (ip_create_helper_stream(econnp, tcps->tcps_ldi_ident)) { + ip1dbg(("tcp_accept: create of IP helper stream" + " failed\n")); + return (EPROTO); + } + eager->tcp_rq = econnp->conn_rq; + eager->tcp_wq = econnp->conn_wq; + + ASSERT(eager->tcp_rq != NULL); + + eager->tcp_sodirect = SOD_SOTOSODP(sock_handle); + return (tcp_accept_common(lconnp, econnp, sock_handle, cr)); } @@ -18581,7 +18553,7 @@ tcp_send_pending(void *arg, mblk_t *mp, void *arg2) * Read the block comment on top of tcp_conn_request(). */ void -tcp_wput_accept(queue_t *q, mblk_t *mp) +tcp_tpi_accept(queue_t *q, mblk_t *mp) { queue_t *rq = RD(q); struct T_conn_res *conn_res; @@ -18589,7 +18561,6 @@ tcp_wput_accept(queue_t *q, mblk_t *mp) tcp_t *listener; struct T_ok_ack *ok; t_scalar_t PRIM_type; - mblk_t *opt_mp; conn_t *econnp; ASSERT(DB_TYPE(mp) == M_PROTO); @@ -18615,14 +18586,6 @@ tcp_wput_accept(queue_t *q, mblk_t *mp) * correct function (tcpclose_accept) in case allocb * fails. */ - opt_mp = allocb(sizeof (struct stroptions), BPRI_HI); - if (opt_mp == NULL) { - mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0); - if (mp != NULL) - putnext(rq, mp); - return; - } - bcopy(mp->b_rptr + conn_res->OPT_offset, &eager, conn_res->OPT_length); PRIM_type = conn_res->PRIM_type; @@ -18641,45 +18604,20 @@ tcp_wput_accept(queue_t *q, mblk_t *mp) q->q_ptr = econnp; q->q_qinfo = &tcp_winit; listener = eager->tcp_listener; - eager->tcp_issocket = B_TRUE; /* * TCP is _D_SODIRECT and sockfs is directly above so * save shared sodirect_t pointer (if any). - * - * If tcp_fused and sodirect enabled disable it. 
*/ eager->tcp_sodirect = SOD_QTOSODP(eager->tcp_rq); - if (eager->tcp_fused && eager->tcp_sodirect != NULL) { - /* Fused, disable sodirect */ - mutex_enter(eager->tcp_sodirect->sod_lockp); - SOD_DISABLE(eager->tcp_sodirect); - mutex_exit(eager->tcp_sodirect->sod_lockp); - eager->tcp_sodirect = NULL; + if (tcp_accept_common(listener->tcp_connp, + econnp, NULL, CRED()) < 0) { + mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0); + if (mp != NULL) + putnext(rq, mp); + return; } - econnp->conn_zoneid = listener->tcp_connp->conn_zoneid; - econnp->conn_allzones = listener->tcp_connp->conn_allzones; - ASSERT(econnp->conn_netstack == - listener->tcp_connp->conn_netstack); - ASSERT(eager->tcp_tcps == listener->tcp_tcps); - - /* Put the ref for IP */ - CONN_INC_REF(econnp); - - /* - * We should have minimum of 3 references on the conn - * at this point. One each for TCP and IP and one for - * the T_conn_ind that was sent up when the 3-way handshake - * completed. In the normal case we would also have another - * reference (making a total of 4) for the conn being in the - * classifier hash list. However the eager could have received - * an RST subsequently and tcp_closei_local could have removed - * the eager from the classifier hash list, hence we can't - * assert that reference. - */ - ASSERT(econnp->conn_ref >= 3); - /* * Send the new local address also up to sockfs. There * should already be enough space in the mp that came @@ -18721,115 +18659,6 @@ tcp_wput_accept(queue_t *q, mblk_t *mp) } putnext(rq, mp); - - opt_mp->b_datap->db_type = M_SETOPTS; - opt_mp->b_wptr += sizeof (struct stroptions); - - /* - * Prepare for inheriting IPV6_BOUND_IF and IPV6_RECVPKTINFO - * from listener to acceptor. The message is chained on the - * bind_mp which tcp_rput_other will send down to IP. 
- */ - if (listener->tcp_bound_if != 0) { - /* allocate optmgmt req */ - mp = tcp_setsockopt_mp(IPPROTO_IPV6, - IPV6_BOUND_IF, (char *)&listener->tcp_bound_if, - sizeof (int)); - if (mp != NULL) - linkb(opt_mp, mp); - } - if (listener->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) { - uint_t on = 1; - - /* allocate optmgmt req */ - mp = tcp_setsockopt_mp(IPPROTO_IPV6, - IPV6_RECVPKTINFO, (char *)&on, sizeof (on)); - if (mp != NULL) - linkb(opt_mp, mp); - } - - - mutex_enter(&listener->tcp_eager_lock); - - if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) { - - tcp_t *tail; - tcp_t *tcp; - mblk_t *mp1; - - tcp = listener->tcp_eager_prev_q0; - /* - * listener->tcp_eager_prev_q0 points to the TAIL of the - * deferred T_conn_ind queue. We need to get to the head - * of the queue in order to send up T_conn_ind the same - * order as how the 3WHS is completed. - */ - while (tcp != listener) { - if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0 && - !tcp->tcp_kssl_pending) - break; - else - tcp = tcp->tcp_eager_prev_q0; - } - /* None of the pending eagers can be sent up now */ - if (tcp == listener) - goto no_more_eagers; - - mp1 = tcp->tcp_conn.tcp_eager_conn_ind; - tcp->tcp_conn.tcp_eager_conn_ind = NULL; - /* Move from q0 to q */ - ASSERT(listener->tcp_conn_req_cnt_q0 > 0); - listener->tcp_conn_req_cnt_q0--; - listener->tcp_conn_req_cnt_q++; - tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = - tcp->tcp_eager_prev_q0; - tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = - tcp->tcp_eager_next_q0; - tcp->tcp_eager_prev_q0 = NULL; - tcp->tcp_eager_next_q0 = NULL; - tcp->tcp_conn_def_q0 = B_FALSE; - - /* Make sure the tcp isn't in the list of droppables */ - ASSERT(tcp->tcp_eager_next_drop_q0 == NULL && - tcp->tcp_eager_prev_drop_q0 == NULL); - - /* - * Insert at end of the queue because sockfs sends - * down T_CONN_RES in chronological order. Leaving - * the older conn indications at front of the queue - * helps reducing search time. 
- */ - tail = listener->tcp_eager_last_q; - if (tail != NULL) { - tail->tcp_eager_next_q = tcp; - } else { - listener->tcp_eager_next_q = tcp; - } - listener->tcp_eager_last_q = tcp; - tcp->tcp_eager_next_q = NULL; - - /* Need to get inside the listener perimeter */ - CONN_INC_REF(listener->tcp_connp); - SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, mp1, - tcp_send_pending, listener->tcp_connp, - SQ_FILL, SQTAG_TCP_SEND_PENDING); - } -no_more_eagers: - tcp_eager_unlink(eager); - mutex_exit(&listener->tcp_eager_lock); - - /* - * At this point, the eager is detached from the listener - * but we still have an extra refs on eager (apart from the - * usual tcp references). The ref was placed in tcp_rput_data - * before sending the conn_ind in tcp_send_conn_ind. - * The ref will be dropped in tcp_accept_finish(). As sockfs - * has already established this tcp with it's own stream, - * it's OK to set tcp_detached to B_FALSE. - */ - econnp->conn_tcp->tcp_detached = B_FALSE; - SQUEUE_ENTER_ONE(econnp->conn_sqp, opt_mp, tcp_accept_finish, - econnp, SQ_NODRAIN, SQTAG_TCP_ACCEPT_FINISH_Q0); return; default: mp = mi_tpi_err_ack_alloc(mp, TNOTSUPPORT, 0); @@ -18878,7 +18707,7 @@ tcp_getmyname(tcp_t *tcp, struct sockaddr *sa, uint_t *salenp) } static int -tcp_getpeername(tcp_t *tcp, struct sockaddr *sa, uint_t *salenp) +i_tcp_getpeername(tcp_t *tcp, struct sockaddr *sa, uint_t *salenp) { sin_t *sin = (sin_t *)sa; sin6_t *sin6 = (sin6_t *)sa; @@ -18898,6 +18727,7 @@ tcp_getpeername(tcp_t *tcp, struct sockaddr *sa, uint_t *salenp) sin->sin_port = tcp->tcp_fport; IN6_V4MAPPED_TO_IPADDR(&tcp->tcp_remote_v6, sin->sin_addr.s_addr); + *salenp = sizeof (sin_t); break; case AF_INET6: @@ -18912,6 +18742,7 @@ tcp_getpeername(tcp_t *tcp, struct sockaddr *sa, uint_t *salenp) sin6->sin6_flowinfo = tcp->tcp_ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; } + *salenp = sizeof (sin6_t); break; } @@ -18939,7 +18770,7 @@ tcp_wput_cmdblk(queue_t *q, mblk_t *mp) switch (cmdp->cb_cmd) { case 
TI_GETPEERNAME: - cmdp->cb_error = tcp_getpeername(tcp, data, &cmdp->cb_len); + cmdp->cb_error = i_tcp_getpeername(tcp, data, &cmdp->cb_len); break; case TI_GETMYNAME: cmdp->cb_error = tcp_getmyname(tcp, data, &cmdp->cb_len); @@ -18961,6 +18792,7 @@ tcp_wput(queue_t *q, mblk_t *mp) t_scalar_t type; uchar_t *rptr; struct iocblk *iocp; + size_t size; tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; ASSERT(connp->conn_ref >= 2); @@ -18970,13 +18802,18 @@ tcp_wput(queue_t *q, mblk_t *mp) tcp = connp->conn_tcp; ASSERT(tcp != NULL); + size = msgdsize(mp); + mutex_enter(&tcp->tcp_non_sq_lock); - tcp->tcp_squeue_bytes += msgdsize(mp); + tcp->tcp_squeue_bytes += size; if (TCP_UNSENT_BYTES(tcp) > tcp->tcp_xmit_hiwater) { tcp_setqfull(tcp); } mutex_exit(&tcp->tcp_non_sq_lock); + if (DB_CRED(mp) == NULL && is_system_labeled()) + msg_setcredpid(mp, CONN_CRED(connp), curproc->p_pid); + CONN_INC_REF(connp); SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, connp, tcp_squeue_flag, SQTAG_TCP_OUTPUT); @@ -19108,6 +18945,16 @@ tcp_wput_sock(queue_t *wq, mblk_t *mp) tcp_wput(wq, mp); } +/* ARGSUSED */ +static void +tcp_wput_fallback(queue_t *wq, mblk_t *mp) +{ +#ifdef DEBUG + cmn_err(CE_CONT, "tcp_wput_fallback: Message during fallback \n"); +#endif + freemsg(mp); +} + static boolean_t tcp_zcopy_check(tcp_t *tcp) { @@ -19150,10 +18997,12 @@ tcp_zcopy_check(tcp_t *tcp) tcp->tcp_snd_zcopy_on = zc_enabled; if (!TCP_IS_DETACHED(tcp)) { if (zc_enabled) { - (void) mi_set_sth_copyopt(tcp->tcp_rq, ZCVMSAFE); + (void) proto_set_tx_copyopt(tcp->tcp_rq, connp, + ZCVMSAFE); TCP_STAT(tcps, tcp_zcopy_on); } else { - (void) mi_set_sth_copyopt(tcp->tcp_rq, ZCVMUNSAFE); + (void) proto_set_tx_copyopt(tcp->tcp_rq, connp, + ZCVMUNSAFE); TCP_STAT(tcps, tcp_zcopy_off); } } @@ -19170,7 +19019,8 @@ tcp_zcopy_disable(tcp_t *tcp, mblk_t *bp) else if (tcp->tcp_snd_zcopy_on) { tcp->tcp_snd_zcopy_on = B_FALSE; if (!TCP_IS_DETACHED(tcp)) { - (void) mi_set_sth_copyopt(tcp->tcp_rq, ZCVMUNSAFE); + (void) 
proto_set_tx_copyopt(tcp->tcp_rq, tcp->tcp_connp, + ZCVMUNSAFE); TCP_STAT(tcps, tcp_zcopy_disable); } } @@ -19259,9 +19109,16 @@ static void tcp_zcopy_notify(tcp_t *tcp) { struct stdata *stp; + conn_t *connp; if (tcp->tcp_detached) return; + connp = tcp->tcp_connp; + if (IPCL_IS_NONSTR(connp)) { + (*connp->conn_upcalls->su_zcopy_notify) + (connp->conn_upper_handle); + return; + } stp = STREAM(tcp->tcp_rq); mutex_enter(&stp->sd_lock); stp->sd_flag |= STZCNOTIFY; @@ -19423,13 +19280,14 @@ tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp) ASSERT(DB_TYPE(mp) == M_DATA); - if (DB_CRED(mp) == NULL) - mblk_setcred(mp, CONN_CRED(connp)); + if (is_system_labeled() && DB_CRED(mp) == NULL) + mblk_setcred(mp, CONN_CRED(tcp->tcp_connp)); ipha = (ipha_t *)mp->b_rptr; src = ipha->ipha_src; dst = ipha->ipha_dst; + ASSERT(q != NULL); DTRACE_PROBE2(tcp__trace__send, mblk_t *, mp, tcp_t *, tcp); /* @@ -22430,7 +22288,7 @@ tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp) error = tcp_getmyname(tcp, (void *)mp1->b_rptr, &addrlen); break; case TI_GETPEERNAME: - error = tcp_getpeername(tcp, (void *)mp1->b_rptr, &addrlen); + error = i_tcp_getpeername(tcp, (void *)mp1->b_rptr, &addrlen); break; } @@ -22445,6 +22303,35 @@ tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp) } } +static void +tcp_disable_direct_sockfs(tcp_t *tcp) +{ +#ifdef _ILP32 + tcp->tcp_acceptor_id = (t_uscalar_t)tcp->tcp_rq; +#else + tcp->tcp_acceptor_id = tcp->tcp_connp->conn_dev; +#endif + /* + * Insert this socket into the acceptor hash. + * We might need it for T_CONN_RES message + */ + tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp); + + if (tcp->tcp_fused) { + /* + * This is a fused loopback tcp; disable + * read-side synchronous streams interface + * and drain any queued data. It is okay + * to do this for non-synchronous streams + * fused tcp as well. 
+ */ + tcp_fuse_disable_pair(tcp, B_FALSE); + } + tcp->tcp_issocket = B_FALSE; + tcp->tcp_sodirect = NULL; + TCP_STAT(tcp->tcp_tcps, tcp_sock_fallback); +} + /* * tcp_wput_ioctl is called by tcp_wput_nondata() to handle all M_IOCTL * messages. @@ -22457,7 +22344,6 @@ tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2) tcp_t *tcp = connp->conn_tcp; queue_t *q = tcp->tcp_wq; struct iocblk *iocp; - tcp_stack_t *tcps = tcp->tcp_tcps; ASSERT(DB_TYPE(mp) == M_IOCTL); /* @@ -22498,31 +22384,7 @@ tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2) DB_TYPE(mp) = M_IOCNAK; iocp->ioc_error = EINVAL; } else { -#ifdef _ILP32 - tcp->tcp_acceptor_id = (t_uscalar_t)RD(q); -#else - tcp->tcp_acceptor_id = tcp->tcp_connp->conn_dev; -#endif - /* - * Insert this socket into the acceptor hash. - * We might need it for T_CONN_RES message - */ - tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp); - - if (tcp->tcp_fused) { - /* - * This is a fused loopback tcp; disable - * read-side synchronous streams interface - * and drain any queued data. It is okay - * to do this for non-synchronous streams - * fused tcp as well. - */ - tcp_fuse_disable_pair(tcp, B_FALSE); - } - tcp->tcp_issocket = B_FALSE; - tcp->tcp_sodirect = NULL; - TCP_STAT(tcps, tcp_sock_fallback); - + tcp_disable_direct_sockfs(tcp); DB_TYPE(mp) = M_IOCACK; iocp->ioc_error = 0; } @@ -22546,7 +22408,6 @@ tcp_wput_proto(void *arg, mblk_t *mp, void *arg2) union T_primitives *tprim = (union T_primitives *)mp->b_rptr; uchar_t *rptr; t_scalar_t type; - int len; cred_t *cr = DB_CREDDEF(mp, tcp->tcp_cred); /* @@ -22566,34 +22427,16 @@ tcp_wput_proto(void *arg, mblk_t *mp, void *arg2) if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) { type = ((union T_primitives *)rptr)->type; if (type == T_EXDATA_REQ) { - uint32_t msize = msgdsize(mp->b_cont); - - len = msize - 1; - if (len < 0) { - freemsg(mp); - return; - } - /* - * Try to force urgent data out on the wire. - * Even if we have unsent data this will - * at least send the urgent flag. 
- * XXX does not handle more flag correctly. - */ - len += tcp->tcp_unsent; - len += tcp->tcp_snxt; - tcp->tcp_urg = len; - tcp->tcp_valid_bits |= TCP_URG_VALID; - - /* Bypass tcp protocol for fused tcp loopback */ - if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize)) - return; + tcp_output_urgent(connp, mp->b_cont, arg2); + freeb(mp); } else if (type != T_DATA_REQ) { goto non_urgent_data; + } else { + /* TODO: options, flags, ... from user */ + /* Set length to zero for reclamation below */ + tcp_wput_data(tcp, mp->b_cont, B_TRUE); + freeb(mp); } - /* TODO: options, flags, ... from user */ - /* Set length to zero for reclamation below */ - tcp_wput_data(tcp, mp->b_cont, B_TRUE); - freeb(mp); return; } else { if (tcp->tcp_debug) { @@ -22631,17 +22474,17 @@ non_urgent_data: /* FALLTHROUGH */ case O_T_BIND_REQ: /* bind request */ case T_BIND_REQ: /* new semantics bind request */ - tcp_bind(tcp, mp); + tcp_tpi_bind(tcp, mp); break; case T_UNBIND_REQ: /* unbind request */ - tcp_unbind(tcp, mp); + tcp_tpi_unbind(tcp, mp); break; case O_T_CONN_RES: /* old connection response XXX */ case T_CONN_RES: /* connection response */ - tcp_accept(tcp, mp); + tcp_tli_accept(tcp, mp); break; case T_CONN_REQ: /* connection request */ - tcp_connect(tcp, mp); + tcp_tpi_connect(tcp, mp); break; case T_DISCON_REQ: /* disconnect request */ tcp_disconnect(tcp, mp); @@ -23278,6 +23121,7 @@ tcp_xmit_end(tcp_t *tcp) ipic->ipic_rtt_sd = tcp->tcp_rtt_sd; CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp); + return (0); } @@ -23798,14 +23642,15 @@ tcp_push_timer(void *arg) { conn_t *connp = (conn_t *)arg; tcp_t *tcp = connp->conn_tcp; - tcp_stack_t *tcps = tcp->tcp_tcps; uint_t flags; sodirect_t *sodp; - TCP_DBGSTAT(tcps, tcp_push_timer_cnt); + TCP_DBGSTAT(tcp->tcp_tcps, tcp_push_timer_cnt); ASSERT(tcp->tcp_listener == NULL); + ASSERT(!IPCL_IS_NONSTR(connp)); + /* * We need to plug synchronous streams during our drain to prevent * a race with tcp_fuse_rrw() or tcp_fusion_rinfop(). 
@@ -23818,7 +23663,7 @@ tcp_push_timer(void *arg) flags = tcp_rcv_sod_wakeup(tcp, sodp); /* sod_wakeup() does the mutex_exit() */ } else if (tcp->tcp_rcv_list != NULL) { - flags = tcp_rcv_drain(tcp->tcp_rq, tcp); + flags = tcp_rcv_drain(tcp); } if (flags == TH_ACK_NEEDED) tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); @@ -24030,15 +23875,19 @@ tcp_ack_mp(tcp_t *tcp) } /* - * Hash list insertion routine for tcp_t structures. - * Inserts entries with the ones bound to a specific IP address first - * followed by those bound to INADDR_ANY. + * Hash list insertion routine for tcp_t structures. Each hash bucket + * contains a list of tcp_t entries, and each entry is bound to a unique + * port. If there are multiple tcp_t's that are bound to the same port, then + * one of them will be linked into the hash bucket list, and the rest will + * hang off of that one entry. For each port, entries bound to a specific IP + * address will be inserted before those those bound to INADDR_ANY. */ static void tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock) { tcp_t **tcpp; tcp_t *tcpnext; + tcp_t *tcphash; if (tcp->tcp_ptpbhn != NULL) { ASSERT(!caller_holds_lock); @@ -24050,9 +23899,22 @@ tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock) } else { ASSERT(MUTEX_HELD(&tbf->tf_lock)); } - tcpnext = tcpp[0]; - if (tcpnext) { + tcphash = tcpp[0]; + tcpnext = NULL; + if (tcphash != NULL) { + /* Look for an entry using the same port */ + while ((tcphash = tcpp[0]) != NULL && + tcp->tcp_lport != tcphash->tcp_lport) + tcpp = &(tcphash->tcp_bind_hash); + + /* The port was not found, just add to the end */ + if (tcphash == NULL) + goto insert; + /* + * OK, there already exists an entry bound to the + * same port. 
+ * * If the new tcp bound to the INADDR_ANY address * and the first one in the list is not bound to * INADDR_ANY we skip all entries until we find the @@ -24061,17 +23923,36 @@ tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock) * specific address get preference over those binding to * INADDR_ANY. */ + tcpnext = tcphash; + tcphash = NULL; if (V6_OR_V4_INADDR_ANY(tcp->tcp_bound_source_v6) && !V6_OR_V4_INADDR_ANY(tcpnext->tcp_bound_source_v6)) { while ((tcpnext = tcpp[0]) != NULL && !V6_OR_V4_INADDR_ANY(tcpnext->tcp_bound_source_v6)) - tcpp = &(tcpnext->tcp_bind_hash); - if (tcpnext) - tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash; - } else - tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash; + tcpp = &(tcpnext->tcp_bind_hash_port); + + if (tcpnext) { + tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port; + tcphash = tcpnext->tcp_bind_hash; + if (tcphash != NULL) { + tcphash->tcp_ptpbhn = + &(tcp->tcp_bind_hash); + tcpnext->tcp_bind_hash = NULL; + } + } + } else { + tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port; + tcphash = tcpnext->tcp_bind_hash; + if (tcphash != NULL) { + tcphash->tcp_ptpbhn = + &(tcp->tcp_bind_hash); + tcpnext->tcp_bind_hash = NULL; + } + } } - tcp->tcp_bind_hash = tcpnext; +insert: + tcp->tcp_bind_hash_port = tcpnext; + tcp->tcp_bind_hash = tcphash; tcp->tcp_ptpbhn = tcpp; tcpp[0] = tcp; if (!caller_holds_lock) @@ -24101,8 +23982,17 @@ tcp_bind_hash_remove(tcp_t *tcp) ASSERT(lockp != NULL); mutex_enter(lockp); if (tcp->tcp_ptpbhn) { - tcpnext = tcp->tcp_bind_hash; - if (tcpnext) { + tcpnext = tcp->tcp_bind_hash_port; + if (tcpnext != NULL) { + tcp->tcp_bind_hash_port = NULL; + tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn; + tcpnext->tcp_bind_hash = tcp->tcp_bind_hash; + if (tcpnext->tcp_bind_hash != NULL) { + tcpnext->tcp_bind_hash->tcp_ptpbhn = + &(tcpnext->tcp_bind_hash); + tcp->tcp_bind_hash = NULL; + } + } else if ((tcpnext = tcp->tcp_bind_hash) != NULL) { tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn; tcp->tcp_bind_hash = NULL; } @@ -24507,36 +24397,6 @@ 
tcp_random(void) return (i); } -/* - * XXX This will go away when TPI is extended to send - * info reqs to sockfs/timod ..... - * Given a queue, set the max packet size for the write - * side of the queue below stream head. This value is - * cached on the stream head. - * Returns 1 on success, 0 otherwise. - */ -static int -setmaxps(queue_t *q, int maxpsz) -{ - struct stdata *stp; - queue_t *wq; - stp = STREAM(q); - - /* - * At this point change of a queue parameter is not allowed - * when a multiplexor is sitting on top. - */ - if (stp->sd_flag & STPLEX) - return (0); - - claimstr(stp->sd_wrq); - wq = stp->sd_wrq->q_next; - ASSERT(wq != NULL); - (void) strqset(wq, QMAXPSZ, 0, maxpsz); - releasestr(stp->sd_wrq); - return (1); -} - static int tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, int *do_disconnectp, int *t_errorp, int *sys_errorp) @@ -24964,6 +24824,8 @@ tcp_ddi_g_init(void) } +#define INET_NAME "ip" + /* * Initialize the TCP stack instance. */ @@ -24973,6 +24835,8 @@ tcp_stack_init(netstackid_t stackid, netstack_t *ns) tcp_stack_t *tcps; tcpparam_t *pa; int i; + int error = 0; + major_t major; tcps = (tcp_stack_t *)kmem_zalloc(sizeof (*tcps), KM_SLEEP); tcps->tcps_netstack = ns; @@ -25038,6 +24902,9 @@ tcp_stack_init(netstackid_t stackid, netstack_t *ns) tcps->tcps_kstat = tcp_kstat2_init(stackid, &tcps->tcps_statistics); tcps->tcps_mibkp = tcp_kstat_init(stackid, tcps); + major = mod_name_to_major(INET_NAME); + error = ldi_ident_from_major(major, &tcps->tcps_ldi_ident); + ASSERT(error == 0); return (tcps); } @@ -25125,6 +24992,7 @@ tcp_stack_fini(netstackid_t stackid, void *arg) tcp_kstat_fini(stackid, tcps->tcps_mibkp); tcps->tcps_mibkp = NULL; + ldi_ident_release(tcps->tcps_ldi_ident); kmem_free(tcps, sizeof (*tcps)); } @@ -25922,44 +25790,6 @@ done: } /* - * Allocate a T_SVR4_OPTMGMT_REQ. - * The caller needs to increment tcp_drop_opt_ack_cnt when sending these so - * that tcp_rput_other can drop the acks. 
- */ -static mblk_t * -tcp_setsockopt_mp(int level, int cmd, char *opt, int optlen) -{ - mblk_t *mp; - struct T_optmgmt_req *tor; - struct opthdr *oh; - uint_t size; - char *optptr; - - size = sizeof (*tor) + sizeof (*oh) + optlen; - mp = allocb(size, BPRI_MED); - if (mp == NULL) - return (NULL); - - mp->b_wptr += size; - mp->b_datap->db_type = M_PROTO; - tor = (struct T_optmgmt_req *)mp->b_rptr; - tor->PRIM_type = T_SVR4_OPTMGMT_REQ; - tor->MGMT_flags = T_NEGOTIATE; - tor->OPT_length = sizeof (*oh) + optlen; - tor->OPT_offset = (t_scalar_t)sizeof (*tor); - - oh = (struct opthdr *)&tor[1]; - oh->level = level; - oh->name = cmd; - oh->len = optlen; - if (optlen != 0) { - optptr = (char *)&oh[1]; - bcopy(opt, optptr, optlen); - } - return (mp); -} - -/* * TCP Timers Implementation. */ timeout_id_t @@ -25968,16 +25798,15 @@ tcp_timeout(conn_t *connp, void (*f)(void *), clock_t tim) mblk_t *mp; tcp_timer_t *tcpt; tcp_t *tcp = connp->conn_tcp; - tcp_stack_t *tcps = tcp->tcp_tcps; ASSERT(connp->conn_sqp != NULL); - TCP_DBGSTAT(tcps, tcp_timeout_calls); + TCP_DBGSTAT(tcp->tcp_tcps, tcp_timeout_calls); if (tcp->tcp_timercache == NULL) { mp = tcp_timermp_alloc(KM_NOSLEEP | KM_PANIC); } else { - TCP_DBGSTAT(tcps, tcp_timeout_cached_alloc); + TCP_DBGSTAT(tcp->tcp_tcps, tcp_timeout_cached_alloc); mp = tcp->tcp_timercache; tcp->tcp_timercache = mp->b_next; mp->b_next = NULL; @@ -26052,9 +25881,8 @@ tcp_timeout_cancel(conn_t *connp, timeout_id_t id) mblk_t *mp = (mblk_t *)id; tcp_timer_t *tcpt; clock_t delta; - tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps; - TCP_DBGSTAT(tcps, tcp_timeout_cancel_reqs); + TCP_DBGSTAT(connp->conn_tcp->tcp_tcps, tcp_timeout_cancel_reqs); if (mp == NULL) return (-1); @@ -26065,7 +25893,7 @@ tcp_timeout_cancel(conn_t *connp, timeout_id_t id) delta = untimeout_default(tcpt->tcpt_tid, 0); if (delta >= 0) { - TCP_DBGSTAT(tcps, tcp_timeout_canceled); + TCP_DBGSTAT(connp->conn_tcp->tcp_tcps, tcp_timeout_canceled); tcp_timer_free(connp->conn_tcp, mp); 
CONN_DEC_REF(connp); } @@ -26156,7 +25984,6 @@ static void tcp_timer_free(tcp_t *tcp, mblk_t *mp) { mblk_t *mp1 = tcp->tcp_timercache; - tcp_stack_t *tcps = tcp->tcp_tcps; if (mp->b_wptr != NULL) { /* @@ -26174,7 +26001,7 @@ tcp_timer_free(tcp_t *tcp, mblk_t *mp) tcp->tcp_timercache = mp; } else { kmem_cache_free(tcp_timercache, mp); - TCP_DBGSTAT(tcps, tcp_timermp_freed); + TCP_DBGSTAT(tcp->tcp_tcps, tcp_timermp_freed); } } @@ -26188,23 +26015,33 @@ tcp_timer_free(tcp_t *tcp, mblk_t *mp) * decision to call based on the tcp_t.tcp_flow_stopped value which * when check outside the q's lock is only an advisory check ... */ - void tcp_setqfull(tcp_t *tcp) { - queue_t *q = tcp->tcp_wq; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; + + if (tcp->tcp_closed) + return; + + if (IPCL_IS_NONSTR(connp)) { + (*connp->conn_upcalls->su_txq_full) + (tcp->tcp_connp->conn_upper_handle, B_TRUE); + tcp->tcp_flow_stopped = B_TRUE; + } else { + queue_t *q = tcp->tcp_wq; - if (!(q->q_flag & QFULL)) { - mutex_enter(QLOCK(q)); if (!(q->q_flag & QFULL)) { - /* still need to set QFULL */ - q->q_flag |= QFULL; - tcp->tcp_flow_stopped = B_TRUE; - mutex_exit(QLOCK(q)); - TCP_STAT(tcps, tcp_flwctl_on); - } else { - mutex_exit(QLOCK(q)); + mutex_enter(QLOCK(q)); + if (!(q->q_flag & QFULL)) { + /* still need to set QFULL */ + q->q_flag |= QFULL; + tcp->tcp_flow_stopped = B_TRUE; + mutex_exit(QLOCK(q)); + TCP_STAT(tcps, tcp_flwctl_on); + } else { + mutex_exit(QLOCK(q)); + } } } } @@ -26212,23 +26049,33 @@ tcp_setqfull(tcp_t *tcp) void tcp_clrqfull(tcp_t *tcp) { - queue_t *q = tcp->tcp_wq; + conn_t *connp = tcp->tcp_connp; + + if (tcp->tcp_closed) + return; + + if (IPCL_IS_NONSTR(connp)) { + (*connp->conn_upcalls->su_txq_full) + (tcp->tcp_connp->conn_upper_handle, B_FALSE); + tcp->tcp_flow_stopped = B_FALSE; + } else { + queue_t *q = tcp->tcp_wq; - if (q->q_flag & QFULL) { - mutex_enter(QLOCK(q)); if (q->q_flag & QFULL) { - q->q_flag &= ~QFULL; - tcp->tcp_flow_stopped = 
B_FALSE; - mutex_exit(QLOCK(q)); - if (q->q_flag & QWANTW) - qbackenable(q, 0); - } else { - mutex_exit(QLOCK(q)); + mutex_enter(QLOCK(q)); + if (q->q_flag & QFULL) { + q->q_flag &= ~QFULL; + tcp->tcp_flow_stopped = B_FALSE; + mutex_exit(QLOCK(q)); + if (q->q_flag & QWANTW) + qbackenable(q, 0); + } else { + mutex_exit(QLOCK(q)); + } } } } - /* * kstats related to squeues i.e. not per IP instance */ @@ -26681,3 +26528,1626 @@ tcp_squeue_add(squeue_t *sqp) } tcp_time_wait->tcp_free_list_cnt = 0; } + +static int +tcp_post_ip_bind(tcp_t *tcp, mblk_t *mp, int error) +{ + mblk_t *ire_mp = NULL; + mblk_t *syn_mp; + mblk_t *mdti; + mblk_t *lsoi; + int retval; + tcph_t *tcph; + uint32_t mss; + queue_t *q = tcp->tcp_rq; + conn_t *connp = tcp->tcp_connp; + tcp_stack_t *tcps = tcp->tcp_tcps; + + if (error == 0) { + /* + * Adapt Multidata information, if any. The + * following tcp_mdt_update routine will free + * the message. + */ + if (mp != NULL && ((mdti = tcp_mdt_info_mp(mp)) != NULL)) { + tcp_mdt_update(tcp, &((ip_mdt_info_t *)mdti-> + b_rptr)->mdt_capab, B_TRUE); + freemsg(mdti); + } + + /* + * Check to update LSO information with tcp, and + * tcp_lso_update routine will free the message. + */ + if (mp != NULL && ((lsoi = tcp_lso_info_mp(mp)) != NULL)) { + tcp_lso_update(tcp, &((ip_lso_info_t *)lsoi-> + b_rptr)->lso_capab); + freemsg(lsoi); + } + + /* Get the IRE, if we had requested for it */ + if (mp != NULL) + ire_mp = tcp_ire_mp(&mp); + + if (tcp->tcp_hard_binding) { + tcp->tcp_hard_binding = B_FALSE; + tcp->tcp_hard_bound = B_TRUE; + CL_INET_CONNECT(tcp); + } else { + if (ire_mp != NULL) + freeb(ire_mp); + goto after_syn_sent; + } + + retval = tcp_adapt_ire(tcp, ire_mp); + if (ire_mp != NULL) + freeb(ire_mp); + if (retval == 0) { + error = (int)((tcp->tcp_state >= TCPS_SYN_SENT) ? + ENETUNREACH : EADDRNOTAVAIL); + goto ipcl_rm; + } + /* + * Don't let an endpoint connect to itself. 
+ * Also checked in tcp_connect() but that + * check can't handle the case when the + * local IP address is INADDR_ANY. + */ + if (tcp->tcp_ipversion == IPV4_VERSION) { + if ((tcp->tcp_ipha->ipha_dst == + tcp->tcp_ipha->ipha_src) && + (BE16_EQL(tcp->tcp_tcph->th_lport, + tcp->tcp_tcph->th_fport))) { + error = EADDRNOTAVAIL; + goto ipcl_rm; + } + } else { + if (IN6_ARE_ADDR_EQUAL( + &tcp->tcp_ip6h->ip6_dst, + &tcp->tcp_ip6h->ip6_src) && + (BE16_EQL(tcp->tcp_tcph->th_lport, + tcp->tcp_tcph->th_fport))) { + error = EADDRNOTAVAIL; + goto ipcl_rm; + } + } + ASSERT(tcp->tcp_state == TCPS_SYN_SENT); + /* + * This should not be possible! Just for + * defensive coding... + */ + if (tcp->tcp_state != TCPS_SYN_SENT) + goto after_syn_sent; + + if (is_system_labeled() && + !tcp_update_label(tcp, CONN_CRED(tcp->tcp_connp))) { + error = EHOSTUNREACH; + goto ipcl_rm; + } + + /* + * tcp_adapt_ire() does not adjust + * for TCP/IP header length. + */ + mss = tcp->tcp_mss - tcp->tcp_hdr_len; + + /* + * Just make sure our rwnd is at + * least tcp_recv_hiwat_mss * MSS + * large, and round up to the nearest + * MSS. + * + * We do the round up here because + * we need to get the interface + * MTU first before we can do the + * round up. + */ + tcp->tcp_rwnd = MAX(MSS_ROUNDUP(tcp->tcp_rwnd, mss), + tcps->tcps_recv_hiwat_minmss * mss); + if (!IPCL_IS_NONSTR(connp)) + q->q_hiwat = tcp->tcp_rwnd; + tcp->tcp_recv_hiwater = tcp->tcp_rwnd; + tcp_set_ws_value(tcp); + U32_TO_ABE16((tcp->tcp_rwnd >> tcp->tcp_rcv_ws), + tcp->tcp_tcph->th_win); + if (tcp->tcp_rcv_ws > 0 || tcps->tcps_wscale_always) + tcp->tcp_snd_ws_ok = B_TRUE; + + /* + * Set tcp_snd_ts_ok to true + * so that tcp_xmit_mp will + * include the timestamp + * option in the SYN segment. + */ + if (tcps->tcps_tstamp_always || + (tcp->tcp_rcv_ws && tcps->tcps_tstamp_if_wscale)) { + tcp->tcp_snd_ts_ok = B_TRUE; + } + + /* + * tcp_snd_sack_ok can be set in + * tcp_adapt_ire() if the sack metric + * is set. So check it here also. 
+ */ + if (tcps->tcps_sack_permitted == 2 || + tcp->tcp_snd_sack_ok) { + if (tcp->tcp_sack_info == NULL) { + tcp->tcp_sack_info = + kmem_cache_alloc(tcp_sack_info_cache, + KM_SLEEP); + } + tcp->tcp_snd_sack_ok = B_TRUE; + } + + /* + * Should we use ECN? Note that the current + * default value (SunOS 5.9) of tcp_ecn_permitted + * is 1. The reason for doing this is that there + * are equipments out there that will drop ECN + * enabled IP packets. Setting it to 1 avoids + * compatibility problems. + */ + if (tcps->tcps_ecn_permitted == 2) + tcp->tcp_ecn_ok = B_TRUE; + + TCP_TIMER_RESTART(tcp, tcp->tcp_rto); + syn_mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, + tcp->tcp_iss, B_FALSE, NULL, B_FALSE); + if (syn_mp) { + cred_t *cr; + pid_t pid; + + /* + * Obtain the credential from the + * thread calling connect(). + * If none can be found, default to + * the creator of the socket. + */ + if (mp == NULL || + (cr = DB_CRED(mp)) == NULL) { + cr = tcp->tcp_cred; + pid = tcp->tcp_cpid; + } else { + pid = DB_CPID(mp); + } + + mblk_setcred(syn_mp, cr); + DB_CPID(syn_mp) = pid; + tcp_send_data(tcp, tcp->tcp_wq, syn_mp); + } + after_syn_sent: + /* + * A trailer mblk indicates a waiting client upstream. + * We complete here the processing begun in + * either tcp_bind() or tcp_connect() by passing + * upstream the reply message they supplied. + */ + if (mp != NULL) { + ASSERT(mp->b_cont == NULL); + freeb(mp); + } + return (error); + } else { + /* error */ + if (tcp->tcp_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, + "tcp_post_ip_bind: error == %d", error); + } + if (mp != NULL) { + freeb(mp); + } + } + +ipcl_rm: + /* + * Need to unbind with classifier since we were just + * told that our bind succeeded. a.k.a error == 0 at the entry. 
+ */ + tcp->tcp_hard_bound = B_FALSE; + tcp->tcp_hard_binding = B_FALSE; + + ipcl_hash_remove(connp); + +bind_failed: + tcp->tcp_state = TCPS_IDLE; + if (tcp->tcp_ipversion == IPV4_VERSION) + tcp->tcp_ipha->ipha_src = 0; + else + V6_SET_ZERO(tcp->tcp_ip6h->ip6_src); + /* + * Copy of the src addr. in tcp_t is needed since + * the lookup funcs. can only look at tcp_t + */ + V6_SET_ZERO(tcp->tcp_ip_src_v6); + + tcph = tcp->tcp_tcph; + tcph->th_lport[0] = 0; + tcph->th_lport[1] = 0; + tcp_bind_hash_remove(tcp); + bzero(&connp->u_port, sizeof (connp->u_port)); + /* blow away saved option results if any */ + if (tcp->tcp_conn.tcp_opts_conn_req != NULL) + tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); + + conn_delete_ire(tcp->tcp_connp, NULL); + + return (error); +} + +static int +tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, + boolean_t bind_to_req_port_only, cred_t *cr) +{ + in_port_t mlp_port; + mlp_type_t addrtype, mlptype; + boolean_t user_specified; + in_port_t allocated_port; + in_port_t requested_port = *requested_port_ptr; + conn_t *connp; + zone_t *zone; + tcp_stack_t *tcps = tcp->tcp_tcps; + in6_addr_t v6addr = tcp->tcp_ip_src_v6; + + /* + * XXX It's up to the caller to specify bind_to_req_port_only or not. + */ + if (cr == NULL) + cr = tcp->tcp_cred; + /* + * Get a valid port (within the anonymous range and should not + * be a privileged one) to use if the user has not given a port. + * If multiple threads are here, they may all start with + * with the same initial port. But, it should be fine as long as + * tcp_bindi will ensure that no two threads will be assigned + * the same port. + * + * NOTE: XXX If a privileged process asks for an anonymous port, we + * still check for ports only in the range > tcp_smallest_non_priv_port, + * unless TCP_ANONPRIVBIND option is set. + */ + mlptype = mlptSingle; + mlp_port = requested_port; + if (requested_port == 0) { + requested_port = tcp->tcp_anon_priv_bind ? 
+ tcp_get_next_priv_port(tcp) : + tcp_update_next_port(tcps->tcps_next_port_to_try, + tcp, B_TRUE); + if (requested_port == 0) { + return (-TNOADDR); + } + user_specified = B_FALSE; + + /* + * If the user went through one of the RPC interfaces to create + * this socket and RPC is MLP in this zone, then give him an + * anonymous MLP. + */ + connp = tcp->tcp_connp; + if (connp->conn_anon_mlp && is_system_labeled()) { + zone = crgetzone(cr); + addrtype = tsol_mlp_addr_type(zone->zone_id, + IPV6_VERSION, &v6addr, + tcps->tcps_netstack->netstack_ip); + if (addrtype == mlptSingle) { + return (-TNOADDR); + } + mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP, + PMAPPORT, addrtype); + mlp_port = PMAPPORT; + } + } else { + int i; + boolean_t priv = B_FALSE; + + /* + * If the requested_port is in the well-known privileged range, + * verify that the stream was opened by a privileged user. + * Note: No locks are held when inspecting tcp_g_*epriv_ports + * but instead the code relies on: + * - the fact that the address of the array and its size never + * changes + * - the atomic assignment of the elements of the array + */ + if (requested_port < tcps->tcps_smallest_nonpriv_port) { + priv = B_TRUE; + } else { + for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) { + if (requested_port == + tcps->tcps_g_epriv_ports[i]) { + priv = B_TRUE; + break; + } + } + } + if (priv) { + if (secpolicy_net_privaddr(cr, requested_port, + IPPROTO_TCP) != 0) { + if (tcp->tcp_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, + SL_ERROR|SL_TRACE, + "tcp_bind: no priv for port %d", + requested_port); + } + return (-TACCES); + } + } + user_specified = B_TRUE; + + connp = tcp->tcp_connp; + if (is_system_labeled()) { + zone = crgetzone(cr); + addrtype = tsol_mlp_addr_type(zone->zone_id, + IPV6_VERSION, &v6addr, + tcps->tcps_netstack->netstack_ip); + if (addrtype == mlptSingle) { + return (-TNOADDR); + } + mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP, + requested_port, addrtype); + } + } + + if (mlptype != 
mlptSingle) { + if (secpolicy_net_bindmlp(cr) != 0) { + if (tcp->tcp_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, + SL_ERROR|SL_TRACE, + "tcp_bind: no priv for multilevel port %d", + requested_port); + } + return (-TACCES); + } + + /* + * If we're specifically binding a shared IP address and the + * port is MLP on shared addresses, then check to see if this + * zone actually owns the MLP. Reject if not. + */ + if (mlptype == mlptShared && addrtype == mlptShared) { + /* + * No need to handle exclusive-stack zones since + * ALL_ZONES only applies to the shared stack. + */ + zoneid_t mlpzone; + + mlpzone = tsol_mlp_findzone(IPPROTO_TCP, + htons(mlp_port)); + if (connp->conn_zoneid != mlpzone) { + if (tcp->tcp_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, + SL_ERROR|SL_TRACE, + "tcp_bind: attempt to bind port " + "%d on shared addr in zone %d " + "(should be %d)", + mlp_port, connp->conn_zoneid, + mlpzone); + } + return (-TACCES); + } + } + + if (!user_specified) { + int err; + err = tsol_mlp_anon(zone, mlptype, connp->conn_ulp, + requested_port, B_TRUE); + if (err != 0) { + if (tcp->tcp_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, + SL_ERROR|SL_TRACE, + "tcp_bind: cannot establish anon " + "MLP for port %d", + requested_port); + } + return (err); + } + connp->conn_anon_port = B_TRUE; + } + connp->conn_mlp_type = mlptype; + } + + allocated_port = tcp_bindi(tcp, requested_port, &v6addr, + tcp->tcp_reuseaddr, B_FALSE, bind_to_req_port_only, user_specified); + + if (allocated_port == 0) { + connp->conn_mlp_type = mlptSingle; + if (connp->conn_anon_port) { + connp->conn_anon_port = B_FALSE; + (void) tsol_mlp_anon(zone, mlptype, connp->conn_ulp, + requested_port, B_FALSE); + } + if (bind_to_req_port_only) { + if (tcp->tcp_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, + SL_ERROR|SL_TRACE, + "tcp_bind: requested addr busy"); + } + return (-TADDRBUSY); + } else { + /* If we are out of ports, fail the bind. 
*/ + if (tcp->tcp_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, + SL_ERROR|SL_TRACE, + "tcp_bind: out of ports?"); + } + return (-TNOADDR); + } + } + + /* Pass the allocated port back */ + *requested_port_ptr = allocated_port; + return (0); +} + +static int +tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, + boolean_t bind_to_req_port_only) +{ + tcp_t *tcp = connp->conn_tcp; + + sin_t *sin; + sin6_t *sin6; + sin6_t sin6addr; + in_port_t requested_port; + ipaddr_t v4addr; + in6_addr_t v6addr; + uint_t origipversion; + int error = 0; + + ASSERT((uintptr_t)len <= (uintptr_t)INT_MAX); + + if (tcp->tcp_state == TCPS_BOUND) { + return (0); + } else if (tcp->tcp_state > TCPS_BOUND) { + if (tcp->tcp_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, + "tcp_bind: bad state, %d", tcp->tcp_state); + } + return (-TOUTSTATE); + } + origipversion = tcp->tcp_ipversion; + + if (sa != NULL && !OK_32PTR((char *)sa)) { + if (tcp->tcp_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, + SL_ERROR|SL_TRACE, + "tcp_bind: bad address parameter, " + "address %p, len %d", + (void *)sa, len); + } + return (-TPROTO); + } + + switch (len) { + case 0: /* request for a generic port */ + if (tcp->tcp_family == AF_INET) { + sin = (sin_t *)&sin6addr; + *sin = sin_null; + sin->sin_family = AF_INET; + tcp->tcp_ipversion = IPV4_VERSION; + IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &v6addr); + } else { + ASSERT(tcp->tcp_family == AF_INET6); + sin6 = (sin6_t *)&sin6addr; + *sin6 = sin6_null; + sin6->sin6_family = AF_INET6; + tcp->tcp_ipversion = IPV6_VERSION; + V6_SET_ZERO(v6addr); + } + requested_port = 0; + break; + + case sizeof (sin_t): /* Complete IPv4 address */ + sin = (sin_t *)sa; + /* + * With sockets sockfs will accept bogus sin_family in + * bind() and replace it with the family used in the socket + * call. 
+ */ + if (sin->sin_family != AF_INET || + tcp->tcp_family != AF_INET) { + return (EAFNOSUPPORT); + } + requested_port = ntohs(sin->sin_port); + tcp->tcp_ipversion = IPV4_VERSION; + v4addr = sin->sin_addr.s_addr; + IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr); + break; + + case sizeof (sin6_t): /* Complete IPv6 address */ + sin6 = (sin6_t *)sa; + if (sin6->sin6_family != AF_INET6 || + tcp->tcp_family != AF_INET6) { + return (EAFNOSUPPORT); + } + requested_port = ntohs(sin6->sin6_port); + tcp->tcp_ipversion = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) ? + IPV4_VERSION : IPV6_VERSION; + v6addr = sin6->sin6_addr; + break; + + default: + if (tcp->tcp_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, + "tcp_bind: bad address length, %d", len); + } + return (EAFNOSUPPORT); + /* return (-TBADADDR); */ + } + + tcp->tcp_bound_source_v6 = v6addr; + + /* Check for change in ipversion */ + if (origipversion != tcp->tcp_ipversion) { + ASSERT(tcp->tcp_family == AF_INET6); + error = tcp->tcp_ipversion == IPV6_VERSION ? + tcp_header_init_ipv6(tcp) : tcp_header_init_ipv4(tcp); + if (error) { + return (ENOMEM); + } + } + + /* + * Initialize family specific fields. Copy of the src addr. + * in tcp_t is needed for the lookup funcs. + */ + if (tcp->tcp_ipversion == IPV6_VERSION) { + tcp->tcp_ip6h->ip6_src = v6addr; + } else { + IN6_V4MAPPED_TO_IPADDR(&v6addr, tcp->tcp_ipha->ipha_src); + } + tcp->tcp_ip_src_v6 = v6addr; + + bind_to_req_port_only = requested_port != 0 && bind_to_req_port_only; + + error = tcp_bind_select_lport(tcp, &requested_port, + bind_to_req_port_only, cr); + + return (error); +} + +/* + * Return unix error is tli error is TSYSERR, otherwise return a negative + * tli error. 
+ */ +int +tcp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, + boolean_t bind_to_req_port_only) +{ + int error; + tcp_t *tcp = connp->conn_tcp; + + if (tcp->tcp_state >= TCPS_BOUND) { + if (tcp->tcp_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, + "tcp_bind: bad state, %d", tcp->tcp_state); + } + return (-TOUTSTATE); + } + + error = tcp_bind_check(connp, sa, len, cr, bind_to_req_port_only); + if (error != 0) + return (error); + + ASSERT(tcp->tcp_state == TCPS_BOUND); + + tcp->tcp_conn_req_max = 0; + + /* + * We need to make sure that the conn_recv is set to a non-null + * value before we insert the conn into the classifier table. + * This is to avoid a race with an incoming packet which does an + * ipcl_classify(). + */ + connp->conn_recv = tcp_conn_request; + + if (tcp->tcp_family == AF_INET6) { + ASSERT(tcp->tcp_connp->conn_af_isv6); + error = ip_proto_bind_laddr_v6(connp, NULL, IPPROTO_TCP, + &tcp->tcp_bound_source_v6, 0, B_FALSE); + } else { + ASSERT(!tcp->tcp_connp->conn_af_isv6); + error = ip_proto_bind_laddr_v4(connp, NULL, IPPROTO_TCP, + tcp->tcp_ipha->ipha_src, 0, B_FALSE); + } + return (tcp_post_ip_bind(tcp, NULL, error)); +} + +int +tcp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa, + socklen_t len, cred_t *cr) +{ + int error; + conn_t *connp = (conn_t *)proto_handle; + squeue_t *sqp = connp->conn_sqp; + + ASSERT(sqp != NULL); + + error = squeue_synch_enter(sqp, connp, 0); + if (error != 0) { + /* failed to enter */ + return (ENOSR); + } + + /* binding to a NULL address really means unbind */ + if (sa == NULL) { + if (connp->conn_tcp->tcp_state < TCPS_LISTEN) + error = tcp_do_unbind(connp); + else + error = EINVAL; + } else { + error = tcp_do_bind(connp, sa, len, cr, B_TRUE); + } + + squeue_synch_exit(sqp, connp); + + if (error < 0) { + if (error == -TOUTSTATE) + error = EINVAL; + else + error = proto_tlitosyserr(-error); + } + + return (error); +} + +/* + * If the return value from this function is 
positive, it's a UNIX error. + * Otherwise, if it's negative, then the absolute value is a TLI error. + * the TPI routine tcp_tpi_connect() is a wrapper function for this. + */ +int +tcp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, + cred_t *cr, pid_t pid) +{ + tcp_t *tcp = connp->conn_tcp; + sin_t *sin = (sin_t *)sa; + sin6_t *sin6 = (sin6_t *)sa; + ipaddr_t *dstaddrp; + in_port_t dstport; + uint_t srcid; + int error = 0; + + switch (len) { + default: + /* + * Should never happen + */ + return (EINVAL); + + case sizeof (sin_t): + sin = (sin_t *)sa; + if (sin->sin_port == 0) { + return (-TBADADDR); + } + if (tcp->tcp_connp && tcp->tcp_connp->conn_ipv6_v6only) { + return (EAFNOSUPPORT); + } + break; + + case sizeof (sin6_t): + sin6 = (sin6_t *)sa; + if (sin6->sin6_port == 0) { + return (-TBADADDR); + } + break; + } + /* + * If we're connecting to an IPv4-mapped IPv6 address, we need to + * make sure that the template IP header in the tcp structure is an + * IPv4 header, and that the tcp_ipversion is IPV4_VERSION. We + * need to this before we call tcp_bindi() so that the port lookup + * code will look for ports in the correct port space (IPv4 and + * IPv6 have separate port spaces). + */ + if (tcp->tcp_family == AF_INET6 && tcp->tcp_ipversion == IPV6_VERSION && + IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { + int err = 0; + + err = tcp_header_init_ipv4(tcp); + if (err != 0) { + error = ENOMEM; + goto connect_failed; + } + if (tcp->tcp_lport != 0) + *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport; + } + + switch (tcp->tcp_state) { + case TCPS_LISTEN: + /* + * Listening sockets are not allowed to issue connect(). + */ + if (IPCL_IS_NONSTR(connp)) + return (EOPNOTSUPP); + /* FALLTHRU */ + case TCPS_IDLE: + /* + * We support quick connect, refer to comments in + * tcp_connect_*() + */ + /* FALLTHRU */ + case TCPS_BOUND: + /* + * We must bump the generation before the operation start. 
+ * This is done to ensure that any upcall made later on sends + * up the right generation to the socket. + */ + SOCK_CONNID_BUMP(tcp->tcp_connid); + + if (tcp->tcp_family == AF_INET6) { + if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { + return (tcp_connect_ipv6(tcp, + &sin6->sin6_addr, + sin6->sin6_port, sin6->sin6_flowinfo, + sin6->__sin6_src_id, sin6->sin6_scope_id, + cr, pid)); + } + /* + * Destination adress is mapped IPv6 address. + * Source bound address should be unspecified or + * IPv6 mapped address as well. + */ + if (!IN6_IS_ADDR_UNSPECIFIED( + &tcp->tcp_bound_source_v6) && + !IN6_IS_ADDR_V4MAPPED(&tcp->tcp_bound_source_v6)) { + return (EADDRNOTAVAIL); + } + dstaddrp = &V4_PART_OF_V6((sin6->sin6_addr)); + dstport = sin6->sin6_port; + srcid = sin6->__sin6_src_id; + } else { + dstaddrp = &sin->sin_addr.s_addr; + dstport = sin->sin_port; + srcid = 0; + } + + error = tcp_connect_ipv4(tcp, dstaddrp, dstport, srcid, cr, + pid); + break; + default: + return (-TOUTSTATE); + } + /* + * Note: Code below is the "failure" case + */ +connect_failed: + if (tcp->tcp_conn.tcp_opts_conn_req != NULL) + tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); + return (error); +} + +int +tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, + socklen_t len, sock_connid_t *id, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + tcp_t *tcp = connp->conn_tcp; + squeue_t *sqp = connp->conn_sqp; + int error; + + error = proto_verify_ip_addr(tcp->tcp_family, sa, len); + if (error != 0) { + return (error); + } + + error = squeue_synch_enter(sqp, connp, 0); + if (error != 0) { + /* failed to enter */ + return (ENOSR); + } + + /* + * TCP supports quick connect, so no need to do an implicit bind + */ + error = tcp_do_connect(connp, sa, len, cr, curproc->p_pid); + if (error == 0) { + *id = connp->conn_tcp->tcp_connid; + } else if (error < 0) { + if (error == -TOUTSTATE) { + switch (connp->conn_tcp->tcp_state) { + case TCPS_SYN_SENT: + error = EALREADY; + break; + 
case TCPS_ESTABLISHED: + error = EISCONN; + break; + case TCPS_LISTEN: + error = EOPNOTSUPP; + break; + default: + error = EINVAL; + break; + } + } else { + error = proto_tlitosyserr(-error); + } + } +done: + squeue_synch_exit(sqp, connp); + + return ((error == 0) ? EINPROGRESS : error); +} + +/* ARGSUSED */ +sock_lower_handle_t +tcp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, + uint_t *smodep, int *errorp, int flags, cred_t *credp) +{ + conn_t *connp; + boolean_t isv6 = family == AF_INET6; + if (type != SOCK_STREAM || (family != AF_INET && family != AF_INET6) || + (proto != 0 && proto != IPPROTO_TCP)) { + *errorp = EPROTONOSUPPORT; + return (NULL); + } + + connp = tcp_create_common(NULL, credp, isv6, B_TRUE, errorp); + if (connp == NULL) { + return (NULL); + } + + /* + * Put the ref for TCP. Ref for IP was already put + * by ipcl_conn_create. Also Make the conn_t globally + * visible to walkers + */ + mutex_enter(&connp->conn_lock); + CONN_INC_REF_LOCKED(connp); + ASSERT(connp->conn_ref == 2); + connp->conn_state_flags &= ~CONN_INCIPIENT; + + connp->conn_flags |= IPCL_NONSTR; + mutex_exit(&connp->conn_lock); + + ASSERT(errorp != NULL); + *errorp = 0; + *sock_downcalls = &sock_tcp_downcalls; + *smodep = SM_CONNREQUIRED | SM_EXDATA | SM_ACCEPTSUPP; + + return ((sock_lower_handle_t)connp); +} + +/* ARGSUSED */ +void +tcp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle, + sock_upcalls_t *sock_upcalls, int flags, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + struct sock_proto_props sopp; + + sopp.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | + SOCKOPT_MAXPSZ | SOCKOPT_MAXBLK | SOCKOPT_RCVTIMER | + SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ; + + sopp.sopp_rxhiwat = SOCKET_RECVHIWATER; + sopp.sopp_rxlowat = SOCKET_RECVLOWATER; + sopp.sopp_maxpsz = INFPSZ; + sopp.sopp_maxblk = INFPSZ; + sopp.sopp_rcvtimer = SOCKET_TIMER_INTERVAL; + sopp.sopp_rcvthresh = SOCKET_RECVHIWATER >> 3; + 
sopp.sopp_maxaddrlen = sizeof (sin6_t); + sopp.sopp_minpsz = (tcp_rinfo.mi_minpsz == 1) ? 0 : + tcp_rinfo.mi_minpsz; + + connp->conn_upcalls = sock_upcalls; + connp->conn_upper_handle = sock_handle; + + (*sock_upcalls->su_set_proto_props)(sock_handle, &sopp); +} + +/* ARGSUSED */ +int +tcp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + + tcp_close_common(connp, flags); + + ip_close_helper_stream(connp); + + /* + * Drop IP's reference on the conn. This is the last reference + * on the connp if the state was less than established. If the + * connection has gone into timewait state, then we will have + * one ref for the TCP and one more ref (total of two) for the + * classifier connected hash list (a timewait connections stays + * in connected hash till closed). + * + * We can't assert the references because there might be other + * transient reference places because of some walkers or queued + * packets in squeue for the timewait state. 
+ */ + CONN_DEC_REF(connp); + return (0); +} + +/* ARGSUSED */ +int +tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, + cred_t *cr) +{ + tcp_t *tcp; + uint32_t msize; + conn_t *connp = (conn_t *)proto_handle; + int32_t tcpstate; + + ASSERT(connp->conn_ref >= 2); + + if (msg->msg_controllen != 0) { + return (EOPNOTSUPP); + + } + switch (DB_TYPE(mp)) { + case M_DATA: + tcp = connp->conn_tcp; + ASSERT(tcp != NULL); + + tcpstate = tcp->tcp_state; + if (tcpstate < TCPS_ESTABLISHED) { + freemsg(mp); + return (ENOTCONN); + } else if (tcpstate > TCPS_CLOSE_WAIT) { + freemsg(mp); + return (EPIPE); + } + + if (is_system_labeled()) + msg_setcredpid(mp, cr, curproc->p_pid); + + /* XXX pass the size down and to the squeue */ + msize = msgdsize(mp); + + mutex_enter(&tcp->tcp_non_sq_lock); + tcp->tcp_squeue_bytes += msize; + /* + * Squeue Flow Control + */ + if (TCP_UNSENT_BYTES(tcp) > tcp->tcp_xmit_hiwater) { + tcp_setqfull(tcp); + } + mutex_exit(&tcp->tcp_non_sq_lock); + + /* + * The application may pass in an address in the msghdr, but + * we ignore the address on connection-oriented sockets. + * Just like BSD this code does not generate an error for + * TCP (a CONNREQUIRED socket) when sending to an address + * passed in with sendto/sendmsg. Instead the data is + * delivered on the connection as if no address had been + * supplied. 
+ */ + CONN_INC_REF(connp); + + if (msg != NULL && msg->msg_flags & MSG_OOB) { + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, + tcp_output_urgent, connp, tcp_squeue_flag, + SQTAG_TCP_OUTPUT); + } else { + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, + connp, tcp_squeue_flag, SQTAG_TCP_OUTPUT); + } + + return (0); + + default: + ASSERT(0); + } + + freemsg(mp); + return (0); +} + +/* ARGSUSED */ +void +tcp_output_urgent(void *arg, mblk_t *mp, void *arg2) +{ + int len; + uint32_t msize; + conn_t *connp = (conn_t *)arg; + tcp_t *tcp = connp->conn_tcp; + + msize = msgdsize(mp); + + len = msize - 1; + if (len < 0) { + freemsg(mp); + return; + } + + /* + * Try to force urgent data out on the wire. + * Even if we have unsent data this will + * at least send the urgent flag. + * XXX does not handle more flag correctly. + */ + len += tcp->tcp_unsent; + len += tcp->tcp_snxt; + tcp->tcp_urg = len; + tcp->tcp_valid_bits |= TCP_URG_VALID; + + /* Bypass tcp protocol for fused tcp loopback */ + if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize)) + return; + tcp_wput_data(tcp, mp, B_TRUE); +} + +/* ARGSUSED */ +int +tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr, + socklen_t *addrlen, cred_t *cr) +{ + sin_t *sin; + sin6_t *sin6; + conn_t *connp = (conn_t *)proto_handle; + tcp_t *tcp = connp->conn_tcp; + + ASSERT(tcp != NULL); + if (tcp->tcp_state < TCPS_SYN_RCVD) + return (ENOTCONN); + + addr->sa_family = tcp->tcp_family; + switch (tcp->tcp_family) { + case AF_INET: + if (*addrlen < sizeof (sin_t)) + return (EINVAL); + + sin = (sin_t *)addr; + *sin = sin_null; + sin->sin_family = AF_INET; + if (tcp->tcp_ipversion == IPV4_VERSION) { + IN6_V4MAPPED_TO_IPADDR(&tcp->tcp_remote_v6, + sin->sin_addr.s_addr); + } + sin->sin_port = tcp->tcp_fport; + *addrlen = sizeof (struct sockaddr_in); + break; + case AF_INET6: + sin6 = (sin6_t *)addr; + *sin6 = sin6_null; + sin6->sin6_family = AF_INET6; + + if (*addrlen < sizeof (struct sockaddr_in6)) + return (EINVAL); + + if 
(tcp->tcp_ipversion == IPV6_VERSION) { + sin6->sin6_flowinfo = tcp->tcp_ip6h->ip6_vcf & + ~IPV6_VERS_AND_FLOW_MASK; + } + + sin6->sin6_addr = tcp->tcp_remote_v6; + sin6->sin6_port = tcp->tcp_fport; + *addrlen = sizeof (struct sockaddr_in6); + break; + } + return (0); +} + +/* ARGSUSED */ +int +tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr, + socklen_t *addrlenp, cred_t *cr) +{ + sin_t *sin; + sin6_t *sin6; + conn_t *connp = (conn_t *)proto_handle; + tcp_t *tcp = connp->conn_tcp; + + switch (tcp->tcp_family) { + case AF_INET: + ASSERT(tcp->tcp_ipversion == IPV4_VERSION); + if (*addrlenp < sizeof (sin_t)) + return (EINVAL); + sin = (sin_t *)addr; + *sin = sin_null; + sin->sin_family = AF_INET; + *addrlenp = sizeof (sin_t); + if (tcp->tcp_state >= TCPS_BOUND) { + sin->sin_addr.s_addr = tcp->tcp_ipha->ipha_src; + sin->sin_port = tcp->tcp_lport; + } + break; + + case AF_INET6: + if (*addrlenp < sizeof (sin6_t)) + return (EINVAL); + sin6 = (sin6_t *)addr; + *sin6 = sin6_null; + sin6->sin6_family = AF_INET6; + *addrlenp = sizeof (sin6_t); + if (tcp->tcp_state >= TCPS_BOUND) { + sin6->sin6_port = tcp->tcp_lport; + if (tcp->tcp_ipversion == IPV4_VERSION) { + IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, + &sin6->sin6_addr); + } else { + sin6->sin6_addr = tcp->tcp_ip6h->ip6_src; + } + } + break; + } + return (0); +} + +/* + * tcp_fallback + * + * A direct socket is falling back to using STREAMS. Hanging + * off of the queue is a temporary tcp_t, which was created using + * tcp_open(). The tcp_open() was called as part of the regular + * sockfs create path, i.e., the SO_SOCKSTR flag is passed down, + * and therefore the temporary tcp_t is marked to be a socket + * (i.e., IPCL_SOCKET, tcp_issocket). So the optimizations + * introduced by FireEngine will be used. + * + * The tcp_t associated with the socket falling back will + * still be marked as a socket, although the direct socket flag + * (IPCL_NONSTR) is removed. 
A fall back to true TPI semantics + * will not take place until a _SIOCSOCKFALLBACK ioctl is issued. + * + * If the above mentioned behavior, i.e., the tmp tcp_t is created + * as a STREAMS/TPI endpoint, then we will need to do more work here. + * Such as inserting the direct socket into the acceptor hash. + */ +void +tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q, + boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb) +{ + tcp_t *tcp, *eager; + conn_t *connp = (conn_t *)proto_handle; + int error; + struct T_capability_ack tca; + struct sockaddr_in6 laddr, faddr; + socklen_t laddrlen, faddrlen; + short opts; + struct stroptions *stropt; + mblk_t *stropt_mp; + mblk_t *mp; + mblk_t *conn_ind_head = NULL; + mblk_t *conn_ind_tail = NULL; + mblk_t *ordrel_mp; + mblk_t *fused_sigurp_mp; + + tcp = connp->conn_tcp; + /* + * No support for acceptor fallback + */ + ASSERT(q->q_qinfo != &tcp_acceptor_rinit); + + stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, STR_NOSIG, NULL); + + /* Pre-allocate the T_ordrel_ind mblk. */ + ASSERT(tcp->tcp_ordrel_mp == NULL); + ordrel_mp = allocb_wait(sizeof (struct T_ordrel_ind), BPRI_HI, + STR_NOSIG, NULL); + ordrel_mp->b_datap->db_type = M_PROTO; + ((struct T_ordrel_ind *)ordrel_mp->b_rptr)->PRIM_type = T_ORDREL_IND; + ordrel_mp->b_wptr += sizeof (struct T_ordrel_ind); + + /* Pre-allocate the M_PCSIG anyway */ + fused_sigurp_mp = allocb_wait(1, BPRI_HI, STR_NOSIG, NULL); + + /* + * Enter the squeue so that no new packets can come in + */ + error = squeue_synch_enter(connp->conn_sqp, connp, 0); + if (error != 0) { + /* failed to enter, free all the pre-allocated messages. 
*/ + freeb(stropt_mp); + freeb(ordrel_mp); + freeb(fused_sigurp_mp); + return; + } + + /* Disable I/OAT during fallback */ + tcp->tcp_sodirect = NULL; + + connp->conn_dev = (dev_t)RD(q)->q_ptr; + connp->conn_minor_arena = WR(q)->q_ptr; + + RD(q)->q_ptr = WR(q)->q_ptr = connp; + + connp->conn_tcp->tcp_rq = connp->conn_rq = RD(q); + connp->conn_tcp->tcp_wq = connp->conn_wq = WR(q); + + WR(q)->q_qinfo = &tcp_sock_winit; + + if (!direct_sockfs) + tcp_disable_direct_sockfs(tcp); + + /* + * free the helper stream + */ + ip_close_helper_stream(connp); + + /* + * Notify the STREAM head about options + */ + DB_TYPE(stropt_mp) = M_SETOPTS; + stropt = (struct stroptions *)stropt_mp->b_rptr; + stropt_mp->b_wptr += sizeof (struct stroptions); + stropt = (struct stroptions *)stropt_mp->b_rptr; + stropt->so_flags |= SO_HIWAT | SO_WROFF | SO_MAXBLK; + + stropt->so_wroff = tcp->tcp_hdr_len + (tcp->tcp_loopback ? 0 : + tcp->tcp_tcps->tcps_wroff_xtra); + if (tcp->tcp_snd_sack_ok) + stropt->so_wroff += TCPOPT_MAX_SACK_LEN; + stropt->so_hiwat = tcp->tcp_fused ? + tcp_fuse_set_rcv_hiwat(tcp, tcp->tcp_recv_hiwater) : + MAX(tcp->tcp_recv_hiwater, tcp->tcp_tcps->tcps_sth_rcv_hiwat); + stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE); + + putnext(RD(q), stropt_mp); + + /* + * Collect the information needed to sync with the sonode + */ + tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID); + + laddrlen = faddrlen = sizeof (sin6_t); + (void) tcp_getsockname(proto_handle, (struct sockaddr *)&laddr, + &laddrlen, CRED()); + error = tcp_getpeername(proto_handle, (struct sockaddr *)&faddr, + &faddrlen, CRED()); + if (error != 0) + faddrlen = 0; + + opts = 0; + if (tcp->tcp_oobinline) + opts |= SO_OOBINLINE; + if (tcp->tcp_dontroute) + opts |= SO_DONTROUTE; + + /* + * Notify the socket that the protocol is now quiescent, + * and it's therefore safe move data from the socket + * to the stream head. 
+ */ + (*quiesced_cb)(connp->conn_upper_handle, q, &tca, + (struct sockaddr *)&laddr, laddrlen, + (struct sockaddr *)&faddr, faddrlen, opts); + + while ((mp = tcp->tcp_rcv_list) != NULL) { + tcp->tcp_rcv_list = mp->b_next; + mp->b_next = NULL; + putnext(q, mp); + } + tcp->tcp_rcv_last_head = NULL; + tcp->tcp_rcv_last_tail = NULL; + tcp->tcp_rcv_cnt = 0; + + /* + * No longer a direct socket + */ + connp->conn_flags &= ~IPCL_NONSTR; + + tcp->tcp_ordrel_mp = ordrel_mp; + + if (tcp->tcp_fused) { + ASSERT(tcp->tcp_fused_sigurg_mp == NULL); + tcp->tcp_fused_sigurg_mp = fused_sigurp_mp; + } else { + freeb(fused_sigurp_mp); + } + + /* + * Send T_CONN_IND messages for all ESTABLISHED connections. + */ + mutex_enter(&tcp->tcp_eager_lock); + for (eager = tcp->tcp_eager_next_q; eager != NULL; + eager = eager->tcp_eager_next_q) { + mp = eager->tcp_conn.tcp_eager_conn_ind; + + eager->tcp_conn.tcp_eager_conn_ind = NULL; + ASSERT(mp != NULL); + /* + * TLI/XTI applications will get confused by + * sending eager as an option since it violates + * the option semantics. So remove the eager as + * option since TLI/XTI app doesn't need it anyway. 
+ */ + if (!TCP_IS_SOCKET(tcp)) { + struct T_conn_ind *conn_ind; + + conn_ind = (struct T_conn_ind *)mp->b_rptr; + conn_ind->OPT_length = 0; + conn_ind->OPT_offset = 0; + } + if (conn_ind_head == NULL) { + conn_ind_head = mp; + } else { + conn_ind_tail->b_next = mp; + } + conn_ind_tail = mp; + } + mutex_exit(&tcp->tcp_eager_lock); + + mp = conn_ind_head; + while (mp != NULL) { + mblk_t *nmp = mp->b_next; + mp->b_next = NULL; + + putnext(tcp->tcp_rq, mp); + mp = nmp; + } + + /* + * There should be atleast two ref's (IP + TCP) + */ + ASSERT(connp->conn_ref >= 2); + squeue_synch_exit(connp->conn_sqp, connp); +} + +/* ARGSUSED */ +static void +tcp_shutdown_output(void *arg, mblk_t *mp, void *arg2) +{ + conn_t *connp = (conn_t *)arg; + tcp_t *tcp = connp->conn_tcp; + + freemsg(mp); + + if (tcp->tcp_fused) + tcp_unfuse(tcp); + + if (tcp_xmit_end(tcp) != 0) { + /* + * We were crossing FINs and got a reset from + * the other side. Just ignore it. + */ + if (tcp->tcp_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, + SL_ERROR|SL_TRACE, + "tcp_shutdown_output() out of state %s", + tcp_display(tcp, NULL, DISP_ADDR_AND_PORT)); + } + } +} + +/* ARGSUSED */ +int +tcp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + tcp_t *tcp = connp->conn_tcp; + + /* + * X/Open requires that we check the connected state. + */ + if (tcp->tcp_state < TCPS_SYN_SENT) + return (ENOTCONN); + + /* shutdown the send side */ + if (how != SHUT_RD) { + mblk_t *bp; + + bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL); + CONN_INC_REF(connp); + SQUEUE_ENTER_ONE(connp->conn_sqp, bp, tcp_shutdown_output, + connp, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT); + + (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, + SOCK_OPCTL_SHUT_SEND, 0); + } + + /* shutdown the recv side */ + if (how != SHUT_WR) + (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, + SOCK_OPCTL_SHUT_RECV, 0); + + return (0); +} + +/* + * SOP_LISTEN() calls into tcp_listen(). 
+ */ +/* ARGSUSED */ +int +tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + int error; + squeue_t *sqp = connp->conn_sqp; + + error = squeue_synch_enter(sqp, connp, 0); + if (error != 0) { + /* failed to enter */ + return (ENOBUFS); + } + + error = tcp_do_listen(connp, backlog, cr); + if (error == 0) { + (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, + SOCK_OPCTL_ENAB_ACCEPT, (uintptr_t)backlog); + } else if (error < 0) { + if (error == -TOUTSTATE) + error = EINVAL; + else + error = proto_tlitosyserr(-error); + } + squeue_synch_exit(sqp, connp); + return (error); +} + +static int +tcp_do_listen(conn_t *connp, int backlog, cred_t *cr) +{ + tcp_t *tcp = connp->conn_tcp; + sin_t *sin; + sin6_t *sin6; + int error = 0; + tcp_stack_t *tcps = tcp->tcp_tcps; + + if (tcp->tcp_state >= TCPS_BOUND) { + if ((tcp->tcp_state == TCPS_BOUND || + tcp->tcp_state == TCPS_LISTEN) && + backlog > 0) { + /* + * Handle listen() increasing backlog. + * This is more "liberal" then what the TPI spec + * requires but is needed to avoid a t_unbind + * when handling listen() since the port number + * might be "stolen" between the unbind and bind. + */ + goto do_listen; + } + if (tcp->tcp_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, + "tcp_bind: bad state, %d", tcp->tcp_state); + } + return (-TOUTSTATE); + } else { + int32_t len; + sin6_t addr; + + /* Do an implicit bind: Request for a generic port. 
*/ + if (tcp->tcp_family == AF_INET) { + len = sizeof (sin_t); + sin = (sin_t *)&addr; + *sin = sin_null; + sin->sin_family = AF_INET; + tcp->tcp_ipversion = IPV4_VERSION; + } else { + ASSERT(tcp->tcp_family == AF_INET6); + len = sizeof (sin6_t); + sin6 = (sin6_t *)&addr; + *sin6 = sin6_null; + sin6->sin6_family = AF_INET6; + tcp->tcp_ipversion = IPV6_VERSION; + } + + error = tcp_bind_check(connp, (struct sockaddr *)&addr, len, + cr, B_FALSE); + if (error) + return (error); + /* Fall through and do the fanout insertion */ + } + +do_listen: + ASSERT(tcp->tcp_state == TCPS_BOUND || tcp->tcp_state == TCPS_LISTEN); + tcp->tcp_conn_req_max = backlog; + if (tcp->tcp_conn_req_max) { + if (tcp->tcp_conn_req_max < tcps->tcps_conn_req_min) + tcp->tcp_conn_req_max = tcps->tcps_conn_req_min; + if (tcp->tcp_conn_req_max > tcps->tcps_conn_req_max_q) + tcp->tcp_conn_req_max = tcps->tcps_conn_req_max_q; + /* + * If this is a listener, do not reset the eager list + * and other stuffs. Note that we don't check if the + * existing eager list meets the new tcp_conn_req_max + * requirement. + */ + if (tcp->tcp_state != TCPS_LISTEN) { + tcp->tcp_state = TCPS_LISTEN; + /* Initialize the chain. Don't need the eager_lock */ + tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp; + tcp->tcp_eager_next_drop_q0 = tcp; + tcp->tcp_eager_prev_drop_q0 = tcp; + tcp->tcp_second_ctimer_threshold = + tcps->tcps_ip_abort_linterval; + } + } + + /* + * We can call ip_bind directly which returns a T_BIND_ACK mp. The + * processing continues in tcp_rput_other(). + * + * We need to make sure that the conn_recv is set to a non-null + * value before we insert the conn into the classifier table. + * This is to avoid a race with an incoming packet which does an + * ipcl_classify(). 
+ */ + connp->conn_recv = tcp_conn_request; + if (tcp->tcp_family == AF_INET) { + error = ip_proto_bind_laddr_v4(connp, NULL, + IPPROTO_TCP, tcp->tcp_bound_source, tcp->tcp_lport, B_TRUE); + } else { + error = ip_proto_bind_laddr_v6(connp, NULL, IPPROTO_TCP, + &tcp->tcp_bound_source_v6, tcp->tcp_lport, B_TRUE); + } + return (tcp_post_ip_bind(tcp, NULL, error)); +} + +void +tcp_clr_flowctrl(sock_lower_handle_t proto_handle) +{ + conn_t *connp = (conn_t *)proto_handle; + tcp_t *tcp = connp->conn_tcp; + tcp_stack_t *tcps = tcp->tcp_tcps; + uint_t thwin; + + (void) squeue_synch_enter(connp->conn_sqp, connp, 0); + + /* Flow control condition has been removed. */ + tcp->tcp_rwnd = tcp->tcp_recv_hiwater; + thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win)) + << tcp->tcp_rcv_ws; + thwin -= tcp->tcp_rnxt - tcp->tcp_rack; + /* + * Send back a window update immediately if TCP is above + * ESTABLISHED state and the increase of the rcv window + * that the other side knows is at least 1 MSS after flow + * control is lifted. + */ + if (tcp->tcp_state >= TCPS_ESTABLISHED && + (tcp->tcp_recv_hiwater - thwin >= tcp->tcp_mss)) { + tcp_xmit_ctl(NULL, tcp, + (tcp->tcp_swnd == 0) ? 
tcp->tcp_suna : + tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); + BUMP_MIB(&tcps->tcps_mib, tcpOutWinUpdate); + } + + squeue_synch_exit(connp->conn_sqp, connp); +} + +/* ARGSUSED */ +int +tcp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, + int mode, int32_t *rvalp, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + int error; + + switch (cmd) { + case ND_SET: + case ND_GET: + case TCP_IOC_DEFAULT_Q: + case _SIOCSOCKFALLBACK: + case TCP_IOC_ABORT_CONN: + case TI_GETPEERNAME: + case TI_GETMYNAME: + ip1dbg(("tcp_ioctl: cmd 0x%x on non sreams socket", + cmd)); + error = EINVAL; + break; + default: + /* + * Pass on to IP using helper stream + */ + error = ldi_ioctl( + connp->conn_helper_info->ip_helper_stream_handle, + cmd, arg, mode, cr, rvalp); + break; + } + return (error); +} + +sock_downcalls_t sock_tcp_downcalls = { + tcp_activate, + tcp_accept, + tcp_bind, + tcp_listen, + tcp_connect, + tcp_getpeername, + tcp_getsockname, + tcp_getsockopt, + tcp_setsockopt, + tcp_sendmsg, + NULL, + NULL, + NULL, + tcp_shutdown, + tcp_clr_flowctrl, + tcp_ioctl, + tcp_close, +}; diff --git a/usr/src/uts/common/inet/tcp/tcp_fusion.c b/usr/src/uts/common/inet/tcp/tcp_fusion.c index a192c7ad07..15b5d04d61 100644 --- a/usr/src/uts/common/inet/tcp/tcp_fusion.c +++ b/usr/src/uts/common/inet/tcp/tcp_fusion.c @@ -261,10 +261,9 @@ tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph) tcp->tcp_kssl_ent == NULL && !IPP_ENABLED(IPP_LOCAL_OUT|IPP_LOCAL_IN, ipst)) { mblk_t *mp; - struct stroptions *stropt; queue_t *peer_rq = peer_tcp->tcp_rq; - ASSERT(!TCP_IS_DETACHED(peer_tcp) && peer_rq != NULL); + ASSERT(!TCP_IS_DETACHED(peer_tcp)); ASSERT(tcp->tcp_fused_sigurg_mp == NULL); ASSERT(peer_tcp->tcp_fused_sigurg_mp == NULL); ASSERT(tcp->tcp_kssl_ctx == NULL); @@ -276,19 +275,25 @@ tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph) * This is why we pre-allocate the M_PCSIG mblks for both * endpoints which will only be used during/after unfuse. 
*/ - if ((mp = allocb(1, BPRI_HI)) == NULL) - goto failed; + if (!IPCL_IS_NONSTR(tcp->tcp_connp)) { + if ((mp = allocb(1, BPRI_HI)) == NULL) + goto failed; - tcp->tcp_fused_sigurg_mp = mp; + tcp->tcp_fused_sigurg_mp = mp; + } - if ((mp = allocb(1, BPRI_HI)) == NULL) - goto failed; + if (!IPCL_IS_NONSTR(peer_tcp->tcp_connp)) { + if ((mp = allocb(1, BPRI_HI)) == NULL) + goto failed; - peer_tcp->tcp_fused_sigurg_mp = mp; + peer_tcp->tcp_fused_sigurg_mp = mp; + } - /* Allocate M_SETOPTS mblk */ - if ((mp = allocb(sizeof (*stropt), BPRI_HI)) == NULL) + if (!IPCL_IS_NONSTR(peer_tcp->tcp_connp) && + (mp = allocb(sizeof (struct stroptions), + BPRI_HI)) == NULL) { goto failed; + } /* If either tcp or peer_tcp sodirect enabled then disable */ if (tcp->tcp_sodirect != NULL) { @@ -329,12 +334,12 @@ tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph) * us data as soon as fusion is finished, and we need to be * able to flow control it in case it sends down huge amount * of data while we're still detached. To prevent that we - * inherit the listener's q_hiwat value; this is temporary - * since we'll repeat the process in tcp_accept_finish(). + * inherit the listener's recv_hiwater value; this is temporary + * since we'll repeat the process intcp_accept_finish(). */ if (!tcp->tcp_refuse) { (void) tcp_fuse_set_rcv_hiwat(tcp, - tcp->tcp_saved_listener->tcp_rq->q_hiwat); + tcp->tcp_saved_listener->tcp_recv_hiwater); /* * Set the stream head's write offset value to zero @@ -342,30 +347,53 @@ tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph) * headers; tell it to not break up the writes (this * would reduce the amount of work done by kmem); and * configure our receive buffer. Note that we can only - * do this for the active connect tcp since our eager - * is still detached; it will be dealt with later in + * do this for the active connect tcp since our eager is + * still detached; it will be dealt with later in * tcp_accept_finish(). 
*/ - DB_TYPE(mp) = M_SETOPTS; - mp->b_wptr += sizeof (*stropt); - - stropt = (struct stroptions *)mp->b_rptr; - stropt->so_flags = SO_MAXBLK | SO_WROFF | SO_HIWAT; - stropt->so_maxblk = tcp_maxpsz_set(peer_tcp, B_FALSE); - stropt->so_wroff = 0; - - /* - * Record the stream head's high water mark for - * peer endpoint; this is used for flow-control - * purposes in tcp_fuse_output(). - */ - stropt->so_hiwat = tcp_fuse_set_rcv_hiwat(peer_tcp, - peer_rq->q_hiwat); - - tcp->tcp_refuse = B_FALSE; - peer_tcp->tcp_refuse = B_FALSE; - /* Send the options up */ - putnext(peer_rq, mp); + if (!IPCL_IS_NONSTR(peer_tcp->tcp_connp)) { + struct stroptions *stropt; + + DB_TYPE(mp) = M_SETOPTS; + mp->b_wptr += sizeof (*stropt); + + stropt = (struct stroptions *)mp->b_rptr; + stropt->so_flags = SO_MAXBLK|SO_WROFF|SO_HIWAT; + stropt->so_maxblk = tcp_maxpsz_set(peer_tcp, + B_FALSE); + stropt->so_wroff = 0; + + /* + * Record the stream head's high water mark for + * peer endpoint; this is used for flow-control + * purposes in tcp_fuse_output(). 
+ */ + stropt->so_hiwat = tcp_fuse_set_rcv_hiwat( + peer_tcp, peer_rq->q_hiwat); + + tcp->tcp_refuse = B_FALSE; + peer_tcp->tcp_refuse = B_FALSE; + /* Send the options up */ + putnext(peer_rq, mp); + } else { + struct sock_proto_props sopp; + + /* The peer is a non-STREAMS end point */ + ASSERT(IPCL_IS_TCP(peer_connp)); + + (void) tcp_fuse_set_rcv_hiwat(tcp, + tcp->tcp_saved_listener->tcp_recv_hiwater); + + sopp.sopp_flags = SOCKOPT_MAXBLK | + SOCKOPT_WROFF | SOCKOPT_RCVHIWAT; + sopp.sopp_maxblk = tcp_maxpsz_set(peer_tcp, + B_FALSE); + sopp.sopp_wroff = 0; + sopp.sopp_rxhiwat = tcp_fuse_set_rcv_hiwat( + peer_tcp, peer_tcp->tcp_recv_hiwater); + (*peer_connp->conn_upcalls->su_set_proto_props) + (peer_connp->conn_upper_handle, &sopp); + } } tcp->tcp_refuse = B_FALSE; peer_tcp->tcp_refuse = B_FALSE; @@ -399,8 +427,6 @@ tcp_unfuse(tcp_t *tcp) ASSERT(peer_tcp->tcp_fused && peer_tcp->tcp_loopback_peer == tcp); ASSERT(tcp->tcp_connp->conn_sqp == peer_tcp->tcp_connp->conn_sqp); ASSERT(tcp->tcp_unsent == 0 && peer_tcp->tcp_unsent == 0); - ASSERT(tcp->tcp_fused_sigurg_mp != NULL); - ASSERT(peer_tcp->tcp_fused_sigurg_mp != NULL); /* * We disable synchronous streams, drain any queued data and @@ -420,10 +446,16 @@ tcp_unfuse(tcp_t *tcp) /* Unfuse the endpoints */ peer_tcp->tcp_fused = tcp->tcp_fused = B_FALSE; peer_tcp->tcp_loopback_peer = tcp->tcp_loopback_peer = NULL; - freeb(peer_tcp->tcp_fused_sigurg_mp); - freeb(tcp->tcp_fused_sigurg_mp); - peer_tcp->tcp_fused_sigurg_mp = NULL; - tcp->tcp_fused_sigurg_mp = NULL; + if (!IPCL_IS_NONSTR(peer_tcp->tcp_connp)) { + ASSERT(peer_tcp->tcp_fused_sigurg_mp != NULL); + freeb(peer_tcp->tcp_fused_sigurg_mp); + peer_tcp->tcp_fused_sigurg_mp = NULL; + } + if (!IPCL_IS_NONSTR(tcp->tcp_connp)) { + ASSERT(tcp->tcp_fused_sigurg_mp != NULL); + freeb(tcp->tcp_fused_sigurg_mp); + tcp->tcp_fused_sigurg_mp = NULL; + } } /* @@ -527,6 +559,7 @@ tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size) uint_t max_unread; boolean_t flow_stopped, 
peer_data_queued = B_FALSE; boolean_t urgent = (DB_TYPE(mp) != M_DATA); + boolean_t push = B_FALSE; mblk_t *mp1 = mp; ill_t *ilp, *olp; ipif_t *iifp, *oifp; @@ -546,7 +579,6 @@ tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size) ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); - /* If this connection requires IP, unfuse and use regular path */ if (tcp_loopback_needs_ip(tcp, ns) || tcp_loopback_needs_ip(peer_tcp, ns) || @@ -749,7 +781,38 @@ tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size) * Enqueue data into the peer's receive list; we may or may not * drain the contents depending on the conditions below. */ - tcp_rcv_enqueue(peer_tcp, mp, recv_size); + if (IPCL_IS_NONSTR(peer_tcp->tcp_connp) && + peer_tcp->tcp_connp->conn_upper_handle != NULL) { + int error; + int flags = 0; + + if ((tcp->tcp_valid_bits & TCP_URG_VALID) && + (tcp->tcp_urg == tcp->tcp_snxt)) { + flags = MSG_OOB; + (*peer_tcp->tcp_connp->conn_upcalls->su_signal_oob) + (peer_tcp->tcp_connp->conn_upper_handle, 0); + tcp->tcp_valid_bits &= ~TCP_URG_VALID; + } + (*peer_tcp->tcp_connp->conn_upcalls->su_recv)( + peer_tcp->tcp_connp->conn_upper_handle, mp, recv_size, + flags, &error, &push); + } else { + if (IPCL_IS_NONSTR(peer_tcp->tcp_connp) && + (tcp->tcp_valid_bits & TCP_URG_VALID) && + (tcp->tcp_urg == tcp->tcp_snxt)) { + /* + * Can not deal with urgent pointers + * that arrive before the connection has been + * accept()ed. 
+ */ + tcp->tcp_valid_bits &= ~TCP_URG_VALID; + freemsg(mp); + mutex_exit(&peer_tcp->tcp_non_sq_lock); + return (B_TRUE); + } + + tcp_rcv_enqueue(peer_tcp, mp, recv_size); + } /* In case it wrapped around and also to keep it constant */ peer_tcp->tcp_rwnd += recv_size; @@ -797,6 +860,7 @@ tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size) (peer_tcp->tcp_rcv_cnt >= peer_tcp->tcp_fuse_rcv_hiwater || peer_tcp->tcp_fuse_rcv_unread_cnt >= max_unread)) || (!peer_tcp->tcp_direct_sockfs && !TCP_IS_DETACHED(peer_tcp) && + !IPCL_IS_NONSTR(peer_tcp->tcp_connp) && !canputnext(peer_tcp->tcp_rq))) { peer_data_queued = B_TRUE; } @@ -861,7 +925,8 @@ tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size) * will pull the data via tcp_fuse_rrw(). */ if (urgent || (!flow_stopped && !peer_tcp->tcp_direct_sockfs)) { - ASSERT(peer_tcp->tcp_rcv_list != NULL); + ASSERT(IPCL_IS_NONSTR(peer_tcp->tcp_connp) || + peer_tcp->tcp_rcv_list != NULL); /* * For TLI-based streams, a thread in tcp_accept_swap() * can race with us. That thread will ensure that the @@ -897,6 +962,8 @@ boolean_t tcp_fuse_rcv_drain(queue_t *q, tcp_t *tcp, mblk_t **sigurg_mpp) { mblk_t *mp; + conn_t *connp = tcp->tcp_connp; + #ifdef DEBUG uint_t cnt = 0; #endif @@ -907,7 +974,7 @@ tcp_fuse_rcv_drain(queue_t *q, tcp_t *tcp, mblk_t **sigurg_mpp) ASSERT(tcp->tcp_loopback); ASSERT(tcp->tcp_fused || tcp->tcp_fused_sigurg); ASSERT(!tcp->tcp_fused || tcp->tcp_loopback_peer != NULL); - ASSERT(sigurg_mpp != NULL || tcp->tcp_fused); + ASSERT(IPCL_IS_NONSTR(connp) || sigurg_mpp != NULL || tcp->tcp_fused); /* No need for the push timer now, in case it was scheduled */ if (tcp->tcp_push_tid != 0) { @@ -921,34 +988,41 @@ tcp_fuse_rcv_drain(queue_t *q, tcp_t *tcp, mblk_t **sigurg_mpp) * works properly. */ if (tcp->tcp_fused_sigurg) { - /* - * sigurg_mpp is normally NULL, i.e. when we're still - * fused and didn't get here because of tcp_unfuse(). - * In this case try hard to allocate the M_PCSIG mblk. 
- */ - if (sigurg_mpp == NULL && - (mp = allocb(1, BPRI_HI)) == NULL && - (mp = allocb_tryhard(1)) == NULL) { - /* Alloc failed; try again next time */ - tcp->tcp_push_tid = TCP_TIMER(tcp, tcp_push_timer, - MSEC_TO_TICK(tcps->tcps_push_timer_interval)); - return (B_TRUE); - } else if (sigurg_mpp != NULL) { + tcp->tcp_fused_sigurg = B_FALSE; + if (IPCL_IS_NONSTR(connp)) { + (*connp->conn_upcalls->su_signal_oob) + (connp->conn_upper_handle, 0); + } else { /* - * Use the supplied M_PCSIG mblk; it means we're - * either unfused or in the process of unfusing, - * and the drain must happen now. + * sigurg_mpp is normally NULL, i.e. when we're still + * fused and didn't get here because of tcp_unfuse(). + * In this case try hard to allocate the M_PCSIG mblk. */ - mp = *sigurg_mpp; - *sigurg_mpp = NULL; - } - ASSERT(mp != NULL); + if (sigurg_mpp == NULL && + (mp = allocb(1, BPRI_HI)) == NULL && + (mp = allocb_tryhard(1)) == NULL) { + /* Alloc failed; try again next time */ + tcp->tcp_push_tid = TCP_TIMER(tcp, + tcp_push_timer, + MSEC_TO_TICK( + tcps->tcps_push_timer_interval)); + return (B_TRUE); + } else if (sigurg_mpp != NULL) { + /* + * Use the supplied M_PCSIG mblk; it means we're + * either unfused or in the process of unfusing, + * and the drain must happen now. + */ + mp = *sigurg_mpp; + *sigurg_mpp = NULL; + } + ASSERT(mp != NULL); - tcp->tcp_fused_sigurg = B_FALSE; - /* Send up the signal */ - DB_TYPE(mp) = M_PCSIG; - *mp->b_wptr++ = (uchar_t)SIGURG; - putnext(q, mp); + /* Send up the signal */ + DB_TYPE(mp) = M_PCSIG; + *mp->b_wptr++ = (uchar_t)SIGURG; + putnext(q, mp); + } /* * Let the regular tcp_rcv_drain() path handle * draining the data if we're no longer fused. 
@@ -980,6 +1054,7 @@ tcp_fuse_rcv_drain(queue_t *q, tcp_t *tcp, mblk_t **sigurg_mpp) #ifdef DEBUG cnt += msgdsize(mp); #endif + ASSERT(!IPCL_IS_NONSTR(connp)); if (sd_rd_eof) { freemsg(mp); } else { @@ -991,12 +1066,14 @@ tcp_fuse_rcv_drain(queue_t *q, tcp_t *tcp, mblk_t **sigurg_mpp) if (tcp->tcp_direct_sockfs && !sd_rd_eof) (void) strrput_sig(q, B_TRUE); +#ifdef DEBUG ASSERT(cnt == tcp->tcp_rcv_cnt); +#endif tcp->tcp_rcv_last_head = NULL; tcp->tcp_rcv_last_tail = NULL; tcp->tcp_rcv_cnt = 0; tcp->tcp_fuse_rcv_unread_cnt = 0; - tcp->tcp_rwnd = q->q_hiwat; + tcp->tcp_rwnd = tcp->tcp_recv_hiwater; if (peer_tcp->tcp_flow_stopped && (TCP_UNSENT_BYTES(peer_tcp) <= peer_tcp->tcp_xmit_lowater)) { @@ -1409,8 +1486,10 @@ tcp_fuse_disable_pair(tcp_t *tcp, boolean_t unfusing) } /* Disable synchronous streams */ - tcp_fuse_syncstr_disable(tcp); - tcp_fuse_syncstr_disable(peer_tcp); + if (!IPCL_IS_NONSTR(tcp->tcp_connp)) + tcp_fuse_syncstr_disable(tcp); + if (!IPCL_IS_NONSTR(peer_tcp->tcp_connp)) + tcp_fuse_syncstr_disable(peer_tcp); } /* diff --git a/usr/src/uts/common/inet/tcp/tcp_opt_data.c b/usr/src/uts/common/inet/tcp/tcp_opt_data.c index 4f0d767774..d977c27e53 100644 --- a/usr/src/uts/common/inet/tcp/tcp_opt_data.c +++ b/usr/src/uts/common/inet/tcp/tcp_opt_data.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/stream.h> #define _SUN_TPI_VERSION 2 @@ -43,8 +41,8 @@ extern int tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr); -extern int tcp_opt_get(queue_t *q, int level, int name, uchar_t *ptr); -extern int tcp_opt_set(queue_t *q, uint_t optset_context, int level, +extern int tcp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr); +extern int tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk); @@ -125,10 +123,10 @@ opdes_t tcp_opt_arr[] = { { IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), - 40, -1 /* not initialized */ }, + IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, { T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), - 40, -1 /* not initialized */ }, + IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, { IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, { T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, @@ -244,8 +242,8 @@ uint_t tcp_max_optsize; /* initialized when TCP driver is loaded */ optdb_obj_t tcp_opt_obj = { tcp_opt_default, /* TCP default value function pointer */ - tcp_opt_get, /* TCP get function pointer */ - tcp_opt_set, /* TCP set function pointer */ + tcp_tpi_opt_get, /* TCP get function pointer */ + tcp_tpi_opt_set, /* TCP set function pointer */ B_TRUE, /* TCP is tpi provider */ TCP_OPT_ARR_CNT, /* TCP option database count of entries */ tcp_opt_arr, /* TCP option database */ diff --git a/usr/src/uts/common/inet/tcp/tcpddi.c b/usr/src/uts/common/inet/tcp/tcpddi.c index ee5b0181b6..91da903826 100644 --- a/usr/src/uts/common/inet/tcp/tcpddi.c +++ b/usr/src/uts/common/inet/tcp/tcpddi.c @@ -29,12 +29,18 @@ #include <sys/modctl.h> #include <inet/common.h> #include <inet/ip.h> +#include 
<inet/tcp_impl.h> +#include <sys/strsubr.h> +#include <sys/socketvar.h> #define INET_NAME "tcp" #define INET_MODSTRTAB dummymodinfo #define INET_DEVSTRTAB tcpinfov4 #define INET_DEVDESC "TCP STREAMS driver" #define INET_MODDESC "TCP dummy STREAMS module" +#define INET_SOCKDESC "TCP socket module" +#define INET_SOCK_PROTO_CREATE_FUNC (*tcp_create) +#define INET_SOCK_PROTO_FB_FUNC (*tcp_fallback) #define INET_DEVMINOR 0 #define INET_MODMTFLAGS D_MP /* diff --git a/usr/src/uts/common/inet/tcp_impl.h b/usr/src/uts/common/inet/tcp_impl.h index 98d8d17f61..97374be482 100644 --- a/usr/src/uts/common/inet/tcp_impl.h +++ b/usr/src/uts/common/inet/tcp_impl.h @@ -39,6 +39,7 @@ extern "C" { #ifdef _KERNEL +#include <inet/optcom.h> #include <inet/tcp.h> #define TCP_MOD_ID 5105 @@ -274,6 +275,14 @@ extern int tcp_fuse_maxpsz_set(tcp_t *); extern optdb_obj_t tcp_opt_obj; extern uint_t tcp_max_optsize; +extern sock_lower_handle_t tcp_create(int, int, int, sock_downcalls_t **, + uint_t *, int *, int, cred_t *); +extern void tcp_fallback(sock_lower_handle_t, queue_t *, boolean_t, + so_proto_quiesced_cb_t); + +extern sock_downcalls_t sock_tcp_downcalls; + + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/inet/tcp_stack.h b/usr/src/uts/common/inet/tcp_stack.h index 43da079d5a..173875f0da 100644 --- a/usr/src/uts/common/inet/tcp_stack.h +++ b/usr/src/uts/common/inet/tcp_stack.h @@ -30,6 +30,8 @@ #include <sys/netstack.h> #include <inet/ip.h> #include <inet/ipdrop.h> +#include <sys/sunddi.h> +#include <sys/sunldi.h> #ifdef __cplusplus extern "C" { @@ -232,6 +234,7 @@ struct tcp_stack { uint32_t tcps_rst_cnt; /* The number of RST not sent because of the rate limit. 
*/ uint32_t tcps_rst_unsent; + ldi_ident_t tcps_ldi_ident; }; typedef struct tcp_stack tcp_stack_t; diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c index 70677c86d8..5f819f1285 100644 --- a/usr/src/uts/common/inet/udp/udp.c +++ b/usr/src/uts/common/inet/udp/udp.c @@ -40,13 +40,13 @@ #include <sys/strsubr.h> #include <sys/suntpi.h> #include <sys/xti_inet.h> -#include <sys/cmn_err.h> #include <sys/kmem.h> #include <sys/policy.h> #include <sys/ucred.h> #include <sys/zone.h> #include <sys/socket.h> +#include <sys/socketvar.h> #include <sys/sockio.h> #include <sys/vtrace.h> #include <sys/sdt.h> @@ -68,7 +68,7 @@ #include <inet/ip_if.h> #include <inet/ip_multi.h> #include <inet/ip_ndp.h> -#include <inet/mi.h> +#include <inet/proto_set.h> #include <inet/mib2.h> #include <inet/nd.h> #include <inet/optcom.h> @@ -150,17 +150,14 @@ typedef struct udpattrs_s { } udpattrs_t; static void udp_addr_req(queue_t *q, mblk_t *mp); -static void udp_bind(queue_t *q, mblk_t *mp); +static void udp_tpi_bind(queue_t *q, mblk_t *mp); static void udp_bind_hash_insert(udp_fanout_t *uf, udp_t *udp); static void udp_bind_hash_remove(udp_t *udp, boolean_t caller_holds_lock); -static void udp_bind_result(conn_t *, mblk_t *); -static void udp_bind_ack(conn_t *, mblk_t *mp); -static void udp_bind_error(conn_t *, mblk_t *mp); static int udp_build_hdrs(udp_t *udp); static void udp_capability_req(queue_t *q, mblk_t *mp); -static int udp_close(queue_t *q); -static void udp_connect(queue_t *q, mblk_t *mp); -static void udp_disconnect(queue_t *q, mblk_t *mp); +static int udp_tpi_close(queue_t *q, int flags); +static void udp_tpi_connect(queue_t *q, mblk_t *mp); +static void udp_tpi_disconnect(queue_t *q, mblk_t *mp); static void udp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error); static void udp_err_ack_prim(queue_t *q, mblk_t *mp, int primitive, @@ -171,8 +168,8 @@ static int udp_extra_priv_ports_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp, 
cred_t *cr); static int udp_extra_priv_ports_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr); -static void udp_icmp_error(queue_t *q, mblk_t *mp); -static void udp_icmp_error_ipv6(queue_t *q, mblk_t *mp); +static void udp_icmp_error(conn_t *, mblk_t *); +static void udp_icmp_error_ipv6(conn_t *, mblk_t *); static void udp_info_req(queue_t *q, mblk_t *mp); static void udp_input(void *, mblk_t *, void *); static mblk_t *udp_ip_bind_mp(udp_t *udp, t_scalar_t bind_prim, @@ -201,15 +198,16 @@ static void udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp, ipha_t *ipha); static void udp_ud_err(queue_t *q, mblk_t *mp, uchar_t *destaddr, t_scalar_t destlen, t_scalar_t err); -static void udp_unbind(queue_t *q, mblk_t *mp); +static void udp_tpi_unbind(queue_t *q, mblk_t *mp); static in_port_t udp_update_next_port(udp_t *udp, in_port_t port, boolean_t random); static mblk_t *udp_output_v4(conn_t *, mblk_t *, ipaddr_t, uint16_t, uint_t, - int *, boolean_t); + int *, boolean_t, struct nmsghdr *, cred_t *, pid_t); static mblk_t *udp_output_v6(conn_t *connp, mblk_t *mp, sin6_t *sin6, - int *error); + int *error, struct nmsghdr *msg, cred_t *cr, pid_t pid); static void udp_wput_other(queue_t *q, mblk_t *mp); static void udp_wput_iocdata(queue_t *q, mblk_t *mp); +static void udp_wput_fallback(queue_t *q, mblk_t *mp); static size_t udp_set_rcv_hiwat(udp_t *udp, size_t size); static void *udp_stack_init(netstackid_t stackid, netstack_t *ns); @@ -226,6 +224,25 @@ static void udp_rcv_enqueue(queue_t *q, udp_t *udp, mblk_t *mp, static void udp_rcv_drain(queue_t *q, udp_t *udp, boolean_t closing); static void udp_xmit(queue_t *, mblk_t *, ire_t *ire, conn_t *, zoneid_t); +static int udp_send_connected(conn_t *, mblk_t *, struct nmsghdr *, + cred_t *, pid_t); + +/* Common routine for TPI and socket module */ +static conn_t *udp_do_open(cred_t *, boolean_t, int); +static void udp_do_close(conn_t *); +static int udp_do_bind(conn_t *, struct sockaddr *, socklen_t, cred_t *, + 
boolean_t); +static int udp_do_unbind(conn_t *); +static int udp_do_getsockname(udp_t *, struct sockaddr *, uint_t *); +static int udp_do_getpeername(udp_t *, struct sockaddr *, uint_t *); + +int udp_getsockname(sock_lower_handle_t, + struct sockaddr *, socklen_t *, cred_t *); +int udp_getpeername(sock_lower_handle_t, + struct sockaddr *, socklen_t *, cred_t *); +static int udp_do_connect(conn_t *, const struct sockaddr *, socklen_t); +static int udp_post_ip_bind_connect(udp_t *, mblk_t *, int); + #define UDP_RECV_HIWATER (56 * 1024) #define UDP_RECV_LOWATER 128 #define UDP_XMIT_HIWATER (56 * 1024) @@ -240,12 +257,12 @@ static struct module_info udp_mod_info = { * We have separate open functions for the /dev/udp and /dev/udp6 devices. */ static struct qinit udp_rinitv4 = { - NULL, NULL, udp_openv4, udp_close, NULL, + NULL, NULL, udp_openv4, udp_tpi_close, NULL, &udp_mod_info, NULL, udp_rrw, udp_rinfop, STRUIOT_STANDARD }; static struct qinit udp_rinitv6 = { - NULL, NULL, udp_openv6, udp_close, NULL, + NULL, NULL, udp_openv6, udp_tpi_close, NULL, &udp_mod_info, NULL, udp_rrw, udp_rinfop, STRUIOT_STANDARD }; @@ -254,17 +271,22 @@ static struct qinit udp_winit = { &udp_mod_info, NULL, NULL, NULL, STRUIOT_NONE }; +/* UDP entry point during fallback */ +struct qinit udp_fallback_sock_winit = { + (pfi_t)udp_wput_fallback, NULL, NULL, NULL, NULL, &udp_mod_info +}; + /* * UDP needs to handle I_LINK and I_PLINK since ifconfig * likes to use it as a place to hang the various streams. */ static struct qinit udp_lrinit = { - (pfi_t)udp_lrput, NULL, udp_openv4, udp_close, NULL, + (pfi_t)udp_lrput, NULL, udp_openv4, udp_tpi_close, NULL, &udp_mod_info }; static struct qinit udp_lwinit = { - (pfi_t)udp_lwput, NULL, udp_openv4, udp_close, NULL, + (pfi_t)udp_lwput, NULL, udp_openv4, udp_tpi_close, NULL, &udp_mod_info }; @@ -559,30 +581,19 @@ udp_bind_hash_insert(udp_fanout_t *uf, udp_t *udp) * duplicating the us->us_next_port_to_try. 
*/ static void -udp_bind(queue_t *q, mblk_t *mp) +udp_tpi_bind(queue_t *q, mblk_t *mp) { sin_t *sin; sin6_t *sin6; mblk_t *mp1; - in_port_t port; /* Host byte order */ - in_port_t requested_port; /* Host byte order */ struct T_bind_req *tbr; - int count; - in6_addr_t v6src; - boolean_t bind_to_req_port_only; - int loopmax; - udp_fanout_t *udpf; - in_port_t lport; /* Network byte order */ - zoneid_t zoneid; conn_t *connp; udp_t *udp; - boolean_t is_inaddr_any; - mlp_type_t addrtype, mlptype; - udp_stack_t *us; + int error; + struct sockaddr *sa; connp = Q_TO_CONN(q); udp = connp->conn_udp; - us = udp->udp_us; if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, "udp_bind: bad req, len %u", @@ -607,6 +618,10 @@ udp_bind(queue_t *q, mblk_t *mp) } mp = mp1; + + /* Reset the message type in preparation for shipping it back. */ + DB_TYPE(mp) = M_PCPROTO; + tbr = (struct T_bind_req *)mp->b_rptr; switch (tbr->ADDR_length) { case 0: /* Request for a generic port */ @@ -617,6 +632,7 @@ udp_bind(queue_t *q, mblk_t *mp) *sin = sin_null; sin->sin_family = AF_INET; mp->b_wptr = (uchar_t *)&sin[1]; + sa = (struct sockaddr *)sin; } else { ASSERT(udp->udp_family == AF_INET6); tbr->ADDR_length = sizeof (sin6_t); @@ -624,38 +640,36 @@ udp_bind(queue_t *q, mblk_t *mp) *sin6 = sin6_null; sin6->sin6_family = AF_INET6; mp->b_wptr = (uchar_t *)&sin6[1]; + sa = (struct sockaddr *)sin6; } - port = 0; break; case sizeof (sin_t): /* Complete IPv4 address */ - sin = (sin_t *)mi_offset_param(mp, tbr->ADDR_offset, + sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset, sizeof (sin_t)); - if (sin == NULL || !OK_32PTR((char *)sin)) { + if (sa == NULL || !OK_32PTR((char *)sa)) { udp_err_ack(q, mp, TSYSERR, EINVAL); return; } if (udp->udp_family != AF_INET || - sin->sin_family != AF_INET) { + sa->sa_family != AF_INET) { udp_err_ack(q, mp, TSYSERR, EAFNOSUPPORT); return; } - port = ntohs(sin->sin_port); break; case sizeof (sin6_t): /* complete IPv6 
address */ - sin6 = (sin6_t *)mi_offset_param(mp, tbr->ADDR_offset, + sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset, sizeof (sin6_t)); - if (sin6 == NULL || !OK_32PTR((char *)sin6)) { + if (sa == NULL || !OK_32PTR((char *)sa)) { udp_err_ack(q, mp, TSYSERR, EINVAL); return; } if (udp->udp_family != AF_INET6 || - sin6->sin6_family != AF_INET6) { + sa->sa_family != AF_INET6) { udp_err_ack(q, mp, TSYSERR, EAFNOSUPPORT); return; } - port = ntohs(sin6->sin6_port); break; default: /* Invalid request */ @@ -665,503 +679,21 @@ udp_bind(queue_t *q, mblk_t *mp) return; } - requested_port = port; - - if (requested_port == 0 || tbr->PRIM_type == O_T_BIND_REQ) - bind_to_req_port_only = B_FALSE; - else /* T_BIND_REQ and requested_port != 0 */ - bind_to_req_port_only = B_TRUE; - - if (requested_port == 0) { - /* - * If the application passed in zero for the port number, it - * doesn't care which port number we bind to. Get one in the - * valid range. - */ - if (udp->udp_anon_priv_bind) { - port = udp_get_next_priv_port(udp); - } else { - port = udp_update_next_port(udp, - us->us_next_port_to_try, B_TRUE); - } - } else { - /* - * If the port is in the well-known privileged range, - * make sure the caller was privileged. - */ - int i; - boolean_t priv = B_FALSE; - - if (port < us->us_smallest_nonpriv_port) { - priv = B_TRUE; - } else { - for (i = 0; i < us->us_num_epriv_ports; i++) { - if (port == us->us_epriv_ports[i]) { - priv = B_TRUE; - break; - } - } - } - - if (priv) { - cred_t *cr = DB_CREDDEF(mp, connp->conn_cred); - - if (secpolicy_net_privaddr(cr, port, - IPPROTO_UDP) != 0) { - udp_err_ack(q, mp, TACCES, 0); - return; - } - } - } - - if (port == 0) { - udp_err_ack(q, mp, TNOADDR, 0); - return; - } - - /* - * The state must be TS_UNBND. TPI mandates that users must send - * TPI primitives only 1 at a time and wait for the response before - * sending the next primitive. 
- */ - rw_enter(&udp->udp_rwlock, RW_WRITER); - if (udp->udp_state != TS_UNBND || udp->udp_pending_op != -1) { - rw_exit(&udp->udp_rwlock); - (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, - "udp_bind: bad state, %u", udp->udp_state); - udp_err_ack(q, mp, TOUTSTATE, 0); - return; - } - udp->udp_pending_op = tbr->PRIM_type; - /* - * Copy the source address into our udp structure. This address - * may still be zero; if so, IP will fill in the correct address - * each time an outbound packet is passed to it. Since the udp is - * not yet in the bind hash list, we don't grab the uf_lock to - * change udp_ipversion - */ - if (udp->udp_family == AF_INET) { - ASSERT(sin != NULL); - ASSERT(udp->udp_ipversion == IPV4_VERSION); - udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + UDPH_SIZE + - udp->udp_ip_snd_options_len; - IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6src); - } else { - ASSERT(sin6 != NULL); - v6src = sin6->sin6_addr; - if (IN6_IS_ADDR_V4MAPPED(&v6src)) { - /* - * no need to hold the uf_lock to set the udp_ipversion - * since we are not yet in the fanout list - */ - udp->udp_ipversion = IPV4_VERSION; - udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + - UDPH_SIZE + udp->udp_ip_snd_options_len; - } else { - udp->udp_ipversion = IPV6_VERSION; - udp->udp_max_hdr_len = udp->udp_sticky_hdrs_len; - } - } - - /* - * If udp_reuseaddr is not set, then we have to make sure that - * the IP address and port number the application requested - * (or we selected for the application) is not being used by - * another stream. If another stream is already using the - * requested IP address and port, the behavior depends on - * "bind_to_req_port_only". If set the bind fails; otherwise we - * search for any an unused port to bind to the the stream. - * - * As per the BSD semantics, as modified by the Deering multicast - * changes, if udp_reuseaddr is set, then we allow multiple binds - * to the same port independent of the local IP address. 
- * - * This is slightly different than in SunOS 4.X which did not - * support IP multicast. Note that the change implemented by the - * Deering multicast code effects all binds - not only binding - * to IP multicast addresses. - * - * Note that when binding to port zero we ignore SO_REUSEADDR in - * order to guarantee a unique port. - */ - count = 0; - if (udp->udp_anon_priv_bind) { - /* - * loopmax = (IPPORT_RESERVED-1) - - * us->us_min_anonpriv_port + 1 - */ - loopmax = IPPORT_RESERVED - us->us_min_anonpriv_port; - } else { - loopmax = us->us_largest_anon_port - - us->us_smallest_anon_port + 1; - } - - is_inaddr_any = V6_OR_V4_INADDR_ANY(v6src); - zoneid = connp->conn_zoneid; - - for (;;) { - udp_t *udp1; - boolean_t found_exclbind = B_FALSE; - - /* - * Walk through the list of udp streams bound to - * requested port with the same IP address. - */ - lport = htons(port); - udpf = &us->us_bind_fanout[UDP_BIND_HASH(lport, - us->us_bind_fanout_size)]; - mutex_enter(&udpf->uf_lock); - for (udp1 = udpf->uf_udp; udp1 != NULL; - udp1 = udp1->udp_bind_hash) { - if (lport != udp1->udp_port) - continue; - - /* - * On a labeled system, we must treat bindings to ports - * on shared IP addresses by sockets with MAC exemption - * privilege as being in all zones, as there's - * otherwise no way to identify the right receiver. - */ - if (!(IPCL_ZONE_MATCH(udp1->udp_connp, zoneid) || - IPCL_ZONE_MATCH(connp, - udp1->udp_connp->conn_zoneid)) && - !connp->conn_mac_exempt && \ - !udp1->udp_connp->conn_mac_exempt) - continue; + cred_t *cr = DB_CREDDEF(mp, connp->conn_cred); + error = udp_do_bind(connp, sa, tbr->ADDR_length, cr, + tbr->PRIM_type != O_T_BIND_REQ); - /* - * If UDP_EXCLBIND is set for either the bound or - * binding endpoint, the semantics of bind - * is changed according to the following chart. - * - * spec = specified address (v4 or v6) - * unspec = unspecified address (v4 or v6) - * A = specified addresses are different for endpoints - * - * bound bind to allowed? 
- * ------------------------------------- - * unspec unspec no - * unspec spec no - * spec unspec no - * spec spec yes if A - * - * For labeled systems, SO_MAC_EXEMPT behaves the same - * as UDP_EXCLBIND, except that zoneid is ignored. - */ - if (udp1->udp_exclbind || udp->udp_exclbind || - udp1->udp_connp->conn_mac_exempt || - connp->conn_mac_exempt) { - if (V6_OR_V4_INADDR_ANY( - udp1->udp_bound_v6src) || - is_inaddr_any || - IN6_ARE_ADDR_EQUAL(&udp1->udp_bound_v6src, - &v6src)) { - found_exclbind = B_TRUE; - break; - } - continue; - } - - /* - * Check ipversion to allow IPv4 and IPv6 sockets to - * have disjoint port number spaces. - */ - if (udp->udp_ipversion != udp1->udp_ipversion) { - - /* - * On the first time through the loop, if the - * the user intentionally specified a - * particular port number, then ignore any - * bindings of the other protocol that may - * conflict. This allows the user to bind IPv6 - * alone and get both v4 and v6, or bind both - * both and get each seperately. On subsequent - * times through the loop, we're checking a - * port that we chose (not the user) and thus - * we do not allow casual duplicate bindings. - */ - if (count == 0 && requested_port != 0) - continue; - } - - /* - * No difference depending on SO_REUSEADDR. - * - * If existing port is bound to a - * non-wildcard IP address and - * the requesting stream is bound to - * a distinct different IP addresses - * (non-wildcard, also), keep going. - */ - if (!is_inaddr_any && - !V6_OR_V4_INADDR_ANY(udp1->udp_bound_v6src) && - !IN6_ARE_ADDR_EQUAL(&udp1->udp_bound_v6src, - &v6src)) { - continue; - } - break; - } - - if (!found_exclbind && - (udp->udp_reuseaddr && requested_port != 0)) { - break; - } - - if (udp1 == NULL) { - /* - * No other stream has this IP address - * and port number. We can use it. 
- */ - break; - } - mutex_exit(&udpf->uf_lock); - if (bind_to_req_port_only) { - /* - * We get here only when requested port - * is bound (and only first of the for() - * loop iteration). - * - * The semantics of this bind request - * require it to fail so we return from - * the routine (and exit the loop). - * - */ - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); - udp_err_ack(q, mp, TADDRBUSY, 0); - return; - } - - if (udp->udp_anon_priv_bind) { - port = udp_get_next_priv_port(udp); - } else { - if ((count == 0) && (requested_port != 0)) { - /* - * If the application wants us to find - * a port, get one to start with. Set - * requested_port to 0, so that we will - * update us->us_next_port_to_try below. - */ - port = udp_update_next_port(udp, - us->us_next_port_to_try, B_TRUE); - requested_port = 0; - } else { - port = udp_update_next_port(udp, port + 1, - B_FALSE); - } - } - - if (port == 0 || ++count >= loopmax) { - /* - * We've tried every possible port number and - * there are none available, so send an error - * to the user. - */ - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); - udp_err_ack(q, mp, TNOADDR, 0); - return; - } - } - - /* - * Copy the source address into our udp structure. This address - * may still be zero; if so, ip will fill in the correct address - * each time an outbound packet is passed to it. - * If we are binding to a broadcast or multicast address then - * udp_bind_ack will clear the source address when it receives - * the T_BIND_ACK. - */ - udp->udp_v6src = udp->udp_bound_v6src = v6src; - udp->udp_port = lport; - /* - * Now reset the the next anonymous port if the application requested - * an anonymous port, or we handed out the next anonymous port. - */ - if ((requested_port == 0) && (!udp->udp_anon_priv_bind)) { - us->us_next_port_to_try = port + 1; - } - - /* Initialize the O_T_BIND_REQ/T_BIND_REQ for ip. 
*/ - if (udp->udp_family == AF_INET) { - sin->sin_port = udp->udp_port; - } else { - int error; - - sin6->sin6_port = udp->udp_port; - /* Rebuild the header template */ - error = udp_build_hdrs(udp); - if (error != 0) { - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); - mutex_exit(&udpf->uf_lock); + if (error != 0) { + if (error > 0) { udp_err_ack(q, mp, TSYSERR, error); - return; - } - } - udp->udp_state = TS_IDLE; - udp_bind_hash_insert(udpf, udp); - mutex_exit(&udpf->uf_lock); - rw_exit(&udp->udp_rwlock); - - if (cl_inet_bind) { - /* - * Running in cluster mode - register bind information - */ - if (udp->udp_ipversion == IPV4_VERSION) { - (*cl_inet_bind)(IPPROTO_UDP, AF_INET, - (uint8_t *)(&V4_PART_OF_V6(udp->udp_v6src)), - (in_port_t)udp->udp_port); } else { - (*cl_inet_bind)(IPPROTO_UDP, AF_INET6, - (uint8_t *)&(udp->udp_v6src), - (in_port_t)udp->udp_port); + udp_err_ack(q, mp, -error, 0); } - - } - - connp->conn_anon_port = (is_system_labeled() && requested_port == 0); - if (is_system_labeled() && (!connp->conn_anon_port || - connp->conn_anon_mlp)) { - uint16_t mlpport; - cred_t *cr = connp->conn_cred; - zone_t *zone; - - zone = crgetzone(cr); - connp->conn_mlp_type = udp->udp_recvucred ? mlptBoth : - mlptSingle; - addrtype = tsol_mlp_addr_type(zone->zone_id, IPV6_VERSION, - &v6src, us->us_netstack->netstack_ip); - if (addrtype == mlptSingle) { - rw_enter(&udp->udp_rwlock, RW_WRITER); - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); - udp_err_ack(q, mp, TNOADDR, 0); - connp->conn_anon_port = B_FALSE; - connp->conn_mlp_type = mlptSingle; - return; - } - mlpport = connp->conn_anon_port ? 
PMAPPORT : port; - mlptype = tsol_mlp_port_type(zone, IPPROTO_UDP, mlpport, - addrtype); - if (mlptype != mlptSingle && - (connp->conn_mlp_type == mlptSingle || - secpolicy_net_bindmlp(cr) != 0)) { - if (udp->udp_debug) { - (void) strlog(UDP_MOD_ID, 0, 1, - SL_ERROR|SL_TRACE, - "udp_bind: no priv for multilevel port %d", - mlpport); - } - rw_enter(&udp->udp_rwlock, RW_WRITER); - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); - udp_err_ack(q, mp, TACCES, 0); - connp->conn_anon_port = B_FALSE; - connp->conn_mlp_type = mlptSingle; - return; - } - - /* - * If we're specifically binding a shared IP address and the - * port is MLP on shared addresses, then check to see if this - * zone actually owns the MLP. Reject if not. - */ - if (mlptype == mlptShared && addrtype == mlptShared) { - /* - * No need to handle exclusive-stack zones since - * ALL_ZONES only applies to the shared stack. - */ - zoneid_t mlpzone; - - mlpzone = tsol_mlp_findzone(IPPROTO_UDP, - htons(mlpport)); - if (connp->conn_zoneid != mlpzone) { - if (udp->udp_debug) { - (void) strlog(UDP_MOD_ID, 0, 1, - SL_ERROR|SL_TRACE, - "udp_bind: attempt to bind port " - "%d on shared addr in zone %d " - "(should be %d)", - mlpport, connp->conn_zoneid, - mlpzone); - } - rw_enter(&udp->udp_rwlock, RW_WRITER); - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); - udp_err_ack(q, mp, TACCES, 0); - connp->conn_anon_port = B_FALSE; - connp->conn_mlp_type = mlptSingle; - return; - } - } - if (connp->conn_anon_port) { - int error; - - error = tsol_mlp_anon(zone, mlptype, connp->conn_ulp, - port, B_TRUE); - if (error != 0) { - if (udp->udp_debug) { - (void) strlog(UDP_MOD_ID, 0, 1, - SL_ERROR|SL_TRACE, - "udp_bind: cannot establish anon " - "MLP for port %d", port); - } - rw_enter(&udp->udp_rwlock, RW_WRITER); - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); - udp_err_ack(q, mp, TACCES, 0); - connp->conn_anon_port = B_FALSE; - connp->conn_mlp_type = mlptSingle; - return; - } - } - connp->conn_mlp_type = 
mlptype; - } - - /* Pass the protocol number in the message following the address. */ - *mp->b_wptr++ = IPPROTO_UDP; - if (!V6_OR_V4_INADDR_ANY(udp->udp_v6src)) { - /* - * Append a request for an IRE if udp_v6src not - * zero (IPv4 - INADDR_ANY, or IPv6 - all-zeroes address). - */ - mp->b_cont = allocb(sizeof (ire_t), BPRI_HI); - if (!mp->b_cont) { - rw_enter(&udp->udp_rwlock, RW_WRITER); - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); - udp_err_ack(q, mp, TSYSERR, ENOMEM); - return; - } - mp->b_cont->b_wptr += sizeof (ire_t); - mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE; + } else { + tbr->PRIM_type = T_BIND_ACK; + qreply(q, mp); } - if (udp->udp_family == AF_INET6) - mp = ip_bind_v6(q, mp, connp, NULL); - else - mp = ip_bind_v4(q, mp, connp); - - /* The above return NULL if the bind needs to be deferred */ - if (mp != NULL) - udp_bind_result(connp, mp); - else - CONN_INC_REF(connp); -} - -/* - * This is called from ip_wput_nondata to handle the results of a - * deferred UDP bind. It is called once the bind has been completed. - */ -void -udp_resume_bind(conn_t *connp, mblk_t *mp) -{ - ASSERT(connp != NULL && IPCL_IS_UDP(connp)); - - udp_bind_result(connp, mp); - - CONN_OPER_PENDING_DONE(connp); } /* @@ -1174,32 +706,25 @@ udp_resume_bind(conn_t *connp, mblk_t *mp) * T_OK_ACK - for the T_CONN_REQ * T_CONN_CON - to keep the TPI user happy * - * The connect completes in udp_bind_result. + * The connect completes in udp_do_connect. * When a T_BIND_ACK is received information is extracted from the IRE * and the two appended messages are sent to the TPI user. * Should udp_bind_result receive T_ERROR_ACK for the T_BIND_REQ it will * convert it to an error ack for the appropriate primitive. 
*/ static void -udp_connect(queue_t *q, mblk_t *mp) +udp_tpi_connect(queue_t *q, mblk_t *mp) { - sin6_t *sin6; - sin_t *sin; + mblk_t *mp1; + udp_t *udp; + conn_t *connp = Q_TO_CONN(q); + int error; + socklen_t len; + struct sockaddr *sa; struct T_conn_req *tcr; - in6_addr_t v6dst; - ipaddr_t v4dst; - uint16_t dstport; - uint32_t flowinfo; - mblk_t *mp1, *mp2; - udp_fanout_t *udpf; - udp_t *udp, *udp1; - ushort_t ipversion; - udp_stack_t *us; - conn_t *connp = Q_TO_CONN(q); udp = connp->conn_udp; tcr = (struct T_conn_req *)mp->b_rptr; - us = udp->udp_us; /* A bit of sanity checking */ if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_req)) { @@ -1218,285 +743,87 @@ udp_connect(queue_t *q, mblk_t *mp) * Make sure that address family matches the type of * family of the the address passed down */ + len = tcr->DEST_length; switch (tcr->DEST_length) { default: udp_err_ack(q, mp, TBADADDR, 0); return; case sizeof (sin_t): - sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset, + sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset, sizeof (sin_t)); - if (sin == NULL || !OK_32PTR((char *)sin)) { - udp_err_ack(q, mp, TSYSERR, EINVAL); - return; - } - if (udp->udp_family != AF_INET || - sin->sin_family != AF_INET) { - udp_err_ack(q, mp, TSYSERR, EAFNOSUPPORT); - return; - } - v4dst = sin->sin_addr.s_addr; - dstport = sin->sin_port; - IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); - ASSERT(udp->udp_ipversion == IPV4_VERSION); - ipversion = IPV4_VERSION; break; case sizeof (sin6_t): - sin6 = (sin6_t *)mi_offset_param(mp, tcr->DEST_offset, + sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset, sizeof (sin6_t)); - if (sin6 == NULL || !OK_32PTR((char *)sin6)) { - udp_err_ack(q, mp, TSYSERR, EINVAL); - return; - } - if (udp->udp_family != AF_INET6 || - sin6->sin6_family != AF_INET6) { - udp_err_ack(q, mp, TSYSERR, EAFNOSUPPORT); - return; - } - v6dst = sin6->sin6_addr; - dstport = sin6->sin6_port; - if (IN6_IS_ADDR_V4MAPPED(&v6dst)) { - IN6_V4MAPPED_TO_IPADDR(&v6dst, 
v4dst); - ipversion = IPV4_VERSION; - flowinfo = 0; - } else { - ipversion = IPV6_VERSION; - flowinfo = sin6->sin6_flowinfo; - } break; } - if (dstport == 0) { - udp_err_ack(q, mp, TBADADDR, 0); - return; - } - - rw_enter(&udp->udp_rwlock, RW_WRITER); - /* - * This UDP must have bound to a port already before doing a connect. - * TPI mandates that users must send TPI primitives only 1 at a time - * and wait for the response before sending the next primitive. - */ - if (udp->udp_state == TS_UNBND || udp->udp_pending_op != -1) { - rw_exit(&udp->udp_rwlock); - (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, - "udp_connect: bad state, %u", udp->udp_state); - udp_err_ack(q, mp, TOUTSTATE, 0); + error = proto_verify_ip_addr(udp->udp_family, sa, len); + if (error != 0) { + udp_err_ack(q, mp, TSYSERR, error); return; } - udp->udp_pending_op = T_CONN_REQ; - ASSERT(udp->udp_port != 0 && udp->udp_ptpbhn != NULL); - - if (ipversion == IPV4_VERSION) { - udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + UDPH_SIZE + - udp->udp_ip_snd_options_len; - } else { - udp->udp_max_hdr_len = udp->udp_sticky_hdrs_len; - } - - udpf = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port, - us->us_bind_fanout_size)]; - - mutex_enter(&udpf->uf_lock); - if (udp->udp_state == TS_DATA_XFER) { - /* Already connected - clear out state */ - udp->udp_v6src = udp->udp_bound_v6src; - udp->udp_state = TS_IDLE; - } /* - * Create a default IP header with no IP options. + * We have to send a connection confirmation to + * keep TLI happy. */ - udp->udp_dstport = dstport; - udp->udp_ipversion = ipversion; - if (ipversion == IPV4_VERSION) { - /* - * Interpret a zero destination to mean loopback. - * Update the T_CONN_REQ (sin/sin6) since it is used to - * generate the T_CONN_CON. 
- */ - if (v4dst == INADDR_ANY) { - v4dst = htonl(INADDR_LOOPBACK); - IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); - if (udp->udp_family == AF_INET) { - sin->sin_addr.s_addr = v4dst; - } else { - sin6->sin6_addr = v6dst; - } - } - udp->udp_v6dst = v6dst; - udp->udp_flowinfo = 0; - - /* - * If the destination address is multicast and - * an outgoing multicast interface has been set, - * use the address of that interface as our - * source address if no source address has been set. - */ - if (V4_PART_OF_V6(udp->udp_v6src) == INADDR_ANY && - CLASSD(v4dst) && - udp->udp_multicast_if_addr != INADDR_ANY) { - IN6_IPADDR_TO_V4MAPPED(udp->udp_multicast_if_addr, - &udp->udp_v6src); - } + if (udp->udp_family == AF_INET) { + mp1 = mi_tpi_conn_con(NULL, (char *)sa, + sizeof (sin_t), NULL, 0); } else { - ASSERT(udp->udp_ipversion == IPV6_VERSION); - /* - * Interpret a zero destination to mean loopback. - * Update the T_CONN_REQ (sin/sin6) since it is used to - * generate the T_CONN_CON. - */ - if (IN6_IS_ADDR_UNSPECIFIED(&v6dst)) { - v6dst = ipv6_loopback; - sin6->sin6_addr = v6dst; - } - udp->udp_v6dst = v6dst; - udp->udp_flowinfo = flowinfo; - /* - * If the destination address is multicast and - * an outgoing multicast interface has been set, - * then the ip bind logic will pick the correct source - * address (i.e. matching the outgoing multicast interface). 
- */ + mp1 = mi_tpi_conn_con(NULL, (char *)sa, + sizeof (sin6_t), NULL, 0); } - - /* - * Verify that the src/port/dst/port is unique for all - * connections in TS_DATA_XFER - */ - for (udp1 = udpf->uf_udp; udp1 != NULL; udp1 = udp1->udp_bind_hash) { - if (udp1->udp_state != TS_DATA_XFER) - continue; - if (udp->udp_port != udp1->udp_port || - udp->udp_ipversion != udp1->udp_ipversion || - dstport != udp1->udp_dstport || - !IN6_ARE_ADDR_EQUAL(&udp->udp_v6src, &udp1->udp_v6src) || - !IN6_ARE_ADDR_EQUAL(&v6dst, &udp1->udp_v6dst) || - !(IPCL_ZONE_MATCH(udp->udp_connp, - udp1->udp_connp->conn_zoneid) || - IPCL_ZONE_MATCH(udp1->udp_connp, - udp->udp_connp->conn_zoneid))) - continue; - mutex_exit(&udpf->uf_lock); - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); - udp_err_ack(q, mp, TBADADDR, 0); - return; - } - udp->udp_state = TS_DATA_XFER; - mutex_exit(&udpf->uf_lock); - - /* - * Send down bind to IP to verify that there is a route - * and to determine the source address. - * This will come back as T_BIND_ACK with an IRE_DB_TYPE in rput. - */ - if (udp->udp_family == AF_INET) - mp1 = udp_ip_bind_mp(udp, O_T_BIND_REQ, sizeof (ipa_conn_t)); - else - mp1 = udp_ip_bind_mp(udp, O_T_BIND_REQ, sizeof (ipa6_conn_t)); if (mp1 == NULL) { -bind_failed: - mutex_enter(&udpf->uf_lock); - udp->udp_state = TS_IDLE; - udp->udp_pending_op = -1; - mutex_exit(&udpf->uf_lock); - rw_exit(&udp->udp_rwlock); udp_err_ack(q, mp, TSYSERR, ENOMEM); return; } - rw_exit(&udp->udp_rwlock); /* - * We also have to send a connection confirmation to - * keep TLI happy. Prepare it for udp_bind_result. 
+ * ok_ack for T_CONN_REQ */ - if (udp->udp_family == AF_INET) - mp2 = mi_tpi_conn_con(NULL, (char *)sin, - sizeof (*sin), NULL, 0); - else - mp2 = mi_tpi_conn_con(NULL, (char *)sin6, - sizeof (*sin6), NULL, 0); - if (mp2 == NULL) { - freemsg(mp1); - rw_enter(&udp->udp_rwlock, RW_WRITER); - goto bind_failed; - } - mp = mi_tpi_ok_ack_alloc(mp); if (mp == NULL) { /* Unable to reuse the T_CONN_REQ for the ack. */ - freemsg(mp2); - rw_enter(&udp->udp_rwlock, RW_WRITER); - mutex_enter(&udpf->uf_lock); - udp->udp_state = TS_IDLE; - udp->udp_pending_op = -1; - mutex_exit(&udpf->uf_lock); - rw_exit(&udp->udp_rwlock); + freemsg(mp1); udp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM); return; } - /* Hang onto the T_OK_ACK and T_CONN_CON for later. */ - linkb(mp1, mp); - linkb(mp1, mp2); - - mblk_setcred(mp1, connp->conn_cred); - if (udp->udp_family == AF_INET) - mp1 = ip_bind_v4(q, mp1, connp); - else - mp1 = ip_bind_v6(q, mp1, connp, NULL); - - /* The above return NULL if the bind needs to be deferred */ - if (mp1 != NULL) - udp_bind_result(connp, mp1); - else - CONN_INC_REF(connp); + error = udp_do_connect(connp, sa, len); + if (error != 0) { + freeb(mp1); + if (error < 0) + udp_err_ack(q, mp, -error, 0); + else + udp_err_ack(q, mp, TSYSERR, error); + } else { + putnext(connp->conn_rq, mp); + putnext(connp->conn_rq, mp1); + } } static int -udp_close(queue_t *q) +udp_tpi_close(queue_t *q, int flags) { - conn_t *connp = (conn_t *)q->q_ptr; - udp_t *udp; - - ASSERT(connp != NULL && IPCL_IS_UDP(connp)); - udp = connp->conn_udp; - - udp_quiesce_conn(connp); - ip_quiesce_conn(connp); - /* - * Disable read-side synchronous stream - * interface and drain any queued data. 
- */ - udp_rcv_drain(q, udp, B_TRUE); - ASSERT(!udp->udp_direct_sockfs); - - qprocsoff(q); - - ASSERT(udp->udp_rcv_cnt == 0); - ASSERT(udp->udp_rcv_msgcnt == 0); - ASSERT(udp->udp_rcv_list_head == NULL); - ASSERT(udp->udp_rcv_list_tail == NULL); - - udp_close_free(connp); + conn_t *connp; - /* - * Now we are truly single threaded on this stream, and can - * delete the things hanging off the connp, and finally the connp. - * We removed this connp from the fanout list, it cannot be - * accessed thru the fanouts, and we already waited for the - * conn_ref to drop to 0. We are already in close, so - * there cannot be any other thread from the top. qprocsoff - * has completed, and service has completed or won't run in - * future. - */ - ASSERT(connp->conn_ref == 1); - inet_minor_free(connp->conn_minor_arena, connp->conn_dev); - connp->conn_ref--; - ipcl_conn_destroy(connp); + if (flags & SO_FALLBACK) { + /* + * stream is being closed while in fallback + * simply free the resources that were allocated + */ + inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr)); + qprocsoff(q); + goto done; + } + connp = Q_TO_CONN(q); + udp_do_close(connp); +done: q->q_ptr = WR(q)->q_ptr = NULL; return (0); } @@ -1567,39 +894,21 @@ udp_close_free(conn_t *connp) udp->udp_connp = connp; } -/* - * This routine handles each T_DISCON_REQ message passed to udp - * as an indicating that UDP is no longer connected. This results - * in sending a T_BIND_REQ to IP to restore the binding to just - * the local address/port. - * - * This routine sends down a T_BIND_REQ to IP with the following mblks: - * T_BIND_REQ - specifying just the local address/port - * T_OK_ACK - for the T_DISCON_REQ - * - * The disconnect completes in udp_bind_result. - * When a T_BIND_ACK is received the appended T_OK_ACK is sent to the TPI user. - * Should udp_bind_result receive T_ERROR_ACK for the T_BIND_REQ it will - * convert it to an error ack for the appropriate primitive. 
- */ -static void -udp_disconnect(queue_t *q, mblk_t *mp) +static int +udp_do_disconnect(conn_t *connp) { udp_t *udp; - mblk_t *mp1; + mblk_t *ire_mp; udp_fanout_t *udpf; udp_stack_t *us; - conn_t *connp = Q_TO_CONN(q); + int error; udp = connp->conn_udp; us = udp->udp_us; rw_enter(&udp->udp_rwlock, RW_WRITER); if (udp->udp_state != TS_DATA_XFER || udp->udp_pending_op != -1) { rw_exit(&udp->udp_rwlock); - (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, - "udp_disconnect: bad state, %u", udp->udp_state); - udp_err_ack(q, mp, TOUTSTATE, 0); - return; + return (-TOUTSTATE); } udp->udp_pending_op = T_DISCON_REQ; udpf = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port, @@ -1609,57 +918,85 @@ udp_disconnect(queue_t *q, mblk_t *mp) udp->udp_state = TS_IDLE; mutex_exit(&udpf->uf_lock); - /* - * Send down bind to IP to remove the full binding and revert - * to the local address binding. - */ - if (udp->udp_family == AF_INET) - mp1 = udp_ip_bind_mp(udp, O_T_BIND_REQ, sizeof (sin_t)); - else - mp1 = udp_ip_bind_mp(udp, O_T_BIND_REQ, sizeof (sin6_t)); - if (mp1 == NULL) { + if (udp->udp_family == AF_INET6) { + /* Rebuild the header template */ + error = udp_build_hdrs(udp); + if (error != 0) { + udp->udp_pending_op = -1; + rw_exit(&udp->udp_rwlock); + return (error); + } + } + + ire_mp = allocb(sizeof (ire_t), BPRI_HI); + if (ire_mp == NULL) { + mutex_enter(&udpf->uf_lock); udp->udp_pending_op = -1; + mutex_exit(&udpf->uf_lock); rw_exit(&udp->udp_rwlock); - udp_err_ack(q, mp, TSYSERR, ENOMEM); - return; + return (ENOMEM); } - mp = mi_tpi_ok_ack_alloc(mp); + + rw_exit(&udp->udp_rwlock); + + if (udp->udp_family == AF_INET6) { + error = ip_proto_bind_laddr_v6(connp, &ire_mp, IPPROTO_UDP, + &udp->udp_bound_v6src, udp->udp_port, B_TRUE); + } else { + error = ip_proto_bind_laddr_v4(connp, &ire_mp, IPPROTO_UDP, + V4_PART_OF_V6(udp->udp_bound_v6src), udp->udp_port, B_TRUE); + } + + return (udp_post_ip_bind_connect(udp, ire_mp, error)); +} + + +static void +udp_tpi_disconnect(queue_t *q, 
mblk_t *mp) +{ + conn_t *connp = Q_TO_CONN(q); + int error; + + /* + * Allocate the largest primitive we need to send back + * T_error_ack is > than T_ok_ack + */ + mp = reallocb(mp, sizeof (struct T_error_ack), 1); if (mp == NULL) { /* Unable to reuse the T_DISCON_REQ for the ack. */ - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); - udp_err_ack_prim(q, mp1, T_DISCON_REQ, TSYSERR, ENOMEM); + udp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, ENOMEM); return; } - if (udp->udp_family == AF_INET6) { - int error; + error = udp_do_disconnect(connp); - /* Rebuild the header template */ - error = udp_build_hdrs(udp); - if (error != 0) { - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); - udp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, error); - freemsg(mp1); - return; + if (error != 0) { + if (error < 0) { + udp_err_ack(q, mp, -error, 0); + } else { + udp_err_ack(q, mp, TSYSERR, error); } + } else { + mp = mi_tpi_ok_ack_alloc(mp); + ASSERT(mp != NULL); + qreply(q, mp); } +} - rw_exit(&udp->udp_rwlock); - /* Append the T_OK_ACK to the T_BIND_REQ for udp_bind_ack */ - linkb(mp1, mp); +int +udp_disconnect(conn_t *connp) +{ + int error; + udp_t *udp = connp->conn_udp; - if (udp->udp_family == AF_INET6) - mp1 = ip_bind_v6(q, mp1, connp, NULL); - else - mp1 = ip_bind_v4(q, mp1, connp); + udp->udp_dgram_errind = B_FALSE; - /* The above return NULL if the bind needs to be deferred */ - if (mp1 != NULL) - udp_bind_result(connp, mp1); - else - CONN_INC_REF(connp); + error = udp_do_disconnect(connp); + + if (error < 0) + error = proto_tlitosyserr(-error); + + return (error); } /* This routine creates a T_ERROR_ACK message and passes it upstream. */ @@ -1783,8 +1120,8 @@ udp_extra_priv_ports_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp, * Assumes that IP has pulled up everything up to and including the ICMP header. 
*/ static void -udp_icmp_error(queue_t *q, mblk_t *mp) -{ +udp_icmp_error(conn_t *connp, mblk_t *mp) + { icmph_t *icmph; ipha_t *ipha; int iph_hdr_length; @@ -1793,15 +1130,16 @@ udp_icmp_error(queue_t *q, mblk_t *mp) sin6_t sin6; mblk_t *mp1; int error = 0; - udp_t *udp = Q_TO_UDP(q); + udp_t *udp = connp->conn_udp; + mp1 = NULL; ipha = (ipha_t *)mp->b_rptr; ASSERT(OK_32PTR(mp->b_rptr)); if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) { ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION); - udp_icmp_error_ipv6(q, mp); + udp_icmp_error_ipv6(connp, mp); return; } ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); @@ -1850,27 +1188,66 @@ udp_icmp_error(queue_t *q, mblk_t *mp) return; } + switch (udp->udp_family) { case AF_INET: sin = sin_null; sin.sin_family = AF_INET; sin.sin_addr.s_addr = ipha->ipha_dst; sin.sin_port = udpha->uha_dst_port; - mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL, 0, - error); + if (IPCL_IS_NONSTR(connp)) { + rw_enter(&udp->udp_rwlock, RW_WRITER); + if (udp->udp_state == TS_DATA_XFER) { + if (sin.sin_port == udp->udp_dstport && + sin.sin_addr.s_addr == + V4_PART_OF_V6(udp->udp_v6dst)) { + + rw_exit(&udp->udp_rwlock); + (*connp->conn_upcalls->su_set_error) + (connp->conn_upper_handle, error); + goto done; + } + } else { + udp->udp_delayed_error = error; + *((sin_t *)&udp->udp_delayed_addr) = sin; + } + rw_exit(&udp->udp_rwlock); + } else { + mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), + NULL, 0, error); + } break; case AF_INET6: sin6 = sin6_null; sin6.sin6_family = AF_INET6; IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &sin6.sin6_addr); sin6.sin6_port = udpha->uha_dst_port; + if (IPCL_IS_NONSTR(connp)) { + rw_enter(&udp->udp_rwlock, RW_WRITER); + if (udp->udp_state == TS_DATA_XFER) { + if (sin6.sin6_port == udp->udp_dstport && + IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr, + &udp->udp_v6dst)) { + rw_exit(&udp->udp_rwlock); + (*connp->conn_upcalls->su_set_error) + (connp->conn_upper_handle, error); + goto done; + } + } else { + 
udp->udp_delayed_error = error; + *((sin6_t *)&udp->udp_delayed_addr) = sin6; + } + rw_exit(&udp->udp_rwlock); + } else { - mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t), - NULL, 0, error); + mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t), + NULL, 0, error); + } break; } - if (mp1) - putnext(q, mp1); + if (mp1 != NULL) + putnext(connp->conn_rq, mp1); +done: freemsg(mp); } @@ -1881,7 +1258,7 @@ udp_icmp_error(queue_t *q, mblk_t *mp) * ICMPv6 header. */ static void -udp_icmp_error_ipv6(queue_t *q, mblk_t *mp) +udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) { icmp6_t *icmp6; ip6_t *ip6h, *outer_ip6h; @@ -1891,7 +1268,7 @@ udp_icmp_error_ipv6(queue_t *q, mblk_t *mp) sin6_t sin6; mblk_t *mp1; int error = 0; - udp_t *udp = Q_TO_UDP(q); + udp_t *udp = connp->conn_udp; udp_stack_t *us = udp->udp_us; outer_ip6h = (ip6_t *)mp->b_rptr; @@ -1982,7 +1359,13 @@ udp_icmp_error_ipv6(queue_t *q, mblk_t *mp) * message. Free it, then send our empty message. */ freemsg(mp); - putnext(q, newmp); + if (!IPCL_IS_NONSTR(connp)) { + putnext(connp->conn_rq, newmp); + } else { + (*connp->conn_upcalls->su_recv) + (connp->conn_upper_handle, newmp, 0, 0, &error, + NULL); + } return; } case ICMP6_TIME_EXCEEDED: @@ -2018,10 +1401,30 @@ udp_icmp_error_ipv6(queue_t *q, mblk_t *mp) sin6.sin6_port = udpha->uha_dst_port; sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; - mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t), NULL, 0, - error); - if (mp1) - putnext(q, mp1); + if (IPCL_IS_NONSTR(connp)) { + rw_enter(&udp->udp_rwlock, RW_WRITER); + if (udp->udp_state == TS_DATA_XFER) { + if (sin6.sin6_port == udp->udp_dstport && + IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr, + &udp->udp_v6dst)) { + rw_exit(&udp->udp_rwlock); + (*connp->conn_upcalls->su_set_error) + (connp->conn_upper_handle, error); + goto done; + } + } else { + udp->udp_delayed_error = error; + *((sin6_t *)&udp->udp_delayed_addr) = sin6; + } + rw_exit(&udp->udp_rwlock); + } else { + mp1 = 
mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t), + NULL, 0, error); + if (mp1 != NULL) + putnext(connp->conn_rq, mp1); + } + +done: freemsg(mp); } @@ -2166,6 +1569,18 @@ udp_copy_info(struct T_info_ack *tap, udp_t *udp) tap->OPT_size = udp_max_optsize; } +static void +udp_do_capability_ack(udp_t *udp, struct T_capability_ack *tcap, + t_uscalar_t cap_bits1) +{ + tcap->CAP_bits1 = 0; + + if (cap_bits1 & TC1_INFO) { + udp_copy_info(&tcap->INFO_ack, udp); + tcap->CAP_bits1 |= TC1_INFO; + } +} + /* * This routine responds to T_CAPABILITY_REQ messages. It is called by * udp_wput. Much of the T_CAPABILITY_ACK information is copied from @@ -2187,12 +1602,7 @@ udp_capability_req(queue_t *q, mblk_t *mp) return; tcap = (struct T_capability_ack *)mp->b_rptr; - tcap->CAP_bits1 = 0; - - if (cap_bits1 & TC1_INFO) { - udp_copy_info(&tcap->INFO_ack, udp); - tcap->CAP_bits1 |= TC1_INFO; - } + udp_do_capability_ack(udp, tcap, cap_bits1); qreply(q, mp); } @@ -2378,12 +1788,10 @@ static int udp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, boolean_t isv6) { - int err; + int error; udp_t *udp; conn_t *connp; dev_t conn_dev; - zoneid_t zoneid; - netstack_t *ns; udp_stack_t *us; vmem_t *minor_arena; @@ -2396,20 +1804,6 @@ udp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, if (sflag == MODOPEN) return (EINVAL); - ns = netstack_find_by_cred(credp); - ASSERT(ns != NULL); - us = ns->netstack_udp; - ASSERT(us != NULL); - - /* - * For exclusive stacks we set the zoneid to zero - * to make UDP operate as if in the global zone. - */ - if (ns->netstack_stackid != GLOBAL_NETSTACKID) - zoneid = GLOBAL_ZONEID; - else - zoneid = crgetzoneid(credp); - if ((ip_minor_arena_la != NULL) && (flag & SO_SOCKSTR) && ((conn_dev = inet_minor_alloc(ip_minor_arena_la)) != 0)) { minor_arena = ip_minor_arena_la; @@ -2419,25 +1813,34 @@ udp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, * or a non socket application is doing the open. 
* Try to allocate from the small arena. */ - if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) { - netstack_rele(ns); + if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) return (EBUSY); - } + minor_arena = ip_minor_arena_sa; } - *devp = makedevice(getemajor(*devp), (minor_t)conn_dev); + if (flag & SO_FALLBACK) { + /* + * Non streams socket needs a stream to fallback to + */ + RD(q)->q_ptr = (void *)conn_dev; + WR(q)->q_qinfo = &udp_fallback_sock_winit; + WR(q)->q_ptr = (void *)minor_arena; + qprocson(q); + return (0); + } - connp = ipcl_conn_create(IPCL_UDPCONN, KM_SLEEP, ns); - connp->conn_dev = conn_dev; - connp->conn_minor_arena = minor_arena; + connp = udp_do_open(credp, isv6, KM_SLEEP); + if (connp == NULL) { + inet_minor_free(minor_arena, conn_dev); + return (ENOMEM); + } udp = connp->conn_udp; + us = udp->udp_us; - /* - * ipcl_conn_create did a netstack_hold. Undo the hold that was - * done by netstack_find_by_cred() - */ - netstack_rele(ns); + *devp = makedevice(getemajor(*devp), (minor_t)conn_dev); + connp->conn_dev = conn_dev; + connp->conn_minor_arena = minor_arena; /* * Initialize the udp_t structure for this stream. @@ -2452,79 +1855,39 @@ udp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, ASSERT(connp->conn_udp == udp); ASSERT(udp->udp_connp == connp); - /* Set the initial state of the stream and the privilege status. 
*/ - udp->udp_state = TS_UNBND; - if (isv6) { - udp->udp_family = AF_INET6; - udp->udp_ipversion = IPV6_VERSION; - udp->udp_max_hdr_len = IPV6_HDR_LEN + UDPH_SIZE; - udp->udp_ttl = us->us_ipv6_hoplimit; - connp->conn_af_isv6 = B_TRUE; - connp->conn_flags |= IPCL_ISV6; - } else { - udp->udp_family = AF_INET; - udp->udp_ipversion = IPV4_VERSION; - udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + UDPH_SIZE; - udp->udp_ttl = us->us_ipv4_ttl; - connp->conn_af_isv6 = B_FALSE; - connp->conn_flags &= ~IPCL_ISV6; - } - - udp->udp_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; - udp->udp_pending_op = -1; - connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; - connp->conn_zoneid = zoneid; - - udp->udp_open_time = lbolt64; - udp->udp_open_pid = curproc->p_pid; - - /* - * If the caller has the process-wide flag set, then default to MAC - * exempt mode. This allows read-down to unlabeled hosts. - */ - if (getpflags(NET_MAC_AWARE, credp) != 0) - connp->conn_mac_exempt = B_TRUE; - if (flag & SO_SOCKSTR) { connp->conn_flags |= IPCL_SOCKET; udp->udp_issocket = B_TRUE; udp->udp_direct_sockfs = B_TRUE; } - connp->conn_ulp_labeled = is_system_labeled(); - - udp->udp_us = us; - q->q_hiwat = us->us_recv_hiwat; WR(q)->q_hiwat = us->us_xmit_hiwat; WR(q)->q_lowat = us->us_xmit_lowat; - connp->conn_recv = udp_input; - crhold(credp); - connp->conn_cred = credp; - - mutex_enter(&connp->conn_lock); - connp->conn_state_flags &= ~CONN_INCIPIENT; - mutex_exit(&connp->conn_lock); - qprocson(q); if (udp->udp_family == AF_INET6) { /* Build initial header template for transmit */ - if ((err = udp_build_hdrs(udp)) != 0) { + if ((error = udp_build_hdrs(udp)) != 0) { rw_exit(&udp->udp_rwlock); qprocsoff(q); + inet_minor_free(minor_arena, conn_dev); ipcl_conn_destroy(connp); - return (err); + return (error); } } rw_exit(&udp->udp_rwlock); /* Set the Stream head write offset and high watermark. 
*/ - (void) mi_set_sth_wroff(q, + (void) proto_set_tx_wroff(q, connp, udp->udp_max_hdr_len + us->us_wroff_extra); - (void) mi_set_sth_hiwat(q, udp_set_rcv_hiwat(udp, q->q_hiwat)); + /* XXX udp_set_rcv_hiwat() doesn't hold the lock, is it a bug??? */ + (void) proto_set_rx_hiwat(q, connp, udp_set_rcv_hiwat(udp, q->q_hiwat)); + mutex_enter(&connp->conn_lock); + connp->conn_state_flags &= ~CONN_INCIPIENT; + mutex_exit(&connp->conn_lock); return (0); } @@ -2582,21 +1945,16 @@ udp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) * This routine retrieves the current status of socket options. * It returns the size of the option retrieved. */ -int -udp_opt_get_locked(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) +static int +udp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) { - int *i1 = (int *)ptr; - conn_t *connp; - udp_t *udp; - ip6_pkt_t *ipp; - int len; - udp_stack_t *us; - - connp = Q_TO_CONN(q); - udp = connp->conn_udp; - ipp = &udp->udp_sticky_ipp; - us = udp->udp_us; + udp_t *udp = connp->conn_udp; + udp_stack_t *us = udp->udp_us; + int *i1 = (int *)ptr; + ip6_pkt_t *ipp = &udp->udp_sticky_ipp; + int len; + ASSERT(RW_READ_HELD(&udp->udp_rwlock)); switch (level) { case SOL_SOCKET: switch (name) { @@ -2625,10 +1983,10 @@ udp_opt_get_locked(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) break; /* goto sizeof (int) option return */ case SO_SNDBUF: - *i1 = q->q_hiwat; + *i1 = udp->udp_xmit_hiwat; break; /* goto sizeof (int) option return */ case SO_RCVBUF: - *i1 = RD(q)->q_hiwat; + *i1 = udp->udp_rcv_disply_hiwat; break; /* goto sizeof (int) option return */ case SO_DGRAM_ERRIND: *i1 = udp->udp_dgram_errind; @@ -2907,15 +2265,15 @@ udp_opt_get_locked(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) } int -udp_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) +udp_tpi_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) { - udp_t *udp; + udp_t *udp; int err; 
udp = Q_TO_UDP(q); rw_enter(&udp->udp_rwlock, RW_READER); - err = udp_opt_get_locked(q, level, name, ptr); + err = udp_opt_get(Q_TO_CONN(q), level, name, ptr); rw_exit(&udp->udp_rwlock); return (err); } @@ -2924,83 +2282,34 @@ udp_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) * This routine sets socket options. */ /* ARGSUSED */ -int -udp_opt_set_locked(queue_t *q, uint_t optset_context, int level, - int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, - uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk) +static int +udp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen, + uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, cred_t *cr, + void *thisdg_attrs, boolean_t checkonly) { udpattrs_t *attrs = thisdg_attrs; int *i1 = (int *)invalp; boolean_t onoff = (*i1 == 0) ? 0 : 1; - boolean_t checkonly; + udp_t *udp = connp->conn_udp; + udp_stack_t *us = udp->udp_us; int error; - conn_t *connp; - udp_t *udp; uint_t newlen; - udp_stack_t *us; size_t sth_wroff; - connp = Q_TO_CONN(q); - udp = connp->conn_udp; - us = udp->udp_us; - - switch (optset_context) { - case SETFN_OPTCOM_CHECKONLY: - checkonly = B_TRUE; - /* - * Note: Implies T_CHECK semantics for T_OPTCOM_REQ - * inlen != 0 implies value supplied and - * we have to "pretend" to set it. - * inlen == 0 implies that there is no - * value part in T_CHECK request and just validation - * done elsewhere should be enough, we just return here. - */ - if (inlen == 0) { - *outlenp = 0; - return (0); - } - break; - case SETFN_OPTCOM_NEGOTIATE: - checkonly = B_FALSE; - break; - case SETFN_UD_NEGOTIATE: - case SETFN_CONN_NEGOTIATE: - checkonly = B_FALSE; - /* - * Negotiating local and "association-related" options - * through T_UNITDATA_REQ. - * - * Following routine can filter out ones we do not - * want to be "set" this way. 
- */ - if (!udp_opt_allow_udr_set(level, name)) { - *outlenp = 0; - return (EINVAL); - } - break; - default: - /* - * We should never get here - */ - *outlenp = 0; - return (EINVAL); - } - - ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || - (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); - + ASSERT(RW_WRITE_HELD(&udp->udp_rwlock)); /* * For fixed length options, no sanity check * of passed in length is done. It is assumed *_optcom_req() * routines do the right thing. */ - switch (level) { case SOL_SOCKET: switch (name) { case SO_REUSEADDR: - if (!checkonly) + if (!checkonly) { udp->udp_reuseaddr = onoff; + PASS_OPT_TO_IP(connp); + } break; case SO_DEBUG: if (!checkonly) @@ -3011,16 +2320,22 @@ udp_opt_set_locked(queue_t *q, uint_t optset_context, int level, * but are only meaningful to IP. */ case SO_DONTROUTE: - if (!checkonly) + if (!checkonly) { udp->udp_dontroute = onoff; + PASS_OPT_TO_IP(connp); + } break; case SO_USELOOPBACK: - if (!checkonly) + if (!checkonly) { udp->udp_useloopback = onoff; + PASS_OPT_TO_IP(connp); + } break; case SO_BROADCAST: - if (!checkonly) + if (!checkonly) { udp->udp_broadcast = onoff; + PASS_OPT_TO_IP(connp); + } break; case SO_SNDBUF: @@ -3029,7 +2344,8 @@ udp_opt_set_locked(queue_t *q, uint_t optset_context, int level, return (ENOBUFS); } if (!checkonly) { - q->q_hiwat = *i1; + udp->udp_xmit_hiwat = *i1; + connp->conn_wq->q_hiwat = *i1; } break; case SO_RCVBUF: @@ -3038,10 +2354,13 @@ udp_opt_set_locked(queue_t *q, uint_t optset_context, int level, return (ENOBUFS); } if (!checkonly) { - RD(q)->q_hiwat = *i1; + int size; + + udp->udp_rcv_disply_hiwat = *i1; + size = udp_set_rcv_hiwat(udp, *i1); rw_exit(&udp->udp_rwlock); - (void) mi_set_sth_hiwat(RD(q), - udp_set_rcv_hiwat(udp, *i1)); + (void) proto_set_rx_hiwat(connp->conn_rq, connp, + size); rw_enter(&udp->udp_rwlock, RW_WRITER); } break; @@ -3065,11 +2384,20 @@ udp_opt_set_locked(queue_t *q, uint_t optset_context, int level, udp->udp_timestamp = onoff; break; 
case SO_ANON_MLP: - /* Pass option along to IP level for handling */ - return (-EINVAL); + if (!checkonly) { + connp->conn_anon_mlp = onoff; + PASS_OPT_TO_IP(connp); + } + break; case SO_MAC_EXEMPT: - /* Pass option along to IP level for handling */ - return (-EINVAL); + if (secpolicy_net_mac_aware(cr) != 0 || + udp->udp_state != TS_UNBND) + return (EACCES); + if (!checkonly) { + connp->conn_mac_exempt = onoff; + PASS_OPT_TO_IP(connp); + } + break; case SCM_UCRED: { struct ucred_s *ucr; cred_t *cr, *newcr; @@ -3149,7 +2477,8 @@ udp_opt_set_locked(queue_t *q, uint_t optset_context, int level, UDPH_SIZE + udp->udp_ip_snd_options_len; sth_wroff = udp->udp_max_hdr_len + us->us_wroff_extra; rw_exit(&udp->udp_rwlock); - (void) mi_set_sth_wroff(RD(q), sth_wroff); + (void) proto_set_tx_wroff(connp->conn_rq, connp, + sth_wroff); rw_enter(&udp->udp_rwlock, RW_WRITER); break; @@ -3173,6 +2502,7 @@ udp_opt_set_locked(queue_t *q, uint_t optset_context, int level, if (!checkonly) { udp->udp_multicast_if_addr = inap->s_addr; + PASS_OPT_TO_IP(connp); } break; } @@ -3181,8 +2511,10 @@ udp_opt_set_locked(queue_t *q, uint_t optset_context, int level, udp->udp_multicast_ttl = *invalp; break; case IP_MULTICAST_LOOP: - if (!checkonly) + if (!checkonly) { connp->conn_multicast_loop = *invalp; + PASS_OPT_TO_IP(connp); + } break; case IP_RECVOPTS: if (!checkonly) @@ -3193,12 +2525,16 @@ udp_opt_set_locked(queue_t *q, uint_t optset_context, int level, udp->udp_recvdstaddr = onoff; break; case IP_RECVIF: - if (!checkonly) + if (!checkonly) { udp->udp_recvif = onoff; + PASS_OPT_TO_IP(connp); + } break; case IP_RECVSLLA: - if (!checkonly) + if (!checkonly) { udp->udp_recvslla = onoff; + PASS_OPT_TO_IP(connp); + } break; case IP_RECVTTL: if (!checkonly) @@ -3278,12 +2614,16 @@ udp_opt_set_locked(queue_t *q, uint_t optset_context, int level, */ return (-EINVAL); case IP_BOUND_IF: - if (!checkonly) + if (!checkonly) { udp->udp_bound_if = *i1; + PASS_OPT_TO_IP(connp); + } break; case 
IP_UNSPEC_SRC: - if (!checkonly) + if (!checkonly) { udp->udp_unspec_source = onoff; + PASS_OPT_TO_IP(connp); + } break; case IP_BROADCAST_TTL: if (!checkonly) @@ -3315,8 +2655,10 @@ udp_opt_set_locked(queue_t *q, uint_t optset_context, int level, switch (name) { case IPV6_MULTICAST_IF: - if (!checkonly) + if (!checkonly) { udp->udp_multicast_if_index = *i1; + PASS_OPT_TO_IP(connp); + } break; case IPV6_UNICAST_HOPS: /* -1 means use default */ @@ -3371,8 +2713,10 @@ udp_opt_set_locked(queue_t *q, uint_t optset_context, int level, *outlenp = 0; return (EINVAL); } - if (!checkonly) + if (!checkonly) { connp->conn_multicast_loop = *i1; + PASS_OPT_TO_IP(connp); + } break; case IPV6_JOIN_GROUP: case IPV6_LEAVE_GROUP: @@ -3389,53 +2733,71 @@ udp_opt_set_locked(queue_t *q, uint_t optset_context, int level, */ return (-EINVAL); case IPV6_BOUND_IF: - if (!checkonly) + if (!checkonly) { udp->udp_bound_if = *i1; + PASS_OPT_TO_IP(connp); + } break; case IPV6_UNSPEC_SRC: - if (!checkonly) + if (!checkonly) { udp->udp_unspec_source = onoff; + PASS_OPT_TO_IP(connp); + } break; /* * Set boolean switches for ancillary data delivery */ case IPV6_RECVPKTINFO: - if (!checkonly) + if (!checkonly) { udp->udp_ip_recvpktinfo = onoff; + PASS_OPT_TO_IP(connp); + } break; case IPV6_RECVTCLASS: if (!checkonly) { udp->udp_ipv6_recvtclass = onoff; + PASS_OPT_TO_IP(connp); } break; case IPV6_RECVPATHMTU: if (!checkonly) { udp->udp_ipv6_recvpathmtu = onoff; + PASS_OPT_TO_IP(connp); } break; case IPV6_RECVHOPLIMIT: - if (!checkonly) + if (!checkonly) { udp->udp_ipv6_recvhoplimit = onoff; + PASS_OPT_TO_IP(connp); + } break; case IPV6_RECVHOPOPTS: - if (!checkonly) + if (!checkonly) { udp->udp_ipv6_recvhopopts = onoff; + PASS_OPT_TO_IP(connp); + } break; case IPV6_RECVDSTOPTS: - if (!checkonly) + if (!checkonly) { udp->udp_ipv6_recvdstopts = onoff; + PASS_OPT_TO_IP(connp); + } break; case _OLD_IPV6_RECVDSTOPTS: if (!checkonly) udp->udp_old_ipv6_recvdstopts = onoff; break; case IPV6_RECVRTHDRDSTOPTS: 
- if (!checkonly) + if (!checkonly) { udp->udp_ipv6_recvrthdrdstopts = onoff; + PASS_OPT_TO_IP(connp); + } break; case IPV6_RECVRTHDR: - if (!checkonly) + if (!checkonly) { udp->udp_ipv6_recvrthdr = onoff; + PASS_OPT_TO_IP(connp); + } break; /* * Set sticky options or ancillary data. @@ -3477,6 +2839,7 @@ udp_opt_set_locked(queue_t *q, uint_t optset_context, int level, error = udp_build_hdrs(udp); if (error != 0) return (error); + PASS_OPT_TO_IP(connp); } break; case IPV6_HOPLIMIT: @@ -3541,8 +2904,9 @@ udp_opt_set_locked(queue_t *q, uint_t optset_context, int level, } else { sin6_t *sin6 = (sin6_t *)invalp; - if (sin6->sin6_family != AF_INET6) + if (sin6->sin6_family != AF_INET6) { return (EAFNOSUPPORT); + } if (IN6_IS_ADDR_V4MAPPED( &sin6->sin6_addr)) return (EADDRNOTAVAIL); @@ -3557,6 +2921,7 @@ udp_opt_set_locked(queue_t *q, uint_t optset_context, int level, error = udp_build_hdrs(udp); if (error != 0) return (error); + PASS_OPT_TO_IP(connp); } break; case IPV6_HOPOPTS: { @@ -3785,6 +3150,8 @@ udp_opt_set_locked(queue_t *q, uint_t optset_context, int level, } if (!checkonly) { + int size; + udp->udp_nat_t_endpoint = onoff; udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + @@ -3795,8 +3162,10 @@ udp_opt_set_locked(queue_t *q, uint_t optset_context, int level, udp->udp_max_hdr_len += sizeof (uint32_t); } - (void) mi_set_sth_wroff(RD(q), - udp->udp_max_hdr_len + us->us_wroff_extra); + size = udp->udp_max_hdr_len + + us->us_wroff_extra; + (void) proto_set_tx_wroff(connp->conn_rq, connp, + size); } break; default: @@ -3820,20 +3189,82 @@ udp_opt_set_locked(queue_t *q, uint_t optset_context, int level, } int -udp_opt_set(queue_t *q, uint_t optset_context, int level, - int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, - uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk) +udp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, + uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, + void *thisdg_attrs, cred_t *cr) { - udp_t 
*udp; - int err; + int error; + boolean_t checkonly; - udp = Q_TO_UDP(q); + error = 0; + switch (optset_context) { + case SETFN_OPTCOM_CHECKONLY: + checkonly = B_TRUE; + /* + * Note: Implies T_CHECK semantics for T_OPTCOM_REQ + * inlen != 0 implies value supplied and + * we have to "pretend" to set it. + * inlen == 0 implies that there is no + * value part in T_CHECK request and just validation + * done elsewhere should be enough, we just return here. + */ + if (inlen == 0) { + *outlenp = 0; + goto done; + } + break; + case SETFN_OPTCOM_NEGOTIATE: + checkonly = B_FALSE; + break; + case SETFN_UD_NEGOTIATE: + case SETFN_CONN_NEGOTIATE: + checkonly = B_FALSE; + /* + * Negotiating local and "association-related" options + * through T_UNITDATA_REQ. + * + * Following routine can filter out ones we do not + * want to be "set" this way. + */ + if (!udp_opt_allow_udr_set(level, name)) { + *outlenp = 0; + error = EINVAL; + goto done; + } + break; + default: + /* + * We should never get here + */ + *outlenp = 0; + error = EINVAL; + goto done; + } + + ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || + (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); + + error = udp_do_opt_set(connp, level, name, inlen, invalp, outlenp, + outvalp, cr, thisdg_attrs, checkonly); +done: + return (error); +} + +/* ARGSUSED */ +int +udp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name, + uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, + void *thisdg_attrs, cred_t *cr, mblk_t *mblk) +{ + conn_t *connp = Q_TO_CONN(q); + int error; + udp_t *udp = connp->conn_udp; rw_enter(&udp->udp_rwlock, RW_WRITER); - err = udp_opt_set_locked(q, optset_context, level, name, inlen, invalp, - outlenp, outvalp, thisdg_attrs, cr, mblk); + error = udp_opt_set(connp, optset_context, level, name, inlen, invalp, + outlenp, outvalp, thisdg_attrs, cr); rw_exit(&udp->udp_rwlock); - return (err); + return (error); } /* @@ -3853,8 +3284,11 @@ udp_build_hdrs(udp_t *udp) udpha_t 
*udpha; ip6_pkt_t *ipp = &udp->udp_sticky_ipp; size_t sth_wroff; + conn_t *connp = udp->udp_connp; ASSERT(RW_WRITE_HELD(&udp->udp_rwlock)); + ASSERT(connp != NULL); + hdrs_len = ip_total_hdrs_len_v6(ipp) + UDPH_SIZE; ASSERT(hdrs_len != 0); if (hdrs_len != udp->udp_sticky_hdrs_len) { @@ -3892,7 +3326,8 @@ udp_build_hdrs(udp_t *udp) udp->udp_max_hdr_len = hdrs_len; sth_wroff = udp->udp_max_hdr_len + us->us_wroff_extra; rw_exit(&udp->udp_rwlock); - (void) mi_set_sth_wroff(udp->udp_connp->conn_rq, sth_wroff); + (void) proto_set_tx_wroff(udp->udp_connp->conn_rq, + udp->udp_connp, sth_wroff); rw_enter(&udp->udp_rwlock, RW_WRITER); } return (0); @@ -4164,6 +3599,33 @@ udp_save_ip_rcv_opt(udp_t *udp, void *opt, int opt_len) } } +static void +udp_queue_fallback(udp_t *udp, mblk_t *mp) +{ + ASSERT(MUTEX_HELD(&udp->udp_recv_lock)); + if (IPCL_IS_NONSTR(udp->udp_connp)) { + /* + * fallback has started but messages have not been moved yet + */ + if (udp->udp_fallback_queue_head == NULL) { + ASSERT(udp->udp_fallback_queue_tail == NULL); + udp->udp_fallback_queue_head = mp; + udp->udp_fallback_queue_tail = mp; + } else { + ASSERT(udp->udp_fallback_queue_tail != NULL); + udp->udp_fallback_queue_tail->b_next = mp; + udp->udp_fallback_queue_tail = mp; + } + mutex_exit(&udp->udp_recv_lock); + } else { + /* + * no more fallbacks possible, ok to drop lock. + */ + mutex_exit(&udp->udp_recv_lock); + putnext(udp->udp_connp->conn_rq, mp); + } +} + /* ARGSUSED2 */ static void udp_input(void *arg1, mblk_t *mp, void *arg2) @@ -4222,7 +3684,7 @@ udp_input(void *arg1, mblk_t *mp, void *arg2) /* * ICMP messages. */ - udp_icmp_error(connp->conn_rq, mp); + udp_icmp_error(connp, mp); return; } } @@ -4403,7 +3865,6 @@ udp_input(void *arg1, mblk_t *mp, void *arg2) UDP_STAT(us, udp_in_recvucred); } - /* XXX FIXME: apply to AF_INET6 as well */ /* * If SO_TIMESTAMP is set allocate the appropriate sized * buffer. 
Since gethrestime() expects a pointer aligned @@ -4873,7 +4334,6 @@ udp_input(void *arg1, mblk_t *mp, void *arg2) dstopt += ipp.ipp_dstoptslen; udi_size -= toh->len; } - if (cr != NULL) { struct T_opthdr *toh; @@ -4915,23 +4375,37 @@ udp_input(void *arg1, mblk_t *mp, void *arg2) if (options_mp != NULL) freeb(options_mp); - if (udp_bits.udpb_direct_sockfs) { - /* - * There is nothing above us except for the stream head; - * use the read-side synchronous stream interface in - * order to reduce the time spent in interrupt thread. - */ - ASSERT(udp->udp_issocket); - udp_rcv_enqueue(connp->conn_rq, udp, mp, mp_len); + if (IPCL_IS_NONSTR(connp)) { + int error; + + if ((*connp->conn_upcalls->su_recv) + (connp->conn_upper_handle, mp, msgdsize(mp), 0, &error, + NULL) < 0) { + mutex_enter(&udp->udp_recv_lock); + if (error == ENOSPC) { + /* + * let's confirm while holding the lock + */ + if ((*connp->conn_upcalls->su_recv) + (connp->conn_upper_handle, NULL, 0, 0, + &error, NULL) < 0) { + if (error == ENOSPC) { + connp->conn_flow_cntrld = + B_TRUE; + } else { + ASSERT(error == EOPNOTSUPP); + } + } + mutex_exit(&udp->udp_recv_lock); + } else { + ASSERT(error == EOPNOTSUPP); + udp_queue_fallback(udp, mp); + } + } } else { - /* - * Use regular STREAMS interface to pass data upstream - * if this is not a socket endpoint, or if we have - * switched over to the slow mode due to sockmod being - * popped or a module being pushed on top of us. - */ putnext(connp->conn_rq, mp); } + ASSERT(MUTEX_NOT_HELD(&udp->udp_recv_lock)); return; tossit: @@ -4942,243 +4416,6 @@ tossit: } /* - * Handle the results of a T_BIND_REQ whether deferred by IP or handled - * immediately. - */ -static void -udp_bind_result(conn_t *connp, mblk_t *mp) -{ - struct T_error_ack *tea; - - switch (mp->b_datap->db_type) { - case M_PROTO: - case M_PCPROTO: - /* M_PROTO messages contain some type of TPI message. 
*/ - ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= - (uintptr_t)INT_MAX); - if (mp->b_wptr - mp->b_rptr < sizeof (t_scalar_t)) { - freemsg(mp); - return; - } - tea = (struct T_error_ack *)mp->b_rptr; - - switch (tea->PRIM_type) { - case T_ERROR_ACK: - switch (tea->ERROR_prim) { - case O_T_BIND_REQ: - case T_BIND_REQ: - udp_bind_error(connp, mp); - return; - default: - break; - } - ASSERT(0); - freemsg(mp); - return; - - case T_BIND_ACK: - udp_bind_ack(connp, mp); - return; - - default: - break; - } - freemsg(mp); - return; - default: - /* FIXME: other cases? */ - ASSERT(0); - freemsg(mp); - return; - } -} - -/* - * Process a T_BIND_ACK - */ -static void -udp_bind_ack(conn_t *connp, mblk_t *mp) -{ - udp_t *udp = connp->conn_udp; - mblk_t *mp1; - ire_t *ire; - struct T_bind_ack *tba; - uchar_t *addrp; - ipa_conn_t *ac; - ipa6_conn_t *ac6; - udp_fanout_t *udpf; - udp_stack_t *us = udp->udp_us; - - ASSERT(udp->udp_pending_op != -1); - rw_enter(&udp->udp_rwlock, RW_WRITER); - /* - * If a broadcast/multicast address was bound set - * the source address to 0. - * This ensures no datagrams with broadcast address - * as source address are emitted (which would violate - * RFC1122 - Hosts requirements) - * - * Note that when connecting the returned IRE is - * for the destination address and we only perform - * the broadcast check for the source address (it - * is OK to connect to a broadcast/multicast address.) - */ - mp1 = mp->b_cont; - if (mp1 != NULL && mp1->b_datap->db_type == IRE_DB_TYPE) { - ire = (ire_t *)mp1->b_rptr; - - /* - * Note: we get IRE_BROADCAST for IPv6 to "mark" a multicast - * local address. 
- */ - udpf = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port, - us->us_bind_fanout_size)]; - if (ire->ire_type == IRE_BROADCAST && - udp->udp_state != TS_DATA_XFER) { - ASSERT(udp->udp_pending_op == T_BIND_REQ || - udp->udp_pending_op == O_T_BIND_REQ); - /* This was just a local bind to a broadcast addr */ - mutex_enter(&udpf->uf_lock); - V6_SET_ZERO(udp->udp_v6src); - mutex_exit(&udpf->uf_lock); - if (udp->udp_family == AF_INET6) - (void) udp_build_hdrs(udp); - } else if (V6_OR_V4_INADDR_ANY(udp->udp_v6src)) { - /* - * Local address not yet set - pick it from the - * T_bind_ack - */ - tba = (struct T_bind_ack *)mp->b_rptr; - addrp = &mp->b_rptr[tba->ADDR_offset]; - switch (udp->udp_family) { - case AF_INET: - if (tba->ADDR_length == sizeof (ipa_conn_t)) { - ac = (ipa_conn_t *)addrp; - } else { - ASSERT(tba->ADDR_length == - sizeof (ipa_conn_x_t)); - ac = &((ipa_conn_x_t *)addrp)->acx_conn; - } - mutex_enter(&udpf->uf_lock); - IN6_IPADDR_TO_V4MAPPED(ac->ac_laddr, - &udp->udp_v6src); - mutex_exit(&udpf->uf_lock); - break; - case AF_INET6: - if (tba->ADDR_length == sizeof (ipa6_conn_t)) { - ac6 = (ipa6_conn_t *)addrp; - } else { - ASSERT(tba->ADDR_length == - sizeof (ipa6_conn_x_t)); - ac6 = &((ipa6_conn_x_t *) - addrp)->ac6x_conn; - } - mutex_enter(&udpf->uf_lock); - udp->udp_v6src = ac6->ac6_laddr; - mutex_exit(&udpf->uf_lock); - (void) udp_build_hdrs(udp); - break; - } - } - mp1 = mp1->b_cont; - } - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); - /* - * Look for one or more appended ACK message added by - * udp_connect or udp_disconnect. - * If none found just send up the T_BIND_ACK. - * udp_connect has appended a T_OK_ACK and a T_CONN_CON. - * udp_disconnect has appended a T_OK_ACK. 
- */ - if (mp1 != NULL) { - if (mp->b_cont == mp1) - mp->b_cont = NULL; - else { - ASSERT(mp->b_cont->b_cont == mp1); - mp->b_cont->b_cont = NULL; - } - freemsg(mp); - mp = mp1; - while (mp != NULL) { - mp1 = mp->b_cont; - mp->b_cont = NULL; - putnext(connp->conn_rq, mp); - mp = mp1; - } - return; - } - freemsg(mp->b_cont); - mp->b_cont = NULL; - putnext(connp->conn_rq, mp); -} - -static void -udp_bind_error(conn_t *connp, mblk_t *mp) -{ - udp_t *udp = connp->conn_udp; - struct T_error_ack *tea; - udp_fanout_t *udpf; - udp_stack_t *us = udp->udp_us; - - tea = (struct T_error_ack *)mp->b_rptr; - - /* - * If our O_T_BIND_REQ/T_BIND_REQ fails, - * clear out the associated port and source - * address before passing the message - * upstream. If this was caused by a T_CONN_REQ - * revert back to bound state. - */ - - rw_enter(&udp->udp_rwlock, RW_WRITER); - ASSERT(udp->udp_pending_op != -1); - tea->ERROR_prim = udp->udp_pending_op; - udp->udp_pending_op = -1; - udpf = &us->us_bind_fanout[ - UDP_BIND_HASH(udp->udp_port, - us->us_bind_fanout_size)]; - mutex_enter(&udpf->uf_lock); - - switch (tea->ERROR_prim) { - case T_CONN_REQ: - ASSERT(udp->udp_state == TS_DATA_XFER); - /* Connect failed */ - /* Revert back to the bound source */ - udp->udp_v6src = udp->udp_bound_v6src; - udp->udp_state = TS_IDLE; - mutex_exit(&udpf->uf_lock); - if (udp->udp_family == AF_INET6) - (void) udp_build_hdrs(udp); - rw_exit(&udp->udp_rwlock); - break; - - case T_DISCON_REQ: - case T_BIND_REQ: - case O_T_BIND_REQ: - V6_SET_ZERO(udp->udp_v6src); - V6_SET_ZERO(udp->udp_bound_v6src); - udp->udp_state = TS_UNBND; - udp_bind_hash_remove(udp, B_TRUE); - udp->udp_port = 0; - mutex_exit(&udpf->uf_lock); - if (udp->udp_family == AF_INET6) - (void) udp_build_hdrs(udp); - rw_exit(&udp->udp_rwlock); - break; - - default: - mutex_exit(&udpf->uf_lock); - rw_exit(&udp->udp_rwlock); - (void) mi_strlog(connp->conn_rq, 1, - SL_ERROR|SL_TRACE, - "udp_input_other: bad ERROR_prim, " - "len %d", tea->ERROR_prim); - } 
- putnext(connp->conn_rq, mp); -} - -/* * return SNMP stuff in buffer in mpdata. We don't hold any lock and report * information that can be changing beneath us. */ @@ -5589,64 +4826,23 @@ done: * is called by udp_wput to handle T_UNBIND_REQ messages. */ static void -udp_unbind(queue_t *q, mblk_t *mp) +udp_tpi_unbind(queue_t *q, mblk_t *mp) { - udp_t *udp = Q_TO_UDP(q); - udp_fanout_t *udpf; - udp_stack_t *us = udp->udp_us; - - if (cl_inet_unbind != NULL) { - /* - * Running in cluster mode - register unbind information - */ - if (udp->udp_ipversion == IPV4_VERSION) { - (*cl_inet_unbind)(IPPROTO_UDP, AF_INET, - (uint8_t *)(&V4_PART_OF_V6(udp->udp_v6src)), - (in_port_t)udp->udp_port); - } else { - (*cl_inet_unbind)(IPPROTO_UDP, AF_INET6, - (uint8_t *)&(udp->udp_v6src), - (in_port_t)udp->udp_port); - } - } + conn_t *connp = Q_TO_CONN(q); + int error; - rw_enter(&udp->udp_rwlock, RW_WRITER); - if (udp->udp_state == TS_UNBND || udp->udp_pending_op != -1) { - rw_exit(&udp->udp_rwlock); - udp_err_ack(q, mp, TOUTSTATE, 0); + error = udp_do_unbind(connp); + if (error) { + if (error < 0) + udp_err_ack(q, mp, -error, 0); + else + udp_err_ack(q, mp, TSYSERR, error); return; } - udp->udp_pending_op = T_UNBIND_REQ; - rw_exit(&udp->udp_rwlock); - /* - * Pass the unbind to IP; T_UNBIND_REQ is larger than T_OK_ACK - * and therefore ip_unbind must never return NULL. - */ - mp = ip_unbind(q, mp); + mp = mi_tpi_ok_ack_alloc(mp); ASSERT(mp != NULL); ASSERT(((struct T_ok_ack *)mp->b_rptr)->PRIM_type == T_OK_ACK); - - /* - * Once we're unbound from IP, the pending operation may be cleared - * here. 
- */ - rw_enter(&udp->udp_rwlock, RW_WRITER); - udpf = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port, - us->us_bind_fanout_size)]; - mutex_enter(&udpf->uf_lock); - udp_bind_hash_remove(udp, B_TRUE); - V6_SET_ZERO(udp->udp_v6src); - V6_SET_ZERO(udp->udp_bound_v6src); - udp->udp_port = 0; - mutex_exit(&udpf->uf_lock); - - udp->udp_pending_op = -1; - udp->udp_state = TS_UNBND; - if (udp->udp_family == AF_INET6) - (void) udp_build_hdrs(udp); - rw_exit(&udp->udp_rwlock); - qreply(q, mp); } @@ -5748,27 +4944,29 @@ udp_update_label(queue_t *wq, mblk_t *mp, ipaddr_t dst) static mblk_t * udp_output_v4(conn_t *connp, mblk_t *mp, ipaddr_t v4dst, uint16_t port, - uint_t srcid, int *error, boolean_t insert_spi) + uint_t srcid, int *error, boolean_t insert_spi, struct nmsghdr *msg, + cred_t *cr, pid_t pid) { - udp_t *udp = connp->conn_udp; - queue_t *q = connp->conn_wq; - mblk_t *mp1 = mp; - mblk_t *mp2; - ipha_t *ipha; - int ip_hdr_length; - uint32_t ip_len; - udpha_t *udpha; - boolean_t lock_held = B_FALSE; + udp_t *udp = connp->conn_udp; + mblk_t *mp1 = mp; + mblk_t *mp2; + ipha_t *ipha; + int ip_hdr_length; + uint32_t ip_len; + udpha_t *udpha; + boolean_t lock_held = B_FALSE; in_port_t uha_src_port; udpattrs_t attrs; - uchar_t ip_snd_opt[IP_MAX_OPT_LENGTH]; + uchar_t ip_snd_opt[IP_MAX_OPT_LENGTH]; uint32_t ip_snd_opt_len = 0; - ip4_pkt_t pktinfo; - ip4_pkt_t *pktinfop = &pktinfo; - ip_opt_info_t optinfo; + ip4_pkt_t pktinfo; + ip4_pkt_t *pktinfop = &pktinfo; + ip_opt_info_t optinfo; ip_stack_t *ipst = connp->conn_netstack->netstack_ip; udp_stack_t *us = udp->udp_us; ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; + queue_t *q = connp->conn_wq; + ire_t *ire; *error = 0; @@ -5784,26 +4982,55 @@ udp_output_v4(conn_t *connp, mblk_t *mp, ipaddr_t v4dst, uint16_t port, * If options passed in, feed it for verification and handling */ attrs.udpattr_credset = B_FALSE; - if (DB_TYPE(mp) != M_DATA) { - mp1 = mp->b_cont; - if (((struct T_unitdata_req 
*)mp->b_rptr)->OPT_length != 0) { + if (IPCL_IS_NONSTR(connp)) { + if (msg->msg_controllen != 0) { attrs.udpattr_ipp4 = pktinfop; attrs.udpattr_mb = mp; - if (udp_unitdata_opt_process(q, mp, error, &attrs) < 0) + + rw_enter(&udp->udp_rwlock, RW_WRITER); + *error = process_auxiliary_options(connp, + msg->msg_control, msg->msg_controllen, + &attrs, &udp_opt_obj, udp_opt_set); + rw_exit(&udp->udp_rwlock); + if (*error) goto done; - /* - * Note: success in processing options. - * mp option buffer represented by - * OPT_length/offset now potentially modified - * and contain option setting results - */ - ASSERT(*error == 0); + } + } else { + if (DB_TYPE(mp) != M_DATA) { + mp1 = mp->b_cont; + if (((struct T_unitdata_req *) + mp->b_rptr)->OPT_length != 0) { + attrs.udpattr_ipp4 = pktinfop; + attrs.udpattr_mb = mp; + if (udp_unitdata_opt_process(q, mp, error, + &attrs) < 0) + goto done; + /* + * Note: success in processing options. + * mp option buffer represented by + * OPT_length/offset now potentially modified + * and contain option setting results + */ + ASSERT(*error == 0); + } } } /* mp1 points to the M_DATA mblk carrying the packet */ ASSERT(mp1 != NULL && DB_TYPE(mp1) == M_DATA); + /* + * Determine whether we need to mark the mblk with the user's + * credentials. 
+ */ + ire = connp->conn_ire_cache; + if (is_system_labeled() || CLASSD(v4dst) || (ire == NULL) || + (ire->ire_addr != v4dst) || + (ire->ire_type & (IRE_BROADCAST | IRE_LOCAL | IRE_LOOPBACK))) { + if (cr != NULL && DB_CRED(mp) == NULL) + msg_setcredpid(mp, cr, pid); + } + rw_enter(&udp->udp_rwlock, RW_READER); lock_held = B_TRUE; /* @@ -6235,7 +5462,7 @@ udp_xmit(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, zoneid_t zoneid) ipha_t *ipha = (ipha_t *)mp->b_rptr; udp_stack_t *us = udp->udp_us; ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - boolean_t ll_multicast = B_FALSE; + boolean_t ll_multicast = B_FALSE; dev_q = ire->ire_stq->q_next; ASSERT(dev_q != NULL); @@ -6248,6 +5475,7 @@ udp_xmit(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, zoneid_t zoneid) DEV_Q_FLOW_BLOCKED(dev_q)) { BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + if (ipst->ips_ip_output_queue) (void) putq(connp->conn_wq, mp); else @@ -6397,11 +5625,11 @@ udp_update_label_v6(queue_t *wq, mblk_t *mp, in6_addr_t *dst) return (err); } -void -udp_output_connected(void *arg, mblk_t *mp) +static int +udp_send_connected(conn_t *connp, mblk_t *mp, struct nmsghdr *msg, cred_t *cr, + pid_t pid) { - conn_t *connp = (conn_t *)arg; - udp_t *udp = connp->conn_udp; + udp_t *udp = connp->conn_udp; udp_stack_t *us = udp->udp_us; ipaddr_t v4dst; in_port_t dstport; @@ -6416,7 +5644,7 @@ udp_output_connected(void *arg, mblk_t *mp) /* M_DATA for connected socket */ - ASSERT(udp->udp_issocket); + ASSERT(udp->udp_issocket || IPCL_IS_NONSTR(connp)); UDP_DBGSTAT(us, udp_data_conn); mutex_enter(&connp->conn_lock); @@ -6428,7 +5656,7 @@ udp_output_connected(void *arg, mblk_t *mp) TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, "udp_wput_end: connp %p (%S)", connp, "not-connected; address required"); - return; + return (EDESTADDRREQ); } mapped_addr = IN6_IS_ADDR_V4MAPPED(&udp->udp_v6dst); @@ -6466,20 +5694,100 @@ udp_output_connected(void *arg, mblk_t *mp) * family 
of the socket. */ mp = udp_output_v4(connp, mp, v4dst, dstport, 0, &error, - insert_spi); + insert_spi, msg, cr, pid); } else { - mp = udp_output_v6(connp, mp, sin6, &error); + mp = udp_output_v6(connp, mp, sin6, &error, msg, cr, pid); } if (error == 0) { ASSERT(mp == NULL); - return; + return (0); } UDP_STAT(us, udp_out_err_output); ASSERT(mp != NULL); - /* mp is freed by the following routine */ - udp_ud_err(connp->conn_wq, mp, (uchar_t *)addr, (t_scalar_t)addrlen, - (t_scalar_t)error); + if (IPCL_IS_NONSTR(connp)) { + freemsg(mp); + return (error); + } else { + /* mp is freed by the following routine */ + udp_ud_err(connp->conn_wq, mp, (uchar_t *)addr, + (t_scalar_t)addrlen, (t_scalar_t)error); + return (0); + } +} + +/* ARGSUSED */ +static int +udp_send_not_connected(conn_t *connp, mblk_t *mp, struct sockaddr *addr, + socklen_t addrlen, struct nmsghdr *msg, cred_t *cr, pid_t pid) +{ + + udp_t *udp = connp->conn_udp; + boolean_t insert_spi = udp->udp_nat_t_endpoint; + int error = 0; + sin6_t *sin6; + sin_t *sin; + uint_t srcid; + uint16_t port; + ipaddr_t v4dst; + + + ASSERT(addr != NULL); + + switch (udp->udp_family) { + case AF_INET6: + sin6 = (sin6_t *)addr; + if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { + /* + * Destination is a non-IPv4-compatible IPv6 address. + * Send out an IPv6 format packet. + */ + mp = udp_output_v6(connp, mp, sin6, &error, msg, cr, + pid); + if (error != 0) + goto ud_error; + + return (0); + } + /* + * If the local address is not zero or a mapped address + * return an error. It would be possible to send an IPv4 + * packet but the response would never make it back to the + * application since it is bound to a non-mapped address. 
+ */ + if (!IN6_IS_ADDR_V4MAPPED(&udp->udp_v6src) && + !IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)) { + error = EADDRNOTAVAIL; + goto ud_error; + } + /* Send IPv4 packet without modifying udp_ipversion */ + /* Extract port and ipaddr */ + port = sin6->sin6_port; + IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, v4dst); + srcid = sin6->__sin6_src_id; + break; + + case AF_INET: + sin = (sin_t *)addr; + /* Extract port and ipaddr */ + port = sin->sin_port; + v4dst = sin->sin_addr.s_addr; + srcid = 0; + break; + } + + mp = udp_output_v4(connp, mp, v4dst, port, srcid, &error, insert_spi, + msg, cr, pid); + + if (error == 0) { + ASSERT(mp == NULL); + return (0); + } + +ud_error: + ASSERT(mp != NULL); + + return (error); } /* @@ -6496,18 +5804,12 @@ udp_output_connected(void *arg, mblk_t *mp) void udp_wput(queue_t *q, mblk_t *mp) { - sin6_t *sin6; - sin_t *sin; - ipaddr_t v4dst; - uint16_t port; - uint_t srcid; conn_t *connp = Q_TO_CONN(q); udp_t *udp = connp->conn_udp; int error = 0; struct sockaddr *addr; socklen_t addrlen; - udp_stack_t *us = udp->udp_us; - boolean_t insert_spi = udp->udp_nat_t_endpoint; + udp_stack_t *us = udp->udp_us; TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_START, "udp_wput_start: queue %p mp %p", q, mp); @@ -6533,7 +5835,7 @@ udp_wput(queue_t *q, mblk_t *mp) "not-connected; address required"); return; } - udp_output_connected(connp, mp); + (void) udp_send_connected(connp, mp, NULL, NULL, -1); return; case M_PROTO: @@ -6587,67 +5889,8 @@ udp_wput(queue_t *q, mblk_t *mp) } ASSERT(addr != NULL); - switch (udp->udp_family) { - case AF_INET6: - sin6 = (sin6_t *)addr; - if (!OK_32PTR((char *)sin6) || (addrlen != sizeof (sin6_t)) || - (sin6->sin6_family != AF_INET6)) { - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, - "udp_wput_end: q %p (%S)", q, "badaddr"); - error = EADDRNOTAVAIL; - goto ud_error; - } - - if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { - /* - * Destination is a non-IPv4-compatible IPv6 address. - * Send out an IPv6 format packet. 
- */ - mp = udp_output_v6(connp, mp, sin6, &error); - if (error != 0) - goto ud_error; - - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, - "udp_wput_end: q %p (%S)", q, "udp_output_v6"); - return; - } - /* - * If the local address is not zero or a mapped address - * return an error. It would be possible to send an IPv4 - * packet but the response would never make it back to the - * application since it is bound to a non-mapped address. - */ - if (!IN6_IS_ADDR_V4MAPPED(&udp->udp_v6src) && - !IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)) { - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, - "udp_wput_end: q %p (%S)", q, "badaddr"); - error = EADDRNOTAVAIL; - goto ud_error; - } - /* Send IPv4 packet without modifying udp_ipversion */ - /* Extract port and ipaddr */ - port = sin6->sin6_port; - IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, v4dst); - srcid = sin6->__sin6_src_id; - break; - - case AF_INET: - sin = (sin_t *)addr; - if ((!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t)) || - (sin->sin_family != AF_INET)) { - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, - "udp_wput_end: q %p (%S)", q, "badaddr"); - error = EADDRNOTAVAIL; - goto ud_error; - } - /* Extract port and ipaddr */ - port = sin->sin_port; - v4dst = sin->sin_addr.s_addr; - srcid = 0; - break; - } - - mp = udp_output_v4(connp, mp, v4dst, port, srcid, &error, insert_spi); + error = udp_send_not_connected(connp, mp, addr, addrlen, NULL, NULL, + -1); if (error != 0) { ud_error: UDP_STAT(us, udp_out_err_output); @@ -6658,13 +5901,25 @@ ud_error: } } +/* ARGSUSED */ +static void +udp_wput_fallback(queue_t *wq, mblk_t *mp) +{ +#ifdef DEBUG + cmn_err(CE_CONT, "udp_wput_fallback: Message in fallback \n"); +#endif + freemsg(mp); +} + + /* * udp_output_v6(): * Assumes that udp_wput did some sanity checking on the destination * address. 
*/ static mblk_t * -udp_output_v6(conn_t *connp, mblk_t *mp, sin6_t *sin6, int *error) +udp_output_v6(conn_t *connp, mblk_t *mp, sin6_t *sin6, int *error, + struct nmsghdr *msg, cred_t *cr, pid_t pid) { ip6_t *ip6h; ip6i_t *ip6i; /* mp1->b_rptr even if no ip6i_t */ @@ -6674,6 +5929,7 @@ udp_output_v6(conn_t *connp, mblk_t *mp, sin6_t *sin6, int *error) size_t ip_len; udpha_t *udph; udp_t *udp = connp->conn_udp; + udp_stack_t *us = udp->udp_us; queue_t *q = connp->conn_wq; ip6_pkt_t ipp_s; /* For ancillary data options */ ip6_pkt_t *ipp = &ipp_s; @@ -6689,8 +5945,8 @@ udp_output_v6(conn_t *connp, mblk_t *mp, sin6_t *sin6, int *error) ip6_hbh_t *hopoptsptr = NULL; uint_t hopoptslen = 0; boolean_t is_ancillary = B_FALSE; - udp_stack_t *us = udp->udp_us; size_t sth_wroff = 0; + ire_t *ire; *error = 0; @@ -6714,19 +5970,51 @@ udp_output_v6(conn_t *connp, mblk_t *mp, sin6_t *sin6, int *error) */ attrs.udpattr_credset = B_FALSE; opt_present = B_FALSE; - if (DB_TYPE(mp) != M_DATA) { - mp1 = mp->b_cont; - if (((struct T_unitdata_req *)mp->b_rptr)->OPT_length != 0) { + if (IPCL_IS_NONSTR(connp)) { + if (msg->msg_controllen != 0) { attrs.udpattr_ipp6 = ipp; attrs.udpattr_mb = mp; - if (udp_unitdata_opt_process(q, mp, error, - &attrs) < 0) { + + rw_enter(&udp->udp_rwlock, RW_WRITER); + *error = process_auxiliary_options(connp, + msg->msg_control, msg->msg_controllen, + &attrs, &udp_opt_obj, udp_opt_set); + rw_exit(&udp->udp_rwlock); + if (*error) goto done; - } ASSERT(*error == 0); opt_present = B_TRUE; } + } else { + if (DB_TYPE(mp) != M_DATA) { + mp1 = mp->b_cont; + if (((struct T_unitdata_req *) + mp->b_rptr)->OPT_length != 0) { + attrs.udpattr_ipp6 = ipp; + attrs.udpattr_mb = mp; + if (udp_unitdata_opt_process(q, mp, error, + &attrs) < 0) { + goto done; + } + ASSERT(*error == 0); + opt_present = B_TRUE; + } + } } + + /* + * Determine whether we need to mark the mblk with the user's + * credentials. 
+ */ + ire = connp->conn_ire_cache; + if (is_system_labeled() || IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr) || + (ire == NULL) || + (!IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, &sin6->sin6_addr)) || + (ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK))) { + if (cr != NULL && DB_CRED(mp) == NULL) + msg_setcredpid(mp, cr, pid); + } + rw_enter(&udp->udp_rwlock, RW_READER); ignore = ipp->ipp_sticky_ignored; @@ -7268,7 +6556,7 @@ no_options: done: if (sth_wroff != 0) { - (void) mi_set_sth_wroff(RD(q), + (void) proto_set_tx_wroff(RD(q), connp, udp->udp_max_hdr_len + us->us_wroff_extra); } if (hopoptsptr != NULL && !is_ancillary) { @@ -7284,7 +6572,7 @@ done: static int -udp_getpeername(udp_t *udp, struct sockaddr *sa, uint_t *salenp) +i_udp_getpeername(udp_t *udp, struct sockaddr *sa, uint_t *salenp) { sin_t *sin = (sin_t *)sa; sin6_t *sin6 = (sin6_t *)sa; @@ -7404,7 +6692,7 @@ udp_wput_cmdblk(queue_t *q, mblk_t *mp) rw_enter(&udp->udp_rwlock, RW_READER); switch (cmdp->cb_cmd) { case TI_GETPEERNAME: - cmdp->cb_error = udp_getpeername(udp, data, &cmdp->cb_len); + cmdp->cb_error = i_udp_getpeername(udp, data, &cmdp->cb_len); break; case TI_GETMYNAME: cmdp->cb_error = udp_getmyname(udp, data, &cmdp->cb_len); @@ -7419,6 +6707,21 @@ udp_wput_cmdblk(queue_t *q, mblk_t *mp) } static void +udp_disable_direct_sockfs(udp_t *udp) +{ + udp->udp_issocket = B_FALSE; + if (udp->udp_direct_sockfs) { + /* + * Disable read-side synchronous stream interface and + * drain any queued data. 
+ */ + udp_rcv_drain(udp->udp_connp->conn_rq, udp, B_FALSE); + ASSERT(!udp->udp_direct_sockfs); + UDP_STAT(udp->udp_us, udp_sock_fallback); + } +} + +static void udp_wput_other(queue_t *q, mblk_t *mp) { uchar_t *rptr = mp->b_rptr; @@ -7458,12 +6761,12 @@ udp_wput_other(queue_t *q, mblk_t *mp) return; case O_T_BIND_REQ: case T_BIND_REQ: - udp_bind(q, mp); + udp_tpi_bind(q, mp); TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, "udp_wput_other_end: q %p (%S)", q, "bindreq"); return; case T_CONN_REQ: - udp_connect(q, mp); + udp_tpi_connect(q, mp); TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, "udp_wput_other_end: q %p (%S)", q, "connreq"); return; @@ -7488,7 +6791,7 @@ udp_wput_other(queue_t *q, mblk_t *mp) "udp_wput_other_end: q %p (%S)", q, "unitdatareq"); return; case T_UNBIND_REQ: - udp_unbind(q, mp); + udp_tpi_unbind(q, mp); TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, "udp_wput_other_end: q %p (%S)", q, "unbindreq"); return; @@ -7509,7 +6812,7 @@ udp_wput_other(queue_t *q, mblk_t *mp) return; case T_DISCON_REQ: - udp_disconnect(q, mp); + udp_tpi_disconnect(q, mp); TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, "udp_wput_other_end: q %p (%S)", q, "disconreq"); return; @@ -7596,18 +6899,8 @@ udp_wput_other(queue_t *q, mblk_t *mp) DB_TYPE(mp) = M_IOCNAK; iocp->ioc_error = EINVAL; } else { - udp->udp_issocket = B_FALSE; - if (udp->udp_direct_sockfs) { - /* - * Disable read-side synchronous - * stream interface and drain any - * queued data. 
- */ - udp_rcv_drain(RD(q), udp, - B_FALSE); - ASSERT(!udp->udp_direct_sockfs); - UDP_STAT(us, udp_sock_fallback); - } + udp_disable_direct_sockfs(udp); + DB_TYPE(mp) = M_IOCACK; iocp->ioc_error = 0; } @@ -7640,12 +6933,12 @@ udp_wput_other(queue_t *q, mblk_t *mp) static void udp_wput_iocdata(queue_t *q, mblk_t *mp) { - mblk_t *mp1; - struct iocblk *iocp = (struct iocblk *)mp->b_rptr; + mblk_t *mp1; + struct iocblk *iocp = (struct iocblk *)mp->b_rptr; STRUCT_HANDLE(strbuf, sb); - udp_t *udp = Q_TO_UDP(q); - int error; - uint_t addrlen; + udp_t *udp = Q_TO_UDP(q); + int error; + uint_t addrlen; /* Make sure it is one of ours. */ switch (iocp->ioc_cmd) { @@ -7699,16 +6992,17 @@ udp_wput_iocdata(queue_t *q, mblk_t *mp) } mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE); + if (mp1 == NULL) return; rw_enter(&udp->udp_rwlock, RW_READER); switch (iocp->ioc_cmd) { case TI_GETMYNAME: - error = udp_getmyname(udp, (void *)mp1->b_rptr, &addrlen); + error = udp_do_getsockname(udp, (void *)mp1->b_rptr, &addrlen); break; case TI_GETPEERNAME: - error = udp_getpeername(udp, (void *)mp1->b_rptr, &addrlen); + error = udp_do_getpeername(udp, (void *)mp1->b_rptr, &addrlen); break; } rw_exit(&udp->udp_rwlock); @@ -7755,7 +7049,7 @@ udp_unitdata_opt_process(queue_t *q, mblk_t *mp, int *errorp, } void -udp_ddi_init(void) +udp_ddi_g_init(void) { udp_max_optsize = optcom_max_optsize(udp_opt_obj.odb_opt_des_arr, udp_opt_obj.odb_opt_arr_cnt); @@ -7769,11 +7063,13 @@ udp_ddi_init(void) } void -udp_ddi_destroy(void) +udp_ddi_g_destroy(void) { netstack_unregister(NS_UDP); } +#define INET_NAME "ip" + /* * Initialize the UDP stack instance. 
*/ @@ -7783,6 +7079,8 @@ udp_stack_init(netstackid_t stackid, netstack_t *ns) udp_stack_t *us; udpparam_t *pa; int i; + int error = 0; + major_t major; us = (udp_stack_t *)kmem_zalloc(sizeof (*us), KM_SLEEP); us->us_netstack = ns; @@ -7825,6 +7123,10 @@ udp_stack_init(netstackid_t stackid, netstack_t *ns) us->us_kstat = udp_kstat2_init(stackid, &us->us_statistics); us->us_mibkp = udp_kstat_init(stackid); + + major = mod_name_to_major(INET_NAME); + error = ldi_ident_from_major(major, &us->us_ldi_ident); + ASSERT(error == 0); return (us); } @@ -7856,6 +7158,8 @@ udp_stack_fini(netstackid_t stackid, void *arg) udp_kstat2_fini(stackid, us->us_kstat); us->us_kstat = NULL; bzero(&us->us_statistics, sizeof (us->us_statistics)); + + ldi_ident_release(us->us_ldi_ident); kmem_free(us, sizeof (*us)); } @@ -8192,8 +7496,6 @@ udp_rcv_drain(queue_t *q, udp_t *udp, boolean_t closing) mblk_t *mp; udp_stack_t *us = udp->udp_us; - ASSERT(q == RD(q)); - mutex_enter(&udp->udp_drain_lock); /* * There is no race with a concurrent udp_input() sending @@ -8222,6 +7524,7 @@ udp_rcv_drain(queue_t *q, udp_t *udp, boolean_t closing) if (closing) { freemsg(mp); } else { + ASSERT(q == RD(q)); putnext(q, mp); } } @@ -8282,3 +7585,1802 @@ udp_lwput(queue_t *q, mblk_t *mp) { freemsg(mp); } + +/* + * Below routines for UDP socket module. + */ + +static conn_t * +udp_do_open(cred_t *credp, boolean_t isv6, int flags) +{ + udp_t *udp; + conn_t *connp; + zoneid_t zoneid; + netstack_t *ns; + udp_stack_t *us; + + ns = netstack_find_by_cred(credp); + ASSERT(ns != NULL); + us = ns->netstack_udp; + ASSERT(us != NULL); + + /* + * For exclusive stacks we set the zoneid to zero + * to make UDP operate as if in the global zone. 
+ */ + if (ns->netstack_stackid != GLOBAL_NETSTACKID) + zoneid = GLOBAL_ZONEID; + else + zoneid = crgetzoneid(credp); + + ASSERT(flags == KM_SLEEP || flags == KM_NOSLEEP); + + connp = ipcl_conn_create(IPCL_UDPCONN, flags, ns); + if (connp == NULL) { + netstack_rele(ns); + return (NULL); + } + udp = connp->conn_udp; + + /* + * ipcl_conn_create did a netstack_hold. Undo the hold that was + * done by netstack_find_by_cred() + */ + netstack_rele(ns); + + rw_enter(&udp->udp_rwlock, RW_WRITER); + ASSERT(connp->conn_ulp == IPPROTO_UDP); + ASSERT(connp->conn_udp == udp); + ASSERT(udp->udp_connp == connp); + + /* Set the initial state of the stream and the privilege status. */ + udp->udp_state = TS_UNBND; + if (isv6) { + udp->udp_family = AF_INET6; + udp->udp_ipversion = IPV6_VERSION; + udp->udp_max_hdr_len = IPV6_HDR_LEN + UDPH_SIZE; + udp->udp_ttl = us->us_ipv6_hoplimit; + connp->conn_af_isv6 = B_TRUE; + connp->conn_flags |= IPCL_ISV6; + } else { + udp->udp_family = AF_INET; + udp->udp_ipversion = IPV4_VERSION; + udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + UDPH_SIZE; + udp->udp_ttl = us->us_ipv4_ttl; + connp->conn_af_isv6 = B_FALSE; + connp->conn_flags &= ~IPCL_ISV6; + } + + udp->udp_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; + udp->udp_pending_op = -1; + connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; + connp->conn_zoneid = zoneid; + + udp->udp_open_time = lbolt64; + udp->udp_open_pid = curproc->p_pid; + + /* + * If the caller has the process-wide flag set, then default to MAC + * exempt mode. This allows read-down to unlabeled hosts. 
+ */ + if (getpflags(NET_MAC_AWARE, credp) != 0) + connp->conn_mac_exempt = B_TRUE; + + connp->conn_ulp_labeled = is_system_labeled(); + + udp->udp_us = us; + + connp->conn_recv = udp_input; + crhold(credp); + connp->conn_cred = credp; + + *((sin6_t *)&udp->udp_delayed_addr) = sin6_null; + + rw_exit(&udp->udp_rwlock); + + return (connp); +} + +/* ARGSUSED */ +sock_lower_handle_t +udp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, + uint_t *smodep, int *errorp, int flags, cred_t *credp) +{ + udp_t *udp = NULL; + udp_stack_t *us; + conn_t *connp; + boolean_t isv6; + + if (type != SOCK_DGRAM || (family != AF_INET && family != AF_INET6) || + (proto != 0 && proto != IPPROTO_UDP)) { + *errorp = EPROTONOSUPPORT; + return (NULL); + } + + if (family == AF_INET6) + isv6 = B_TRUE; + else + isv6 = B_FALSE; + + connp = udp_do_open(credp, isv6, flags); + if (connp == NULL) { + *errorp = ENOMEM; + return (NULL); + } + + udp = connp->conn_udp; + ASSERT(udp != NULL); + us = udp->udp_us; + ASSERT(us != NULL); + + connp->conn_flags |= IPCL_NONSTR | IPCL_SOCKET; + + /* Set flow control */ + rw_enter(&udp->udp_rwlock, RW_WRITER); + (void) udp_set_rcv_hiwat(udp, us->us_recv_hiwat); + udp->udp_rcv_disply_hiwat = us->us_recv_hiwat; + udp->udp_rcv_lowat = udp_mod_info.mi_lowat; + udp->udp_xmit_hiwat = us->us_xmit_hiwat; + udp->udp_xmit_lowat = us->us_xmit_lowat; + + if (udp->udp_family == AF_INET6) { + /* Build initial header template for transmit */ + if ((*errorp = udp_build_hdrs(udp)) != 0) { + rw_exit(&udp->udp_rwlock); + ipcl_conn_destroy(connp); + return (NULL); + } + } + rw_exit(&udp->udp_rwlock); + + connp->conn_flow_cntrld = B_FALSE; + + ASSERT(us->us_ldi_ident != NULL); + + if ((*errorp = ip_create_helper_stream(connp, us->us_ldi_ident)) != 0) { + ip1dbg(("create of IP helper stream failed\n")); + udp_do_close(connp); + return (NULL); + } + + /* Set the send flow control */ + connp->conn_wq->q_hiwat = us->us_xmit_hiwat; + connp->conn_wq->q_lowat = 
us->us_xmit_lowat; + + mutex_enter(&connp->conn_lock); + connp->conn_state_flags &= ~CONN_INCIPIENT; + mutex_exit(&connp->conn_lock); + + *errorp = 0; + *smodep = SM_ATOMIC; + *sock_downcalls = &sock_udp_downcalls; + return ((sock_lower_handle_t)connp); +} + +/* ARGSUSED */ +void +udp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle, + sock_upcalls_t *sock_upcalls, int flags, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + udp_t *udp = connp->conn_udp; + udp_stack_t *us = udp->udp_us; + struct sock_proto_props sopp; + + connp->conn_upcalls = sock_upcalls; + connp->conn_upper_handle = sock_handle; + + sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | + SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ; + sopp.sopp_wroff = udp->udp_max_hdr_len + us->us_wroff_extra; + sopp.sopp_maxblk = INFPSZ; + sopp.sopp_rxhiwat = udp->udp_rcv_hiwat; + sopp.sopp_maxaddrlen = sizeof (sin6_t); + sopp.sopp_maxpsz = + (udp->udp_family == AF_INET) ? UDP_MAXPACKET_IPV4 : + UDP_MAXPACKET_IPV6; + sopp.sopp_minpsz = (udp_mod_info.mi_minpsz == 1) ? 0 : + udp_mod_info.mi_minpsz; + + (*connp->conn_upcalls->su_set_proto_props)(connp->conn_upper_handle, + &sopp); +} + +static void +udp_do_close(conn_t *connp) +{ + udp_t *udp; + + ASSERT(connp != NULL && IPCL_IS_UDP(connp)); + udp = connp->conn_udp; + + udp_quiesce_conn(connp); + ip_quiesce_conn(connp); + + if (!IPCL_IS_NONSTR(connp)) { + /* + * Disable read-side synchronous stream + * interface and drain any queued data. 
+ */ + ASSERT(connp->conn_wq != NULL); + udp_rcv_drain(connp->conn_wq, udp, B_TRUE); + ASSERT(!udp->udp_direct_sockfs); + + ASSERT(connp->conn_rq != NULL); + qprocsoff(connp->conn_rq); + } + + ASSERT(udp->udp_rcv_cnt == 0); + ASSERT(udp->udp_rcv_msgcnt == 0); + ASSERT(udp->udp_rcv_list_head == NULL); + ASSERT(udp->udp_rcv_list_tail == NULL); + + udp_close_free(connp); + + /* + * Now we are truly single threaded on this stream, and can + * delete the things hanging off the connp, and finally the connp. + * We removed this connp from the fanout list, it cannot be + * accessed thru the fanouts, and we already waited for the + * conn_ref to drop to 0. We are already in close, so + * there cannot be any other thread from the top. qprocsoff + * has completed, and service has completed or won't run in + * future. + */ + ASSERT(connp->conn_ref == 1); + if (!IPCL_IS_NONSTR(connp)) { + inet_minor_free(connp->conn_minor_arena, connp->conn_dev); + } else { + ip_close_helper_stream(connp); + } + + connp->conn_ref--; + ipcl_conn_destroy(connp); +} + +/* ARGSUSED */ +int +udp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + + udp_do_close(connp); + return (0); +} + +static int +udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, + boolean_t bind_to_req_port_only) +{ + sin_t *sin; + sin6_t *sin6; + sin6_t sin6addr; + in_port_t port; /* Host byte order */ + in_port_t requested_port; /* Host byte order */ + int count; + in6_addr_t v6src; + int loopmax; + udp_fanout_t *udpf; + in_port_t lport; /* Network byte order */ + zoneid_t zoneid; + udp_t *udp; + boolean_t is_inaddr_any; + mlp_type_t addrtype, mlptype; + udp_stack_t *us; + int error = 0; + mblk_t *mp = NULL; + + udp = connp->conn_udp; + us = udp->udp_us; + + if (udp->udp_state != TS_UNBND) { + (void) strlog(UDP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, + "udp_bind: bad state, %u", udp->udp_state); + return (-TOUTSTATE); + } + + switch (len) { + case 0: + 
if (udp->udp_family == AF_INET) { + sin = (sin_t *)&sin6addr; + *sin = sin_null; + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = INADDR_ANY; + udp->udp_ipversion = IPV4_VERSION; + } else { + ASSERT(udp->udp_family == AF_INET6); + sin6 = (sin6_t *)&sin6addr; + *sin6 = sin6_null; + sin6->sin6_family = AF_INET6; + V6_SET_ZERO(sin6->sin6_addr); + udp->udp_ipversion = IPV6_VERSION; + } + port = 0; + break; + + case sizeof (sin_t): /* Complete IPv4 address */ + sin = (sin_t *)sa; + + if (sin == NULL || !OK_32PTR((char *)sin)) + return (EINVAL); + + if (udp->udp_family != AF_INET || + sin->sin_family != AF_INET) { + return (EAFNOSUPPORT); + } + port = ntohs(sin->sin_port); + break; + + case sizeof (sin6_t): /* complete IPv6 address */ + sin6 = (sin6_t *)sa; + + if (sin6 == NULL || !OK_32PTR((char *)sin6)) + return (EINVAL); + + if (udp->udp_family != AF_INET6 || + sin6->sin6_family != AF_INET6) { + return (EAFNOSUPPORT); + } + port = ntohs(sin6->sin6_port); + break; + + default: /* Invalid request */ + (void) strlog(UDP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, + "udp_bind: bad ADDR_length length %u", len); + return (-TBADADDR); + } + + requested_port = port; + + if (requested_port == 0 || !bind_to_req_port_only) + bind_to_req_port_only = B_FALSE; + else /* T_BIND_REQ and requested_port != 0 */ + bind_to_req_port_only = B_TRUE; + + if (requested_port == 0) { + /* + * If the application passed in zero for the port number, it + * doesn't care which port number we bind to. Get one in the + * valid range. + */ + if (udp->udp_anon_priv_bind) { + port = udp_get_next_priv_port(udp); + } else { + port = udp_update_next_port(udp, + us->us_next_port_to_try, B_TRUE); + } + } else { + /* + * If the port is in the well-known privileged range, + * make sure the caller was privileged. 
+ */ + int i; + boolean_t priv = B_FALSE; + + if (port < us->us_smallest_nonpriv_port) { + priv = B_TRUE; + } else { + for (i = 0; i < us->us_num_epriv_ports; i++) { + if (port == us->us_epriv_ports[i]) { + priv = B_TRUE; + break; + } + } + } + + if (priv) { + if (secpolicy_net_privaddr(cr, port, IPPROTO_UDP) != 0) + return (-TACCES); + } + } + + if (port == 0) + return (-TNOADDR); + + /* + * The state must be TS_UNBND. TPI mandates that users must send + * TPI primitives only 1 at a time and wait for the response before + * sending the next primitive. + */ + rw_enter(&udp->udp_rwlock, RW_WRITER); + if (udp->udp_state != TS_UNBND || udp->udp_pending_op != -1) { + rw_exit(&udp->udp_rwlock); + (void) strlog(UDP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, + "udp_bind: bad state, %u", udp->udp_state); + return (-TOUTSTATE); + } + /* XXX how to remove the T_BIND_REQ? Should set it before calling */ + udp->udp_pending_op = T_BIND_REQ; + /* + * Copy the source address into our udp structure. This address + * may still be zero; if so, IP will fill in the correct address + * each time an outbound packet is passed to it. 
Since the udp is + * not yet in the bind hash list, we don't grab the uf_lock to + * change udp_ipversion + */ + if (udp->udp_family == AF_INET) { + ASSERT(sin != NULL); + ASSERT(udp->udp_ipversion == IPV4_VERSION); + udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + UDPH_SIZE + + udp->udp_ip_snd_options_len; + IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6src); + } else { + ASSERT(sin6 != NULL); + v6src = sin6->sin6_addr; + if (IN6_IS_ADDR_V4MAPPED(&v6src)) { + /* + * no need to hold the uf_lock to set the udp_ipversion + * since we are not yet in the fanout list + */ + udp->udp_ipversion = IPV4_VERSION; + udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + + UDPH_SIZE + udp->udp_ip_snd_options_len; + } else { + udp->udp_ipversion = IPV6_VERSION; + udp->udp_max_hdr_len = udp->udp_sticky_hdrs_len; + } + } + + /* + * If udp_reuseaddr is not set, then we have to make sure that + * the IP address and port number the application requested + * (or we selected for the application) is not being used by + * another stream. If another stream is already using the + * requested IP address and port, the behavior depends on + * "bind_to_req_port_only". If set the bind fails; otherwise we + * search for any an unused port to bind to the the stream. + * + * As per the BSD semantics, as modified by the Deering multicast + * changes, if udp_reuseaddr is set, then we allow multiple binds + * to the same port independent of the local IP address. + * + * This is slightly different than in SunOS 4.X which did not + * support IP multicast. Note that the change implemented by the + * Deering multicast code effects all binds - not only binding + * to IP multicast addresses. + * + * Note that when binding to port zero we ignore SO_REUSEADDR in + * order to guarantee a unique port. 
+ */ + + count = 0; + if (udp->udp_anon_priv_bind) { + /* + * loopmax = (IPPORT_RESERVED-1) - + * us->us_min_anonpriv_port + 1 + */ + loopmax = IPPORT_RESERVED - us->us_min_anonpriv_port; + } else { + loopmax = us->us_largest_anon_port - + us->us_smallest_anon_port + 1; + } + + is_inaddr_any = V6_OR_V4_INADDR_ANY(v6src); + zoneid = connp->conn_zoneid; + + for (;;) { + udp_t *udp1; + boolean_t found_exclbind = B_FALSE; + + /* + * Walk through the list of udp streams bound to + * requested port with the same IP address. + */ + lport = htons(port); + udpf = &us->us_bind_fanout[UDP_BIND_HASH(lport, + us->us_bind_fanout_size)]; + mutex_enter(&udpf->uf_lock); + for (udp1 = udpf->uf_udp; udp1 != NULL; + udp1 = udp1->udp_bind_hash) { + if (lport != udp1->udp_port) + continue; + + /* + * On a labeled system, we must treat bindings to ports + * on shared IP addresses by sockets with MAC exemption + * privilege as being in all zones, as there's + * otherwise no way to identify the right receiver. + */ + if (!(IPCL_ZONE_MATCH(udp1->udp_connp, zoneid) || + IPCL_ZONE_MATCH(connp, + udp1->udp_connp->conn_zoneid)) && + !connp->conn_mac_exempt && \ + !udp1->udp_connp->conn_mac_exempt) + continue; + + /* + * If UDP_EXCLBIND is set for either the bound or + * binding endpoint, the semantics of bind + * is changed according to the following chart. + * + * spec = specified address (v4 or v6) + * unspec = unspecified address (v4 or v6) + * A = specified addresses are different for endpoints + * + * bound bind to allowed? + * ------------------------------------- + * unspec unspec no + * unspec spec no + * spec unspec no + * spec spec yes if A + * + * For labeled systems, SO_MAC_EXEMPT behaves the same + * as UDP_EXCLBIND, except that zoneid is ignored. 
+ */ + if (udp1->udp_exclbind || udp->udp_exclbind || + udp1->udp_connp->conn_mac_exempt || + connp->conn_mac_exempt) { + if (V6_OR_V4_INADDR_ANY( + udp1->udp_bound_v6src) || + is_inaddr_any || + IN6_ARE_ADDR_EQUAL(&udp1->udp_bound_v6src, + &v6src)) { + found_exclbind = B_TRUE; + break; + } + continue; + } + + /* + * Check ipversion to allow IPv4 and IPv6 sockets to + * have disjoint port number spaces. + */ + if (udp->udp_ipversion != udp1->udp_ipversion) { + + /* + * On the first time through the loop, if the + * the user intentionally specified a + * particular port number, then ignore any + * bindings of the other protocol that may + * conflict. This allows the user to bind IPv6 + * alone and get both v4 and v6, or bind both + * both and get each seperately. On subsequent + * times through the loop, we're checking a + * port that we chose (not the user) and thus + * we do not allow casual duplicate bindings. + */ + if (count == 0 && requested_port != 0) + continue; + } + + /* + * No difference depending on SO_REUSEADDR. + * + * If existing port is bound to a + * non-wildcard IP address and + * the requesting stream is bound to + * a distinct different IP addresses + * (non-wildcard, also), keep going. + */ + if (!is_inaddr_any && + !V6_OR_V4_INADDR_ANY(udp1->udp_bound_v6src) && + !IN6_ARE_ADDR_EQUAL(&udp1->udp_bound_v6src, + &v6src)) { + continue; + } + break; + } + + if (!found_exclbind && + (udp->udp_reuseaddr && requested_port != 0)) { + break; + } + + if (udp1 == NULL) { + /* + * No other stream has this IP address + * and port number. We can use it. + */ + break; + } + mutex_exit(&udpf->uf_lock); + if (bind_to_req_port_only) { + /* + * We get here only when requested port + * is bound (and only first of the for() + * loop iteration). + * + * The semantics of this bind request + * require it to fail so we return from + * the routine (and exit the loop). 
+ * + */ + udp->udp_pending_op = -1; + rw_exit(&udp->udp_rwlock); + return (-TADDRBUSY); + } + + if (udp->udp_anon_priv_bind) { + port = udp_get_next_priv_port(udp); + } else { + if ((count == 0) && (requested_port != 0)) { + /* + * If the application wants us to find + * a port, get one to start with. Set + * requested_port to 0, so that we will + * update us->us_next_port_to_try below. + */ + port = udp_update_next_port(udp, + us->us_next_port_to_try, B_TRUE); + requested_port = 0; + } else { + port = udp_update_next_port(udp, port + 1, + B_FALSE); + } + } + + if (port == 0 || ++count >= loopmax) { + /* + * We've tried every possible port number and + * there are none available, so send an error + * to the user. + */ + udp->udp_pending_op = -1; + rw_exit(&udp->udp_rwlock); + return (-TNOADDR); + } + } + + /* + * Copy the source address into our udp structure. This address + * may still be zero; if so, ip will fill in the correct address + * each time an outbound packet is passed to it. + * If we are binding to a broadcast or multicast address then + * udp_post_ip_bind_connect will clear the source address + * when udp_do_bind success. + */ + udp->udp_v6src = udp->udp_bound_v6src = v6src; + udp->udp_port = lport; + /* + * Now reset the the next anonymous port if the application requested + * an anonymous port, or we handed out the next anonymous port. + */ + if ((requested_port == 0) && (!udp->udp_anon_priv_bind)) { + us->us_next_port_to_try = port + 1; + } + + /* Initialize the O_T_BIND_REQ/T_BIND_REQ for ip. 
*/ + if (udp->udp_family == AF_INET) { + sin->sin_port = udp->udp_port; + } else { + sin6->sin6_port = udp->udp_port; + /* Rebuild the header template */ + error = udp_build_hdrs(udp); + if (error != 0) { + udp->udp_pending_op = -1; + rw_exit(&udp->udp_rwlock); + mutex_exit(&udpf->uf_lock); + return (error); + } + } + udp->udp_state = TS_IDLE; + udp_bind_hash_insert(udpf, udp); + mutex_exit(&udpf->uf_lock); + rw_exit(&udp->udp_rwlock); + + if (cl_inet_bind) { + /* + * Running in cluster mode - register bind information + */ + if (udp->udp_ipversion == IPV4_VERSION) { + (*cl_inet_bind)(IPPROTO_UDP, AF_INET, + (uint8_t *)(&V4_PART_OF_V6(udp->udp_v6src)), + (in_port_t)udp->udp_port); + } else { + (*cl_inet_bind)(IPPROTO_UDP, AF_INET6, + (uint8_t *)&(udp->udp_v6src), + (in_port_t)udp->udp_port); + } + + } + + connp->conn_anon_port = (is_system_labeled() && requested_port == 0); + if (is_system_labeled() && (!connp->conn_anon_port || + connp->conn_anon_mlp)) { + uint16_t mlpport; + cred_t *cr = connp->conn_cred; + zone_t *zone; + + zone = crgetzone(cr); + connp->conn_mlp_type = udp->udp_recvucred ? mlptBoth : + mlptSingle; + addrtype = tsol_mlp_addr_type(zone->zone_id, IPV6_VERSION, + &v6src, us->us_netstack->netstack_ip); + if (addrtype == mlptSingle) { + rw_enter(&udp->udp_rwlock, RW_WRITER); + udp->udp_pending_op = -1; + rw_exit(&udp->udp_rwlock); + connp->conn_anon_port = B_FALSE; + connp->conn_mlp_type = mlptSingle; + return (-TNOADDR); + } + mlpport = connp->conn_anon_port ? 
PMAPPORT : port; + mlptype = tsol_mlp_port_type(zone, IPPROTO_UDP, mlpport, + addrtype); + if (mlptype != mlptSingle && + (connp->conn_mlp_type == mlptSingle || + secpolicy_net_bindmlp(cr) != 0)) { + if (udp->udp_debug) { + (void) strlog(UDP_MOD_ID, 0, 1, + SL_ERROR|SL_TRACE, + "udp_bind: no priv for multilevel port %d", + mlpport); + } + rw_enter(&udp->udp_rwlock, RW_WRITER); + udp->udp_pending_op = -1; + rw_exit(&udp->udp_rwlock); + connp->conn_anon_port = B_FALSE; + connp->conn_mlp_type = mlptSingle; + return (-TACCES); + } + + /* + * If we're specifically binding a shared IP address and the + * port is MLP on shared addresses, then check to see if this + * zone actually owns the MLP. Reject if not. + */ + if (mlptype == mlptShared && addrtype == mlptShared) { + /* + * No need to handle exclusive-stack zones since + * ALL_ZONES only applies to the shared stack. + */ + zoneid_t mlpzone; + + mlpzone = tsol_mlp_findzone(IPPROTO_UDP, + htons(mlpport)); + if (connp->conn_zoneid != mlpzone) { + if (udp->udp_debug) { + (void) strlog(UDP_MOD_ID, 0, 1, + SL_ERROR|SL_TRACE, + "udp_bind: attempt to bind port " + "%d on shared addr in zone %d " + "(should be %d)", + mlpport, connp->conn_zoneid, + mlpzone); + } + rw_enter(&udp->udp_rwlock, RW_WRITER); + udp->udp_pending_op = -1; + rw_exit(&udp->udp_rwlock); + connp->conn_anon_port = B_FALSE; + connp->conn_mlp_type = mlptSingle; + return (-TACCES); + } + } + if (connp->conn_anon_port) { + error = tsol_mlp_anon(zone, mlptype, connp->conn_ulp, + port, B_TRUE); + if (error != 0) { + if (udp->udp_debug) { + (void) strlog(UDP_MOD_ID, 0, 1, + SL_ERROR|SL_TRACE, + "udp_bind: cannot establish anon " + "MLP for port %d", port); + } + rw_enter(&udp->udp_rwlock, RW_WRITER); + udp->udp_pending_op = -1; + rw_exit(&udp->udp_rwlock); + connp->conn_anon_port = B_FALSE; + connp->conn_mlp_type = mlptSingle; + return (-TACCES); + } + } + connp->conn_mlp_type = mlptype; + } + + if (!V6_OR_V4_INADDR_ANY(udp->udp_v6src)) { + /* + * Append a 
request for an IRE if udp_v6src not + * zero (IPv4 - INADDR_ANY, or IPv6 - all-zeroes address). + */ + mp = allocb(sizeof (ire_t), BPRI_HI); + if (!mp) { + rw_enter(&udp->udp_rwlock, RW_WRITER); + udp->udp_pending_op = -1; + rw_exit(&udp->udp_rwlock); + return (ENOMEM); + } + mp->b_wptr += sizeof (ire_t); + mp->b_datap->db_type = IRE_DB_REQ_TYPE; + } + if (udp->udp_family == AF_INET6) { + ASSERT(udp->udp_connp->conn_af_isv6); + error = ip_proto_bind_laddr_v6(connp, &mp, IPPROTO_UDP, + &udp->udp_bound_v6src, udp->udp_port, B_TRUE); + } else { + ASSERT(!udp->udp_connp->conn_af_isv6); + error = ip_proto_bind_laddr_v4(connp, &mp, IPPROTO_UDP, + V4_PART_OF_V6(udp->udp_bound_v6src), udp->udp_port, + B_TRUE); + } + + (void) udp_post_ip_bind_connect(udp, mp, error); + return (error); +} + +int +udp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa, + socklen_t len, cred_t *cr) +{ + int error; + conn_t *connp; + + connp = (conn_t *)proto_handle; + + if (sa == NULL) + error = udp_do_unbind(connp); + else + error = udp_do_bind(connp, sa, len, cr, B_TRUE); + + if (error < 0) { + if (error == -TOUTSTATE) + error = EINVAL; + else + error = proto_tlitosyserr(-error); + } + + return (error); +} + +static int +udp_implicit_bind(conn_t *connp, cred_t *cr) +{ + int error; + + error = udp_do_bind(connp, NULL, 0, cr, B_FALSE); + return ((error < 0) ? proto_tlitosyserr(-error) : error); +} + +/* + * This routine removes a port number association from a stream. It + * is called by udp_unbind and udp_tpi_unbind. 
+ */ +static int +udp_do_unbind(conn_t *connp) +{ + udp_t *udp = connp->conn_udp; + udp_fanout_t *udpf; + udp_stack_t *us = udp->udp_us; + + if (cl_inet_unbind != NULL) { + /* + * Running in cluster mode - register unbind information + */ + if (udp->udp_ipversion == IPV4_VERSION) { + (*cl_inet_unbind)(IPPROTO_UDP, AF_INET, + (uint8_t *)(&V4_PART_OF_V6(udp->udp_v6src)), + (in_port_t)udp->udp_port); + } else { + (*cl_inet_unbind)(IPPROTO_UDP, AF_INET6, + (uint8_t *)&(udp->udp_v6src), + (in_port_t)udp->udp_port); + } + } + + rw_enter(&udp->udp_rwlock, RW_WRITER); + if (udp->udp_state == TS_UNBND || udp->udp_pending_op != -1) { + rw_exit(&udp->udp_rwlock); + return (-TOUTSTATE); + } + udp->udp_pending_op = T_UNBIND_REQ; + rw_exit(&udp->udp_rwlock); + + /* + * Pass the unbind to IP; T_UNBIND_REQ is larger than T_OK_ACK + * and therefore ip_unbind must never return NULL. + */ + ip_unbind(connp); + + /* + * Once we're unbound from IP, the pending operation may be cleared + * here. + */ + rw_enter(&udp->udp_rwlock, RW_WRITER); + udpf = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port, + us->us_bind_fanout_size)]; + + mutex_enter(&udpf->uf_lock); + udp_bind_hash_remove(udp, B_TRUE); + V6_SET_ZERO(udp->udp_v6src); + V6_SET_ZERO(udp->udp_bound_v6src); + udp->udp_port = 0; + mutex_exit(&udpf->uf_lock); + + udp->udp_pending_op = -1; + udp->udp_state = TS_UNBND; + if (udp->udp_family == AF_INET6) + (void) udp_build_hdrs(udp); + rw_exit(&udp->udp_rwlock); + + return (0); +} + +static int +udp_post_ip_bind_connect(udp_t *udp, mblk_t *ire_mp, int error) +{ + ire_t *ire; + udp_fanout_t *udpf; + udp_stack_t *us = udp->udp_us; + + ASSERT(udp->udp_pending_op != -1); + rw_enter(&udp->udp_rwlock, RW_WRITER); + if (error == 0) { + /* For udp_do_connect() success */ + /* udp_do_bind() success will do nothing in here */ + /* + * If a broadcast/multicast address was bound, set + * the source address to 0. 
+ * This ensures no datagrams with broadcast address + * as source address are emitted (which would violate + * RFC1122 - Hosts requirements) + * + * Note that when connecting the returned IRE is + * for the destination address and we only perform + * the broadcast check for the source address (it + * is OK to connect to a broadcast/multicast address.) + */ + if (ire_mp != NULL && ire_mp->b_datap->db_type == IRE_DB_TYPE) { + ire = (ire_t *)ire_mp->b_rptr; + + /* + * Note: we get IRE_BROADCAST for IPv6 to "mark" a + * multicast local address. + */ + udpf = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port, + us->us_bind_fanout_size)]; + if (ire->ire_type == IRE_BROADCAST && + udp->udp_state != TS_DATA_XFER) { + ASSERT(udp->udp_pending_op == T_BIND_REQ || + udp->udp_pending_op == O_T_BIND_REQ); + /* + * This was just a local bind to a broadcast + * addr. + */ + mutex_enter(&udpf->uf_lock); + V6_SET_ZERO(udp->udp_v6src); + mutex_exit(&udpf->uf_lock); + if (udp->udp_family == AF_INET6) + (void) udp_build_hdrs(udp); + } else if (V6_OR_V4_INADDR_ANY(udp->udp_v6src)) { + if (udp->udp_family == AF_INET6) + (void) udp_build_hdrs(udp); + } + } + } else { + udpf = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port, + us->us_bind_fanout_size)]; + mutex_enter(&udpf->uf_lock); + + if (udp->udp_state == TS_DATA_XFER) { + /* Connect failed */ + /* Revert back to the bound source */ + udp->udp_v6src = udp->udp_bound_v6src; + udp->udp_state = TS_IDLE; + } else { + /* For udp_do_bind() failed */ + V6_SET_ZERO(udp->udp_v6src); + V6_SET_ZERO(udp->udp_bound_v6src); + udp->udp_state = TS_UNBND; + udp_bind_hash_remove(udp, B_TRUE); + udp->udp_port = 0; + } + mutex_exit(&udpf->uf_lock); + if (udp->udp_family == AF_INET6) + (void) udp_build_hdrs(udp); + } + udp->udp_pending_op = -1; + rw_exit(&udp->udp_rwlock); + if (ire_mp != NULL) + freeb(ire_mp); + return (error); +} + +/* + * It associates a default destination address with the stream. 
+ */ +static int +udp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len) +{ + sin6_t *sin6; + sin_t *sin; + in6_addr_t v6dst; + ipaddr_t v4dst; + uint16_t dstport; + uint32_t flowinfo; + mblk_t *ire_mp; + udp_fanout_t *udpf; + udp_t *udp, *udp1; + ushort_t ipversion; + udp_stack_t *us; + int error; + + udp = connp->conn_udp; + us = udp->udp_us; + + /* + * Address has been verified by the caller + */ + switch (len) { + default: + /* + * Should never happen + */ + return (EINVAL); + + case sizeof (sin_t): + sin = (sin_t *)sa; + v4dst = sin->sin_addr.s_addr; + dstport = sin->sin_port; + IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); + ASSERT(udp->udp_ipversion == IPV4_VERSION); + ipversion = IPV4_VERSION; + break; + + case sizeof (sin6_t): + sin6 = (sin6_t *)sa; + v6dst = sin6->sin6_addr; + dstport = sin6->sin6_port; + if (IN6_IS_ADDR_V4MAPPED(&v6dst)) { + IN6_V4MAPPED_TO_IPADDR(&v6dst, v4dst); + ipversion = IPV4_VERSION; + flowinfo = 0; + } else { + ipversion = IPV6_VERSION; + flowinfo = sin6->sin6_flowinfo; + } + break; + } + + if (dstport == 0) + return (-TBADADDR); + + rw_enter(&udp->udp_rwlock, RW_WRITER); + + /* + * This UDP must have bound to a port already before doing a connect. + * TPI mandates that users must send TPI primitives only 1 at a time + * and wait for the response before sending the next primitive. 
+ */ + if (udp->udp_state == TS_UNBND || udp->udp_pending_op != -1) { + rw_exit(&udp->udp_rwlock); + (void) strlog(UDP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, + "udp_connect: bad state, %u", udp->udp_state); + return (-TOUTSTATE); + } + udp->udp_pending_op = T_CONN_REQ; + ASSERT(udp->udp_port != 0 && udp->udp_ptpbhn != NULL); + + if (ipversion == IPV4_VERSION) { + udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + UDPH_SIZE + + udp->udp_ip_snd_options_len; + } else { + udp->udp_max_hdr_len = udp->udp_sticky_hdrs_len; + } + + udpf = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port, + us->us_bind_fanout_size)]; + + mutex_enter(&udpf->uf_lock); + if (udp->udp_state == TS_DATA_XFER) { + /* Already connected - clear out state */ + udp->udp_v6src = udp->udp_bound_v6src; + udp->udp_state = TS_IDLE; + } + + /* + * Create a default IP header with no IP options. + */ + udp->udp_dstport = dstport; + udp->udp_ipversion = ipversion; + if (ipversion == IPV4_VERSION) { + /* + * Interpret a zero destination to mean loopback. + * Update the T_CONN_REQ (sin/sin6) since it is used to + * generate the T_CONN_CON. + */ + if (v4dst == INADDR_ANY) { + v4dst = htonl(INADDR_LOOPBACK); + IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); + if (udp->udp_family == AF_INET) { + sin->sin_addr.s_addr = v4dst; + } else { + sin6->sin6_addr = v6dst; + } + } + udp->udp_v6dst = v6dst; + udp->udp_flowinfo = 0; + + /* + * If the destination address is multicast and + * an outgoing multicast interface has been set, + * use the address of that interface as our + * source address if no source address has been set. + */ + if (V4_PART_OF_V6(udp->udp_v6src) == INADDR_ANY && + CLASSD(v4dst) && + udp->udp_multicast_if_addr != INADDR_ANY) { + IN6_IPADDR_TO_V4MAPPED(udp->udp_multicast_if_addr, + &udp->udp_v6src); + } + } else { + ASSERT(udp->udp_ipversion == IPV6_VERSION); + /* + * Interpret a zero destination to mean loopback. + * Update the T_CONN_REQ (sin/sin6) since it is used to + * generate the T_CONN_CON. 
+ */ + if (IN6_IS_ADDR_UNSPECIFIED(&v6dst)) { + v6dst = ipv6_loopback; + sin6->sin6_addr = v6dst; + } + udp->udp_v6dst = v6dst; + udp->udp_flowinfo = flowinfo; + /* + * If the destination address is multicast and + * an outgoing multicast interface has been set, + * then the ip bind logic will pick the correct source + * address (i.e. matching the outgoing multicast interface). + */ + } + + /* + * Verify that the src/port/dst/port is unique for all + * connections in TS_DATA_XFER + */ + for (udp1 = udpf->uf_udp; udp1 != NULL; udp1 = udp1->udp_bind_hash) { + if (udp1->udp_state != TS_DATA_XFER) + continue; + if (udp->udp_port != udp1->udp_port || + udp->udp_ipversion != udp1->udp_ipversion || + dstport != udp1->udp_dstport || + !IN6_ARE_ADDR_EQUAL(&udp->udp_v6src, &udp1->udp_v6src) || + !IN6_ARE_ADDR_EQUAL(&v6dst, &udp1->udp_v6dst) || + !(IPCL_ZONE_MATCH(udp->udp_connp, + udp1->udp_connp->conn_zoneid) || + IPCL_ZONE_MATCH(udp1->udp_connp, + udp->udp_connp->conn_zoneid))) + continue; + mutex_exit(&udpf->uf_lock); + udp->udp_pending_op = -1; + rw_exit(&udp->udp_rwlock); + return (-TBADADDR); + } + udp->udp_state = TS_DATA_XFER; + mutex_exit(&udpf->uf_lock); + + ire_mp = allocb(sizeof (ire_t), BPRI_HI); + if (ire_mp == NULL) { + mutex_enter(&udpf->uf_lock); + udp->udp_state = TS_IDLE; + udp->udp_pending_op = -1; + mutex_exit(&udpf->uf_lock); + rw_exit(&udp->udp_rwlock); + return (ENOMEM); + } + + rw_exit(&udp->udp_rwlock); + + ire_mp->b_wptr += sizeof (ire_t); + ire_mp->b_datap->db_type = IRE_DB_REQ_TYPE; + + if (udp->udp_family == AF_INET) { + error = ip_proto_bind_connected_v4(connp, &ire_mp, IPPROTO_UDP, + &V4_PART_OF_V6(udp->udp_v6src), udp->udp_port, + V4_PART_OF_V6(udp->udp_v6dst), udp->udp_dstport, + B_TRUE, B_TRUE); + } else { + error = ip_proto_bind_connected_v6(connp, &ire_mp, IPPROTO_UDP, + &udp->udp_v6src, udp->udp_port, &udp->udp_v6dst, + &udp->udp_sticky_ipp, udp->udp_dstport, B_TRUE, B_TRUE); + } + + return (udp_post_ip_bind_connect(udp, ire_mp, error)); 
+} + +/* ARGSUSED */ +static int +udp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, + socklen_t len, sock_connid_t *id, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + udp_t *udp = connp->conn_udp; + int error; + boolean_t did_bind = B_FALSE; + + if (sa == NULL) { + /* + * Disconnect + * Make sure we are connected + */ + if (udp->udp_state != TS_DATA_XFER) + return (EINVAL); + + error = udp_disconnect(connp); + return (error); + } + + error = proto_verify_ip_addr(udp->udp_family, sa, len); + if (error != 0) + goto done; + + /* do an implicit bind if necessary */ + if (udp->udp_state == TS_UNBND) { + error = udp_implicit_bind(connp, cr); + /* + * We could be racing with an actual bind, in which case + * we would see EPROTO. We cross our fingers and try + * to connect. + */ + if (!(error == 0 || error == EPROTO)) + goto done; + did_bind = B_TRUE; + } + /* + * set SO_DGRAM_ERRIND + */ + udp->udp_dgram_errind = B_TRUE; + + error = udp_do_connect(connp, sa, len); + + if (error != 0 && did_bind) { + int unbind_err; + + unbind_err = udp_do_unbind(connp); + ASSERT(unbind_err == 0); + } + + if (error == 0) { + *id = 0; + (*connp->conn_upcalls->su_connected) + (connp->conn_upper_handle, 0, NULL, -1); + } else if (error < 0) { + error = proto_tlitosyserr(-error); + } + +done: + if (error != 0 && udp->udp_state == TS_DATA_XFER) { + /* + * No need to hold locks to set state + * after connect failure socket state is undefined + * We set the state only to imitate old sockfs behavior + */ + udp->udp_state = TS_IDLE; + } + return (error); +} + +/* ARGSUSED */ +int +udp_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, + cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + udp_t *udp = connp->conn_udp; + udp_stack_t *us = udp->udp_us; + int error = 0; + + ASSERT(DB_TYPE(mp) == M_DATA); + + /* + * If the socket is connected and no change in destination + */ + if (msg->msg_namelen == 0) { + error = udp_send_connected(connp, 
mp, msg, cr, curproc->p_pid); + if (error == EDESTADDRREQ) + return (error); + else + return (udp->udp_dgram_errind ? error : 0); + } + + /* + * Do an implicit bind if necessary. + */ + if (udp->udp_state == TS_UNBND) { + error = udp_implicit_bind(connp, cr); + /* + * We could be racing with an actual bind, in which case + * we would see EPROTO. We cross our fingers and try + * to send. + */ + if (!(error == 0 || error == EPROTO)) { + freemsg(mp); + return (error); + } + } + + rw_enter(&udp->udp_rwlock, RW_WRITER); + + if (msg->msg_name != NULL && udp->udp_state == TS_DATA_XFER) { + rw_exit(&udp->udp_rwlock); + freemsg(mp); + return (EISCONN); + } + + + if (udp->udp_delayed_error != 0) { + boolean_t match; + + error = udp->udp_delayed_error; + match = B_FALSE; + udp->udp_delayed_error = 0; + switch (udp->udp_family) { + case AF_INET: { + /* Compare just IP address and port */ + sin_t *sin1 = (sin_t *)msg->msg_name; + sin_t *sin2 = (sin_t *)&udp->udp_delayed_addr; + + if (msg->msg_namelen == sizeof (sin_t) && + sin1->sin_port == sin2->sin_port && + sin1->sin_addr.s_addr == sin2->sin_addr.s_addr) + match = B_TRUE; + + break; + } + case AF_INET6: { + sin6_t *sin1 = (sin6_t *)msg->msg_name; + sin6_t *sin2 = (sin6_t *)&udp->udp_delayed_addr; + + if (msg->msg_namelen == sizeof (sin6_t) && + sin1->sin6_port == sin2->sin6_port && + IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr, + &sin2->sin6_addr)) + match = B_TRUE; + break; + } + default: + ASSERT(0); + } + + *((sin6_t *)&udp->udp_delayed_addr) = sin6_null; + + if (match) { + rw_exit(&udp->udp_rwlock); + freemsg(mp); + return (error); + } + } + + error = proto_verify_ip_addr(udp->udp_family, + (struct sockaddr *)msg->msg_name, msg->msg_namelen); + rw_exit(&udp->udp_rwlock); + + if (error != 0) { + freemsg(mp); + return (error); + } + + error = udp_send_not_connected(connp, mp, + (struct sockaddr *)msg->msg_name, msg->msg_namelen, msg, cr, + curproc->p_pid); + if (error != 0) { + UDP_STAT(us, udp_out_err_output); + freemsg(mp); + } 
+ return (udp->udp_dgram_errind ? error : 0); +} + +void +udp_fallback(sock_lower_handle_t proto_handle, queue_t *q, + boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb) +{ + conn_t *connp = (conn_t *)proto_handle; + udp_t *udp; + struct T_capability_ack tca; + struct sockaddr_in6 laddr, faddr; + socklen_t laddrlen, faddrlen; + short opts; + struct stroptions *stropt; + mblk_t *stropt_mp; + int error; + + udp = connp->conn_udp; + + stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, STR_NOSIG, NULL); + + /* + * setup the fallback stream that was allocated + */ + connp->conn_dev = (dev_t)RD(q)->q_ptr; + connp->conn_minor_arena = WR(q)->q_ptr; + + RD(q)->q_ptr = WR(q)->q_ptr = connp; + + WR(q)->q_qinfo = &udp_winit; + + connp->conn_rq = RD(q); + connp->conn_wq = WR(q); + + /* Notify stream head about options before sending up data */ + stropt_mp->b_datap->db_type = M_SETOPTS; + stropt_mp->b_wptr += sizeof (*stropt); + stropt = (struct stroptions *)stropt_mp->b_rptr; + stropt->so_flags = SO_WROFF | SO_HIWAT; + stropt->so_wroff = + (ushort_t)(udp->udp_max_hdr_len + udp->udp_us->us_wroff_extra); + stropt->so_hiwat = udp->udp_rcv_disply_hiwat; + putnext(RD(q), stropt_mp); + + /* + * Free the helper stream + */ + ip_close_helper_stream(connp); + + if (!direct_sockfs) + udp_disable_direct_sockfs(udp); + + /* + * Collect the information needed to sync with the sonode + */ + udp_do_capability_ack(udp, &tca, TC1_INFO); + + laddrlen = faddrlen = sizeof (sin6_t); + (void) udp_getsockname((sock_lower_handle_t)connp, + (struct sockaddr *)&laddr, &laddrlen, NULL); + error = udp_getpeername((sock_lower_handle_t)connp, + (struct sockaddr *)&faddr, &faddrlen, NULL); + if (error != 0) + faddrlen = 0; + + opts = 0; + if (udp->udp_dgram_errind) + opts |= SO_DGRAM_ERRIND; + if (udp->udp_dontroute) + opts |= SO_DONTROUTE; + + /* + * Once we grab the drain lock, no data will be send up + * to the socket. 
So we notify the socket that the endpoint + * is quiescent and it's therefore safe move data from + * the socket to the stream head. + */ + (*quiesced_cb)(connp->conn_upper_handle, q, &tca, + (struct sockaddr *)&laddr, laddrlen, + (struct sockaddr *)&faddr, faddrlen, opts); + + /* + * push up any packets that were queued in udp_t + */ + + mutex_enter(&udp->udp_recv_lock); + while (udp->udp_fallback_queue_head != NULL) { + mblk_t *mp; + mp = udp->udp_fallback_queue_head; + udp->udp_fallback_queue_head = mp->b_next; + mutex_exit(&udp->udp_recv_lock); + mp->b_next = NULL; + putnext(RD(q), mp); + mutex_enter(&udp->udp_recv_lock); + } + udp->udp_fallback_queue_tail = udp->udp_fallback_queue_head; + /* + * No longer a streams less socket + */ + connp->conn_flags &= ~IPCL_NONSTR; + mutex_exit(&udp->udp_recv_lock); + + ASSERT(connp->conn_ref >= 1); +} + +static int +udp_do_getpeername(udp_t *udp, struct sockaddr *sa, uint_t *salenp) +{ + sin_t *sin = (sin_t *)sa; + sin6_t *sin6 = (sin6_t *)sa; + + ASSERT(RW_LOCK_HELD(&udp->udp_rwlock)); + ASSERT(udp != NULL); + + if (udp->udp_state != TS_DATA_XFER) + return (ENOTCONN); + + switch (udp->udp_family) { + case AF_INET: + ASSERT(udp->udp_ipversion == IPV4_VERSION); + + if (*salenp < sizeof (sin_t)) + return (EINVAL); + + *salenp = sizeof (sin_t); + *sin = sin_null; + sin->sin_family = AF_INET; + sin->sin_port = udp->udp_dstport; + sin->sin_addr.s_addr = V4_PART_OF_V6(udp->udp_v6dst); + break; + case AF_INET6: + if (*salenp < sizeof (sin6_t)) + return (EINVAL); + + *salenp = sizeof (sin6_t); + *sin6 = sin6_null; + sin6->sin6_family = AF_INET6; + sin6->sin6_port = udp->udp_dstport; + sin6->sin6_addr = udp->udp_v6dst; + sin6->sin6_flowinfo = udp->udp_flowinfo; + break; + } + + return (0); +} + +/* ARGSUSED */ +int +udp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa, + socklen_t *salenp, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + udp_t *udp = connp->conn_udp; + int error; + + ASSERT(udp != NULL); 
+ + rw_enter(&udp->udp_rwlock, RW_READER); + + error = udp_do_getpeername(udp, sa, salenp); + + rw_exit(&udp->udp_rwlock); + + return (error); +} + +static int +udp_do_getsockname(udp_t *udp, struct sockaddr *sa, uint_t *salenp) +{ + sin_t *sin = (sin_t *)sa; + sin6_t *sin6 = (sin6_t *)sa; + + ASSERT(udp != NULL); + ASSERT(RW_LOCK_HELD(&udp->udp_rwlock)); + + switch (udp->udp_family) { + case AF_INET: + ASSERT(udp->udp_ipversion == IPV4_VERSION); + + if (*salenp < sizeof (sin_t)) + return (EINVAL); + + *salenp = sizeof (sin_t); + *sin = sin_null; + sin->sin_family = AF_INET; + if (udp->udp_state == TS_UNBND) { + break; + } + sin->sin_port = udp->udp_port; + + if (!IN6_IS_ADDR_V4MAPPED_ANY(&udp->udp_v6src) && + !IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)) { + sin->sin_addr.s_addr = V4_PART_OF_V6(udp->udp_v6src); + } else { + /* + * INADDR_ANY + * udp_v6src is not set, we might be bound to + * broadcast/multicast. Use udp_bound_v6src as + * local address instead (that could + * also still be INADDR_ANY) + */ + sin->sin_addr.s_addr = + V4_PART_OF_V6(udp->udp_bound_v6src); + } + break; + + case AF_INET6: + if (*salenp < sizeof (sin6_t)) + return (EINVAL); + + *salenp = sizeof (sin6_t); + *sin6 = sin6_null; + sin6->sin6_family = AF_INET6; + if (udp->udp_state == TS_UNBND) { + break; + } + sin6->sin6_port = udp->udp_port; + + if (!IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)) { + sin6->sin6_addr = udp->udp_v6src; + } else { + /* + * UNSPECIFIED + * udp_v6src is not set, we might be bound to + * broadcast/multicast. 
Use udp_bound_v6src as + * local address instead (that could + * also still be UNSPECIFIED) + */ + sin6->sin6_addr = udp->udp_bound_v6src; + } + } + return (0); +} + +/* ARGSUSED */ +int +udp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa, + socklen_t *salenp, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + udp_t *udp = connp->conn_udp; + int error; + + ASSERT(udp != NULL); + rw_enter(&udp->udp_rwlock, RW_READER); + + error = udp_do_getsockname(udp, sa, salenp); + + rw_exit(&udp->udp_rwlock); + + return (error); +} + +int +udp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, + void *optvalp, socklen_t *optlen, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + udp_t *udp = connp->conn_udp; + int error; + t_uscalar_t max_optbuf_len; + void *optvalp_buf; + int len; + + error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len, + udp_opt_obj.odb_opt_des_arr, + udp_opt_obj.odb_opt_arr_cnt, + udp_opt_obj.odb_topmost_tpiprovider, + B_FALSE, B_TRUE, cr); + if (error != 0) { + if (error < 0) + error = proto_tlitosyserr(-error); + return (error); + } + + optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP); + rw_enter(&udp->udp_rwlock, RW_READER); + len = udp_opt_get(connp, level, option_name, optvalp_buf); + rw_exit(&udp->udp_rwlock); + + if (len < 0) { + /* + * Pass on to IP + */ + kmem_free(optvalp_buf, max_optbuf_len); + return (ip_get_options(connp, level, option_name, + optvalp, optlen, cr)); + } else { + /* + * update optlen and copy option value + */ + t_uscalar_t size = MIN(len, *optlen); + bcopy(optvalp_buf, optvalp, size); + bcopy(&size, optlen, sizeof (size)); + + kmem_free(optvalp_buf, max_optbuf_len); + return (0); + } +} + +int +udp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, + const void *optvalp, socklen_t optlen, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + udp_t *udp = connp->conn_udp; + int error; + + error = proto_opt_check(level, 
option_name, optlen, NULL, + udp_opt_obj.odb_opt_des_arr, + udp_opt_obj.odb_opt_arr_cnt, + udp_opt_obj.odb_topmost_tpiprovider, + B_TRUE, B_FALSE, cr); + + if (error != 0) { + if (error < 0) + error = proto_tlitosyserr(-error); + return (error); + } + + rw_enter(&udp->udp_rwlock, RW_WRITER); + error = udp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name, + optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp, + NULL, cr); + rw_exit(&udp->udp_rwlock); + + if (error < 0) { + /* + * Pass on to ip + */ + error = ip_set_options(connp, level, option_name, optvalp, + optlen, cr); + } + + return (error); +} + +void +udp_clr_flowctrl(sock_lower_handle_t proto_handle) +{ + conn_t *connp = (conn_t *)proto_handle; + udp_t *udp = connp->conn_udp; + + mutex_enter(&udp->udp_recv_lock); + connp->conn_flow_cntrld = B_FALSE; + mutex_exit(&udp->udp_recv_lock); +} + +/* ARGSUSED */ +int +udp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + + /* shut down the send side */ + if (how != SHUT_RD) + (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, + SOCK_OPCTL_SHUT_SEND, 0); + /* shut down the recv side */ + if (how != SHUT_WR) + (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, + SOCK_OPCTL_SHUT_RECV, 0); + return (0); +} + +int +udp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, + int mode, int32_t *rvalp, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + int error; + + switch (cmd) { + case ND_SET: + case ND_GET: + case _SIOCSOCKFALLBACK: + case TI_GETPEERNAME: + case TI_GETMYNAME: + ip1dbg(("udp_ioctl: cmd 0x%x on non streams socket", + cmd)); + error = EINVAL; + break; + default: + /* + * Pass on to IP using helper stream + */ + error = ldi_ioctl( + connp->conn_helper_info->ip_helper_stream_handle, + cmd, arg, mode, cr, rvalp); + break; + } + return (error); +} + +/* ARGSUSED */ +int +udp_accept(sock_lower_handle_t lproto_handle, + sock_lower_handle_t 
eproto_handle, sock_upper_handle_t sock_handle, + cred_t *cr) +{ + return (EOPNOTSUPP); +} + +/* ARGSUSED */ +int +udp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr) +{ + return (EOPNOTSUPP); +} + +sock_downcalls_t sock_udp_downcalls = { + udp_activate, /* sd_activate */ + udp_accept, /* sd_accept */ + udp_bind, /* sd_bind */ + udp_listen, /* sd_listen */ + udp_connect, /* sd_connect */ + udp_getpeername, /* sd_getpeername */ + udp_getsockname, /* sd_getsockname */ + udp_getsockopt, /* sd_getsockopt */ + udp_setsockopt, /* sd_setsockopt */ + udp_send, /* sd_send */ + NULL, /* sd_send_uio */ + NULL, /* sd_recv_uio */ + NULL, /* sd_poll */ + udp_shutdown, /* sd_shutdown */ + udp_clr_flowctrl, /* sd_setflowctrl */ + udp_ioctl, /* sd_ioctl */ + udp_close /* sd_close */ +}; diff --git a/usr/src/uts/common/inet/udp/udp_opt_data.c b/usr/src/uts/common/inet/udp/udp_opt_data.c index f900d0f3e1..0ec5a2c45e 100644 --- a/usr/src/uts/common/inet/udp/udp_opt_data.c +++ b/usr/src/uts/common/inet/udp/udp_opt_data.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/stream.h> #define _SUN_TPI_VERSION 2 @@ -85,9 +83,11 @@ opdes_t udp_opt_arr[] = { { SO_PROTOTYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, { IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), 40, -1 /* not initialized */ }, + (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), + IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, { T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), 40, -1 /* not initialized */ }, + (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), + IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, { IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, { T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, @@ -318,8 +318,8 @@ uint_t udp_max_optsize; /* initialized when UDP driver is loaded */ optdb_obj_t udp_opt_obj = { udp_opt_default, /* UDP default value function pointer */ - udp_opt_get, /* UDP get function pointer */ - udp_opt_set, /* UDP set function pointer */ + udp_tpi_opt_get, /* UDP get function pointer */ + udp_tpi_opt_set, /* UDP set function pointer */ B_TRUE, /* UDP is tpi provider */ UDP_OPT_ARR_CNT, /* UDP option database count of entries */ udp_opt_arr, /* UDP option database */ diff --git a/usr/src/uts/common/inet/udp/udpddi.c b/usr/src/uts/common/inet/udp/udpddi.c index 0b80531ab8..63248365cd 100644 --- a/usr/src/uts/common/inet/udp/udpddi.c +++ b/usr/src/uts/common/inet/udp/udpddi.c @@ -30,6 +30,8 @@ #include <inet/common.h> #include <inet/ip.h> #include <inet/udp_impl.h> +#include <sys/strsubr.h> +#include <sys/socketvar.h> #define INET_NAME "udp" #define INET_MODDESC "UDP dummy STREAMS module" @@ -38,6 +40,9 @@ #define INET_MODSTRTAB dummymodinfo #define INET_DEVSTRTAB udpinfov4 #define INET_MODMTFLAGS D_MP +#define INET_SOCKDESC "UDP socket module" +#define INET_SOCK_PROTO_CREATE_FUNC (*udp_create) +#define 
INET_SOCK_PROTO_FB_FUNC (*udp_fallback) /* * We define both synchronous STREAMS and sockfs direct-access * mode for UDP module instance, because it is autopushed on diff --git a/usr/src/uts/common/inet/udp_impl.h b/usr/src/uts/common/inet/udp_impl.h index 468fa553f4..38d255ac9d 100644 --- a/usr/src/uts/common/inet/udp_impl.h +++ b/usr/src/uts/common/inet/udp_impl.h @@ -252,7 +252,9 @@ struct udp_stack { */ in_port_t us_min_anonpriv_port; + ldi_ident_t us_ldi_ident; }; + typedef struct udp_stack udp_stack_t; /* Internal udp control structure, one per open stream */ @@ -313,9 +315,14 @@ typedef struct udp_s { /* Following protected by udp_rwlock */ mblk_t *udp_rcv_list_head; /* b_next chain of mblks */ mblk_t *udp_rcv_list_tail; /* last mblk in chain */ + kmutex_t udp_recv_lock; /* recv lock */ uint_t udp_rcv_cnt; /* total data in rcv_list */ uint_t udp_rcv_msgcnt; /* total msgs in rcv_list */ + size_t udp_rcv_disply_hiwat; /* user's view of rcvbuf */ size_t udp_rcv_hiwat; /* receive high watermark */ + size_t udp_rcv_lowat; /* receive low watermark */ + size_t udp_xmit_hiwat; /* Send buffer high watermark */ + size_t udp_xmit_lowat; /* Send buffer low watermark */ uint_t udp_label_len; /* length of security label */ uint_t udp_label_len_v6; /* len of v6 security label */ in6_addr_t udp_v6lastdst; /* most recent destination */ @@ -323,6 +330,10 @@ typedef struct udp_s { uint64_t udp_open_time; /* time when this was opened */ pid_t udp_open_pid; /* process id when this was opened */ udp_stack_t *udp_us; /* Stack instance for zone */ + int udp_delayed_error; + mblk_t *udp_fallback_queue_head; + mblk_t *udp_fallback_queue_tail; + struct sockaddr_storage udp_delayed_addr; } udp_t; /* UDP Protocol header */ @@ -351,7 +362,6 @@ typedef struct udpahdr_s { #define UDP_STAT(us, x) ((us)->us_statistics.x.value.ui64++) #define UDP_STAT_UPDATE(us, x, n) \ ((us)->us_statistics.x.value.ui64 += (n)) - #ifdef DEBUG #define UDP_DBGSTAT(us, x) UDP_STAT(us, x) #else @@ -359,25 +369,19 
@@ typedef struct udpahdr_s { #endif /* DEBUG */ extern int udp_opt_default(queue_t *, t_scalar_t, t_scalar_t, uchar_t *); -extern int udp_opt_get(queue_t *, t_scalar_t, t_scalar_t, uchar_t *); -extern int udp_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *, +extern int udp_tpi_opt_get(queue_t *, t_scalar_t, t_scalar_t, uchar_t *); +extern int udp_tpi_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *, uint_t *, uchar_t *, void *, cred_t *, mblk_t *); extern mblk_t *udp_snmp_get(queue_t *, mblk_t *); extern int udp_snmp_set(queue_t *, t_scalar_t, t_scalar_t, uchar_t *, int); extern void udp_close_free(conn_t *); extern void udp_quiesce_conn(conn_t *); -extern void udp_ddi_init(void); -extern void udp_ddi_destroy(void); -extern void udp_resume_bind(conn_t *, mblk_t *); -extern void udp_wput(queue_t *, mblk_t *); - -extern int udp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, - uchar_t *ptr); -extern int udp_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, - uchar_t *ptr); -extern int udp_opt_set(queue_t *q, uint_t optset_context, - int level, int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, - uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk); +extern void udp_ddi_g_init(void); +extern void udp_ddi_g_destroy(void); +extern void udp_g_q_inactive(udp_stack_t *); +extern void udp_output(conn_t *connp, mblk_t *mp, struct sockaddr *addr, + socklen_t addrlen); +extern void udp_wput(queue_t *, mblk_t *); /* * Object to represent database of options to search passed to @@ -387,6 +391,13 @@ extern int udp_opt_set(queue_t *q, uint_t optset_context, extern optdb_obj_t udp_opt_obj; extern uint_t udp_max_optsize; +extern sock_lower_handle_t udp_create(int, int, int, sock_downcalls_t **, + uint_t *, int *, int, cred_t *); +extern void udp_fallback(sock_lower_handle_t, queue_t *, boolean_t, + so_proto_quiesced_cb_t); + +extern sock_downcalls_t sock_udp_downcalls; + #endif /* _KERNEL */ #ifdef __cplusplus diff --git 
a/usr/src/uts/common/io/comstar/port/iscsit/iscsit_isns.c b/usr/src/uts/common/io/comstar/port/iscsit/iscsit_isns.c index 0f166f77b7..2708d10c5b 100644 --- a/usr/src/uts/common/io/comstar/port/iscsit/iscsit_isns.c +++ b/usr/src/uts/common/io/comstar/port/iscsit/iscsit_isns.c @@ -42,6 +42,7 @@ #include <sys/iscsit/isns_protocol.h> #include <iscsit.h> #include <iscsit_isns.h> +#include <sys/ksocket.h> /* local defines */ #define MAX_XID (2^16) @@ -177,7 +178,7 @@ static void isnst_esi_thread(void *arg); static boolean_t -isnst_handle_esi_req(struct sonode *so, isns_pdu_t *pdu, size_t pl_size); +isnst_handle_esi_req(ksocket_t so, isns_pdu_t *pdu, size_t pl_size); static void isnst_esi_start(isns_portal_list_t *portal); static void isnst_esi_stop(); @@ -303,22 +304,22 @@ isnst_esi_stop_thread(isns_esi_tinfo_t *tinfop) list_remove(&esi_list, tinfop); /* - * The only way to break a thread waiting in soaccept() is to signal - * it with EINTR. See idm_so_tgt_svc_offline for more detail. - */ - tinfop->esi_so->so_error = EINTR; - cv_signal(&tinfop->esi_so->so_connind_cv); - - /* - * Must also drop the global lock in case the esi thread is running - * and trying to update the server timestamps. + * The only way to break a thread waiting in ksocket_accept() is to call + * ksocket_close. 
*/ mutex_exit(&isns_esi_mutex); ISNS_GLOBAL_UNLOCK(); + idm_soshutdown(tinfop->esi_so); + idm_sodestroy(tinfop->esi_so); thread_join(tinfop->esi_thread_did); ISNS_GLOBAL_LOCK(); mutex_enter(&isns_esi_mutex); + tinfop->esi_thread_running = B_FALSE; + tinfop->esi_so = NULL; + tinfop->esi_port = 0; + tinfop->esi_registered = B_FALSE; + cv_signal(&isns_esi_cv); tinfop->esi_portal->portal_esi = NULL; kmem_free(tinfop, sizeof (isns_esi_tinfo_t)); } @@ -630,18 +631,22 @@ isnst_stop() */ static void -isnst_update_server_timestamp(struct sonode *so) +isnst_update_server_timestamp(ksocket_t so) { iscsit_isns_svr_t *svr; struct in_addr *sin = NULL, *svr_in; struct in6_addr *sin6 = NULL, *svr_in6; - - if (so->so_faddr_sa->sa_family == AF_INET) { - sin = &((struct sockaddr_in *) - ((void *)so->so_faddr_sa))->sin_addr; + struct sockaddr_in6 t_addr; + socklen_t t_addrlen; + + bzero(&t_addr, sizeof (struct sockaddr_in6)); + t_addrlen = sizeof (struct sockaddr_in6); + (void) ksocket_getpeername(so, (struct sockaddr *)&t_addr, &t_addrlen, + CRED()); + if (((struct sockaddr *)(&t_addr))->sa_family == AF_INET) { + sin = &((struct sockaddr_in *)((void *)(&t_addr)))->sin_addr; } else { - sin6 = &((struct sockaddr_in6 *) - ((void *)so->so_faddr_sa))->sin6_addr; + sin6 = &(&t_addr)->sin6_addr; } /* @@ -1982,7 +1987,7 @@ static void * isnst_open_so(struct sockaddr_storage *sa) { int sa_sz; - struct sonode *so; + ksocket_t so; /* determin local IP address */ if (sa->ss_family == AF_INET) { @@ -2000,7 +2005,8 @@ isnst_open_so(struct sockaddr_storage *sa) } if (so != NULL) { - if (soconnect(so, (struct sockaddr *)sa, sa_sz, 0, 0) != 0) { + if (ksocket_connect(so, (struct sockaddr *)sa, sa_sz, CRED()) + != 0) { /* not calling isnst_close_so() to */ /* make dtrace output look clear */ idm_soshutdown(so); @@ -2133,7 +2139,7 @@ static void isnst_esi_thread(void *arg) { isns_esi_tinfo_t *tinfop; - struct sonode *newso; + ksocket_t newso; struct sockaddr_in sin; struct sockaddr_in6 sin6; uint32_t 
on; @@ -2141,6 +2147,14 @@ isnst_esi_thread(void *arg) isns_pdu_t *pdu; size_t pl_size; int family; + struct sockaddr_in t_addr; + struct sockaddr_in6 t_addr6; + socklen_t t_addrlen; + socklen_t t_addrlen6; + + bzero(&t_addr, sizeof (t_addr)); /* t_addr is a sockaddr_in: size the clear by the object itself */ + t_addrlen = sizeof (struct sockaddr_in); + t_addrlen6 = sizeof (struct sockaddr_in6); tinfop = (isns_esi_tinfo_t *)arg; tinfop->esi_thread_did = curthread->t_did; @@ -2155,7 +2169,6 @@ isnst_esi_thread(void *arg) family = AF_INET6; } - if ((tinfop->esi_so = idm_socreate(family, SOCK_STREAM, 0)) == NULL) { cmn_err(CE_WARN, @@ -2166,7 +2179,7 @@ isnst_esi_thread(void *arg) mutex_exit(&isns_esi_mutex); thread_exit(); } - + ksocket_hold(tinfop->esi_so); /* * Set options, bind, and listen until we're told to stop */ @@ -2181,17 +2194,19 @@ isnst_esi_thread(void *arg) &sin.sin_addr.s_addr, sizeof (in_addr_t)); on = 1; - (void) sosetsockopt(tinfop->esi_so, SOL_SOCKET, SO_REUSEADDR, - (char *)&on, sizeof (on)); + (void) ksocket_setsockopt(tinfop->esi_so, SOL_SOCKET, + SO_REUSEADDR, (char *)&on, sizeof (on), CRED()); - if (sobind(tinfop->esi_so, (struct sockaddr *)&sin, - sizeof (sin), 0, 0) != 0) { + if (ksocket_bind(tinfop->esi_so, (struct sockaddr *)&sin, + sizeof (sin), CRED()) != 0) { idm_sodestroy(tinfop->esi_so); tinfop->esi_so = NULL; tinfop->esi_thread_failed = B_TRUE; } else { + (void) ksocket_getsockname(tinfop->esi_so, + (struct sockaddr *)(&t_addr), &t_addrlen, CRED()); tinfop->esi_port = ntohs(((struct sockaddr_in *) - ((void *)tinfop->esi_so->so_laddr_sa))->sin_port); + (&t_addr))->sin_port); } break; @@ -2205,17 +2220,19 @@ isnst_esi_thread(void *arg) &sin6.sin6_addr.s6_addr, sizeof (in6_addr_t)); on = 1; - (void) sosetsockopt(tinfop->esi_so, SOL_SOCKET, - SO_REUSEADDR, (char *)&on, sizeof (on)); + (void) ksocket_setsockopt(tinfop->esi_so, SOL_SOCKET, + SO_REUSEADDR, (char *)&on, sizeof (on), CRED()); - if (sobind(tinfop->esi_so, (struct sockaddr *)&sin6, - sizeof (sin6), 0, 0) != 0) { + if 
(ksocket_bind(tinfop->esi_so, (struct sockaddr *)&sin6, + sizeof (sin6), CRED()) != 0) { idm_sodestroy(tinfop->esi_so); tinfop->esi_so = NULL; tinfop->esi_thread_failed = B_TRUE; } else { + (void) ksocket_getsockname(tinfop->esi_so, + (struct sockaddr *)(&t_addr6), &t_addrlen6, CRED()); tinfop->esi_port = ntohs(((struct sockaddr_in6 *) - ((void *)tinfop->esi_so->so_laddr_sa))->sin6_port); + (&t_addr6))->sin6_port); } break; @@ -2226,7 +2243,7 @@ isnst_esi_thread(void *arg) goto esi_thread_exit; } - if ((rc = solisten(tinfop->esi_so, 5)) != 0) { + if ((rc = ksocket_listen(tinfop->esi_so, 5, CRED())) != 0) { cmn_err(CE_WARN, "isnst_esi_thread: listen failure 0x%x", rc); goto esi_thread_exit; } @@ -2244,21 +2261,21 @@ isnst_esi_thread(void *arg) DTRACE_PROBE2(iscsit__isns__esi__accept__wait, boolean_t, tinfop->esi_thread_running, boolean_t, tinfop->esi_thread_failed); - if ((rc = soaccept(tinfop->esi_so, 0, &newso)) != 0) { + if ((rc = ksocket_accept(tinfop->esi_so, NULL, NULL, + &newso, CRED())) != 0) { mutex_enter(&isns_esi_mutex); DTRACE_PROBE2(iscsit__isns__esi__accept__fail, boolean_t, tinfop->esi_thread_running, boolean_t, tinfop->esi_thread_failed); /* - * If we were interrupted with EINTR, it's not - * really a failure. + * If we were interrupted with EINTR + * it's not really a failure. 
*/ if (rc != EINTR) { cmn_err(CE_WARN, "isnst_esi_thread: " "accept failure (0x%x)", rc); tinfop->esi_thread_failed = B_TRUE; } - tinfop->esi_thread_running = B_FALSE; continue; } @@ -2281,7 +2298,7 @@ isnst_esi_thread(void *arg) tinfop->esi_registered = B_TRUE; } - (void) soshutdown(newso, SHUT_RDWR); + (void) ksocket_close(newso, CRED()); /* * Do not hold the esi mutex during server timestamp @@ -2295,15 +2312,7 @@ isnst_esi_thread(void *arg) } mutex_exit(&isns_esi_mutex); esi_thread_exit: - idm_soshutdown(tinfop->esi_so); - idm_sodestroy(tinfop->esi_so); - mutex_enter(&isns_esi_mutex); - tinfop->esi_thread_running = B_FALSE; - tinfop->esi_so = NULL; - tinfop->esi_port = 0; - tinfop->esi_registered = B_FALSE; - cv_signal(&isns_esi_cv); - mutex_exit(&isns_esi_mutex); + ksocket_rele(tinfop->esi_so); thread_exit(); } @@ -2312,7 +2321,7 @@ esi_thread_exit: */ static boolean_t -isnst_handle_esi_req(struct sonode *so, isns_pdu_t *pdu, size_t pl_size) +isnst_handle_esi_req(ksocket_t ks, isns_pdu_t *pdu, size_t pl_size) { isns_pdu_t *rsp_pdu; isns_resp_t *rsp; @@ -2353,7 +2362,7 @@ isnst_handle_esi_req(struct sonode *so, isns_pdu_t *pdu, size_t pl_size) bcopy(pdu->payload, rsp->data, pl_len - 4); rsp_pdu->payload_len = htons(pl_len); - if (isnst_send_pdu(so, rsp_pdu) != 0) { + if (isnst_send_pdu(ks, rsp_pdu) != 0) { cmn_err(CE_WARN, "isnst_handle_esi_req: Send response failed"); esirv = B_FALSE; } diff --git a/usr/src/uts/common/io/comstar/port/iscsit/iscsit_isns.h b/usr/src/uts/common/io/comstar/port/iscsit/iscsit_isns.h index 40c111f491..af0d8982bb 100644 --- a/usr/src/uts/common/io/comstar/port/iscsit/iscsit_isns.h +++ b/usr/src/uts/common/io/comstar/port/iscsit/iscsit_isns.h @@ -62,7 +62,7 @@ typedef struct { struct isns_portal_list_s *esi_portal; kthread_t *esi_thread; kt_did_t esi_thread_did; - struct sonode *esi_so; + ksocket_t esi_so; uint16_t esi_port; boolean_t esi_thread_running; boolean_t esi_thread_failed; diff --git 
a/usr/src/uts/common/io/comstar/port/iscsit/iscsit_radiuspacket.c b/usr/src/uts/common/io/comstar/port/iscsit/iscsit_radiuspacket.c index 2441e3b65c..912158cb2d 100644 --- a/usr/src/uts/common/io/comstar/port/iscsit/iscsit_radiuspacket.c +++ b/usr/src/uts/common/io/comstar/port/iscsit/iscsit_radiuspacket.c @@ -32,18 +32,19 @@ #include <sys/idm/idm_so.h> #include <sys/iscsit/radius_packet.h> #include <sys/iscsit/radius_protocol.h> +#include <sys/ksocket.h> static void encode_chap_password(int identifier, int chap_passwd_len, uint8_t *chap_passwd, uint8_t *result); -static size_t iscsit_net_recvmsg(void *socket, struct msghdr *msg, +static size_t iscsit_net_recvmsg(ksocket_t socket, struct msghdr *msg, int timeout); /* * See radius_packet.h. */ int -iscsit_snd_radius_request(void *socket, iscsi_ipaddr_t rsvr_ip_addr, +iscsit_snd_radius_request(ksocket_t socket, iscsi_ipaddr_t rsvr_ip_addr, uint32_t rsvr_port, radius_packet_data_t *req_data) { int i; /* Loop counter. */ @@ -164,7 +165,7 @@ iscsit_snd_radius_request(void *socket, iscsi_ipaddr_t rsvr_ip_addr, * See radius_packet.h. 
*/ int -iscsit_rcv_radius_response(void *socket, uint8_t *shared_secret, +iscsit_rcv_radius_response(ksocket_t socket, uint8_t *shared_secret, uint32_t shared_secret_len, uint8_t *req_authenticator, radius_packet_data_t *resp_data) { @@ -177,8 +178,6 @@ iscsit_rcv_radius_response(void *socket, uint8_t *shared_secret, struct iovec iov[1]; struct nmsghdr msg; - struct sonode *so = (struct sonode *)socket; - int ret = 0; tmp_data = kmem_zalloc(MAX_RAD_PACKET_LEN, KM_SLEEP); iov[0].iov_base = (char *)tmp_data; @@ -193,11 +192,6 @@ iscsit_rcv_radius_response(void *socket, uint8_t *shared_secret, msg.msg_iov = iov; msg.msg_iovlen = 1; - (void) VOP_IOCTL(SOTOV(so), I_POP, 0, FKIOCTL, CRED(), &ret, NULL); - if (ret != 0) { - return (RAD_RSP_RCVD_NO_DATA); - } - received_len = iscsit_net_recvmsg(socket, &msg, RAD_RCV_TIMEOUT); if (received_len <= (size_t)0) { @@ -313,36 +307,32 @@ encode_chap_password(int identifier, int chap_passwd_len, */ /* ARGSUSED */ static size_t -iscsit_net_recvmsg(void *socket, struct msghdr *msg, int timeout) +iscsit_net_recvmsg(ksocket_t socket, struct msghdr *msg, int timeout) { - int idx; - int total_len = 0; - struct uio uio; - uchar_t pri = 0; - int prflag = MSG_ANY; - rval_t rval; - struct sonode *sonode = (struct sonode *)socket; - - /* Initialization of the uio structure. 
*/ - bzero(&uio, sizeof (uio)); - uio.uio_iov = msg->msg_iov; - uio.uio_iovcnt = msg->msg_iovlen; - uio.uio_segflg = UIO_SYSSPACE; - - for (idx = 0; idx < msg->msg_iovlen; idx++) { - total_len += (msg->msg_iov)[idx].iov_len; - } - uio.uio_resid = total_len; - + int prflag = msg->msg_flags; + size_t recv = 0; + struct sockaddr_in6 l_addr, f_addr; + socklen_t l_addrlen; + socklen_t f_addrlen; + + bzero(&l_addr, sizeof (struct sockaddr_in6)); + bzero(&f_addr, sizeof (struct sockaddr_in6)); + l_addrlen = sizeof (struct sockaddr_in6); + f_addrlen = sizeof (struct sockaddr_in6); /* If timeout requested on receive */ if (timeout > 0) { boolean_t loopback = B_FALSE; + (void) ksocket_getsockname(socket, (struct sockaddr *)(&l_addr), + &l_addrlen, CRED()); + (void) ksocket_getpeername(socket, (struct sockaddr *)(&f_addr), + &f_addrlen, CRED()); + /* And this isn't a loopback connection */ - if (sonode->so_laddr.soa_sa->sa_family == AF_INET) { + if (((struct sockaddr *)(&l_addr))->sa_family == AF_INET) { struct sockaddr_in *lin = (struct sockaddr_in *) - ((void *)sonode->so_laddr.soa_sa); + ((void *)(&l_addr)); struct sockaddr_in *fin = (struct sockaddr_in *) - ((void *)sonode->so_faddr.soa_sa); + ((void *)(&f_addr)); if ((lin->sin_family == fin->sin_family) && (bcmp(&lin->sin_addr, &fin->sin_addr, @@ -351,9 +341,9 @@ iscsit_net_recvmsg(void *socket, struct msghdr *msg, int timeout) } } else { struct sockaddr_in6 *lin6 = (struct sockaddr_in6 *) - ((void *)sonode->so_laddr.soa_sa); + ((void *)(&l_addr)); struct sockaddr_in6 *fin6 = (struct sockaddr_in6 *) - ((void *)sonode->so_faddr.soa_sa); + ((void *)(&f_addr)); if ((lin6->sin6_family == fin6->sin6_family) && (bcmp(&lin6->sin6_addr, &fin6->sin6_addr, @@ -361,23 +351,20 @@ iscsit_net_recvmsg(void *socket, struct msghdr *msg, int timeout) loopback = B_TRUE; } } - if (loopback == B_FALSE) { - /* - * Then poll device for up to the timeout - * period or the requested data is received. 
- */ - if (kstrgetmsg(SOTOV(sonode), - NULL, NULL, &pri, &prflag, timeout * 1000, - &rval) == ETIME) { + struct timeval tl; + tl.tv_sec = timeout; + tl.tv_usec = 0; + /* Set recv timeout */ + if (ksocket_setsockopt(socket, SOL_SOCKET, SO_RCVTIMEO, + &tl, sizeof (struct timeval), CRED())) return (0); - } } } /* * Receive the requested data. Block until all - * data is received. + * data is received or timeout. * * resid occurs only when the connection is * disconnected. In that case it will return @@ -385,6 +372,6 @@ iscsit_net_recvmsg(void *socket, struct msghdr *msg, int timeout) * In general this is the total amount we * requested. */ - (void) sorecvmsg((struct sonode *)socket, msg, &uio); - return (total_len - uio.uio_resid); + (void) ksocket_recvmsg(socket, msg, prflag, &recv, CRED()); + return (recv); } diff --git a/usr/src/uts/common/io/ib/clients/rds/rds_opt.c b/usr/src/uts/common/io/ib/clients/rds/rds_opt.c index f0e863d0f3..902d838ff4 100644 --- a/usr/src/uts/common/io/ib/clients/rds/rds_opt.c +++ b/usr/src/uts/common/io/ib/clients/rds/rds_opt.c @@ -19,14 +19,12 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/ib/clients/rds/rds.h> -#include <inet/mi.h> +#include <inet/proto_set.h> #define rds_max_buf 2097152 opdes_t rds_opt_arr[] = { @@ -143,7 +141,7 @@ rds_opt_set(queue_t *q, uint_t optset_context, int level, } if (!checkonly) { RD(q)->q_hiwat = *i1; - (void) mi_set_sth_hiwat(RD(q), *i1); + (void) proto_set_rx_hiwat(RD(q), NULL, *i1); } break; default: diff --git a/usr/src/uts/common/io/ib/clients/rds/rdsddi.c b/usr/src/uts/common/io/ib/clients/rds/rdsddi.c index 306a2a593e..877e56fe8a 100644 --- a/usr/src/uts/common/io/ib/clients/rds/rdsddi.c +++ b/usr/src/uts/common/io/ib/clients/rds/rdsddi.c @@ -23,7 +23,6 @@ * Use is subject to license terms. 
*/ - #include <sys/types.h> #include <sys/conf.h> #include <sys/modctl.h> @@ -43,6 +42,7 @@ #include <inet/common.h> #include <inet/ip.h> #include <inet/mi.h> +#include <inet/proto_set.h> #include <sys/ib/clients/rds/rds.h> #include <sys/policy.h> #include <inet/ipclassifier.h> @@ -226,8 +226,8 @@ rds_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) WR(q)->q_lowat = rds_xmit_lowat; /* Set the Stream head watermarks */ - (void) mi_set_sth_hiwat(q, rds_recv_hiwat); - (void) mi_set_sth_lowat(q, rds_recv_lowat); + (void) proto_set_rx_hiwat(q, NULL, rds_recv_hiwat); + (void) proto_set_rx_lowat(q, NULL, rds_recv_lowat); return (0); } @@ -337,7 +337,7 @@ rds_deliver_new_msg(mblk_t *mp, ipaddr_t local_addr, ipaddr_t rem_addr, if (rds->rds_port_quota > current_port_quota) { /* this may result in stalling the port */ rds->rds_port_quota = current_port_quota; - (void) mi_set_sth_hiwat(rds->rds_ulpd, + (void) proto_set_rx_hiwat(rds->rds_ulpd, NULL, rds->rds_port_quota * UserBufferSize); RDS_INCR_PORT_QUOTA_ADJUSTED(); } @@ -599,7 +599,8 @@ rds_bind(queue_t *q, mblk_t *mp) RDS_INCR_NPORT(); rds->rds_port_quota = RDS_CURRENT_PORT_QUOTA(); RDS_SET_PORT_QUOTA(rds->rds_port_quota); - (void) mi_set_sth_hiwat(RD(q), rds->rds_port_quota * UserBufferSize); + (void) proto_set_rx_hiwat(RD(q), NULL, + rds->rds_port_quota * UserBufferSize); qreply(q, mp); } @@ -859,7 +860,7 @@ rds_rsrv(queue_t *q) current_port_quota = RDS_GET_PORT_QUOTA(); if (rds->rds_port_quota != current_port_quota) { rds->rds_port_quota = current_port_quota; - (void) mi_set_sth_hiwat(q, + (void) proto_set_rx_hiwat(q, NULL, rds->rds_port_quota * UserBufferSize); } diff --git a/usr/src/uts/common/io/ib/clients/sdp/sdpddi.c b/usr/src/uts/common/io/ib/clients/sdp/sdpddi.c index 0973888811..d0c3bb8b4e 100644 --- a/usr/src/uts/common/io/ib/clients/sdp/sdpddi.c +++ b/usr/src/uts/common/io/ib/clients/sdp/sdpddi.c @@ -23,7 +23,6 @@ * Use is subject to license terms. 
*/ - #include <sys/types.h> #include <sys/conf.h> #include <sys/modctl.h> @@ -182,9 +181,12 @@ sdp_gen_ioctl(queue_t *q, mblk_t *mp) /* LINTED */ iocp = (struct iocblk *)mp->b_rptr; switch (iocp->ioc_cmd) { + uintptr_t send_enable; case SIOCSENABLESDP: bcopy(mp->b_cont->b_rptr, &enable, sizeof (int)); + send_enable = enable; + /* * Check for root privs. * if not net config privs - return state of system SDP @@ -202,7 +204,8 @@ sdp_gen_ioctl(queue_t *q, mblk_t *mp) * action of enabling/disabling sdp is simply acked. */ rw_enter(&sdp_transport_lock, RW_READER); - if ((enable == 1) && (sdp_transport_handle == NULL) && + if ((send_enable == 1) && + (sdp_transport_handle == NULL) && (priv == B_TRUE)) { /* Initialize sdpib transport driver */ rw_exit(&sdp_transport_lock); @@ -215,21 +218,20 @@ sdp_gen_ioctl(queue_t *q, mblk_t *mp) enable = 0; goto done; } - (void) sdp_ioctl(NULL, iocp->ioc_cmd, &enable, - CRED()); - } else if ((enable == 0) && - (sdp_transport_handle != NULL) && - (priv == B_TRUE)) { - (void) sdp_ioctl(NULL, iocp->ioc_cmd, &enable, - CRED()); - (void) ldi_close(sdp_transport_handle, - FNDELAY, kcred); - sdp_transport_handle = NULL; + (void) ldi_ioctl(sdp_transport_handle, + iocp->ioc_cmd, (intptr_t)&send_enable, + FKIOCTL, CRED(), (int *)&enable); + } else if (sdp_transport_handle != NULL) { + (void) ldi_ioctl(sdp_transport_handle, + iocp->ioc_cmd, (intptr_t)&send_enable, + FKIOCTL, CRED(), (int *)&enable); + if (send_enable == 0 && priv == B_TRUE) { + (void) ldi_close(sdp_transport_handle, + FNDELAY, kcred); + sdp_transport_handle = NULL; + } } else { - ret = sdp_ioctl(NULL, iocp->ioc_cmd, &enable, - CRED()); - if (ret == EINVAL) - enable = 0; + enable = 0; } rw_exit(&sdp_transport_lock); diff --git a/usr/src/uts/common/io/idm/idm_so.c b/usr/src/uts/common/io/idm/idm_so.c index b8c236d749..c868c76ddd 100644 --- a/usr/src/uts/common/io/idm/idm_so.c +++ b/usr/src/uts/common/io/idm/idm_so.c @@ -45,7 +45,7 @@ #include <netinet/in.h> #include <net/if.h> 
#include <sys/sockio.h> - +#include <sys/ksocket.h> #include <sys/idm/idm.h> #include <sys/idm/idm_so.h> #include <sys/idm/idm_text.h> @@ -60,14 +60,13 @@ static void idm_sorx_cache_pdu_cb(idm_pdu_t *pdu, idm_status_t status); static void idm_sorx_addl_pdu_cb(idm_pdu_t *pdu, idm_status_t status); static void idm_sotx_cache_pdu_cb(idm_pdu_t *pdu, idm_status_t status); -static idm_status_t idm_so_conn_create_common(idm_conn_t *ic, - struct sonode *new_so); +static idm_status_t idm_so_conn_create_common(idm_conn_t *ic, ksocket_t new_so); static void idm_so_conn_destroy_common(idm_conn_t *ic); static void idm_so_conn_connect_common(idm_conn_t *ic); static void idm_set_ini_preconnect_options(idm_so_conn_t *sc); static void idm_set_ini_postconnect_options(idm_so_conn_t *sc); -static void idm_set_tgt_connect_options(struct sonode *sonode); +static void idm_set_tgt_connect_options(ksocket_t so); static idm_status_t idm_i_so_tx(idm_pdu_t *pdu); static idm_status_t idm_sorecvdata(idm_conn_t *ic, idm_pdu_t *pdu); @@ -180,58 +179,17 @@ idm_so_fini(void) kmem_cache_destroy(idm.idm_sorx_pdu_cache); } -struct sonode * +ksocket_t idm_socreate(int domain, int type, int protocol) { - vnode_t *dvp; - vnode_t *vp; - struct snode *csp; - int err; - major_t maj; - - if ((vp = solookup(domain, type, protocol, NULL, &err)) == NULL) { - - /* - * solookup calls sogetvp if the vp is not found in the cache. - * Since the call to sogetvp is hardwired to use USERSPACE - * and declared static we'll do the work here instead. - */ - err = lookupname(type == SOCK_STREAM ? 
"/dev/tcp" : "/dev/udp", - UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); - if (err != 0) - return (NULL); - - /* Check that it is the correct vnode */ - if (vp->v_type != VCHR) { - VN_RELE(vp); - return (NULL); - } - - csp = VTOS(VTOS(vp)->s_commonvp); - if (!(csp->s_flag & SDIPSET)) { - char *pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP); - - err = ddi_dev_pathname(vp->v_rdev, S_IFCHR, - pathname); - if (err == 0) { - err = devfs_lookupname(pathname, NULLVPP, - &dvp); - } - VN_RELE(vp); - kmem_free(pathname, MAXPATHLEN); - if (err != 0) { - return (NULL); - } - vp = dvp; - } + ksocket_t ks; - maj = getmajor(vp->v_rdev); - if (!STREAMSTAB(maj)) { - VN_RELE(vp); - return (NULL); - } + if (!ksocket_socket(&ks, domain, type, protocol, KSOCKET_NOSLEEP, + CRED())) { + return (ks); + } else { + return (NULL); } - return (socreate(vp, domain, type, protocol, SOV_DEFAULT, NULL, &err)); } /* @@ -242,9 +200,9 @@ idm_socreate(int domain, int type, int protocol) * regain control of a thread stuck in idm_sorecv. */ void -idm_soshutdown(struct sonode *so) +idm_soshutdown(ksocket_t so) { - (void) soshutdown(so, SHUT_RDWR); + (void) ksocket_shutdown(so, SHUT_RDWR, CRED()); } /* @@ -254,13 +212,9 @@ idm_soshutdown(struct sonode *so) * otherwise undefined behavior will result. 
*/ void -idm_sodestroy(struct sonode *so) +idm_sodestroy(ksocket_t ks) { - vnode_t *vp = SOTOV(so); - - (void) VOP_CLOSE(vp, 0, 1, 0, kcred, NULL); - - VN_RELE(vp); + (void) ksocket_close(ks, CRED()); } /* @@ -303,8 +257,7 @@ idm_v6_addr_okay(struct in6_addr *addr6) int idm_get_ipaddr(idm_addr_list_t **ipaddr_p) { - struct sonode *so4, *so6; - vnode_t *vp, *vp4, *vp6; + ksocket_t so4, so6; struct lifnum lifn; struct lifconf lifc; struct lifreq *lp; @@ -332,19 +285,15 @@ idm_get_ipaddr(idm_addr_list_t **ipaddr_p) return (0); } - /* setup the vp's for each socket type */ - vp6 = SOTOV(so6); - vp4 = SOTOV(so4); - /* use vp6 for ioctls with unspecified families by default */ - vp = vp6; retry_count: /* snapshot the current number of interfaces */ lifn.lifn_family = PF_UNSPEC; lifn.lifn_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES; lifn.lifn_count = 0; - if (VOP_IOCTL(vp, SIOCGLIFNUM, (intptr_t)&lifn, FKIOCTL, kcred, - &rval, NULL) != 0) { + /* use vp6 for ioctls with unspecified families by default */ + if (ksocket_ioctl(so6, SIOCGLIFNUM, (intptr_t)&lifn, &rval, CRED()) + != 0) { goto cleanup; } @@ -364,8 +313,7 @@ retry_count: lifc.lifc_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES; lifc.lifc_len = bufsize; lifc.lifc_buf = buf; - rc = VOP_IOCTL(vp, SIOCGLIFCONF, (intptr_t)&lifc, FKIOCTL, kcred, - &rval, NULL); + rc = ksocket_ioctl(so6, SIOCGLIFCONF, (intptr_t)&lifc, &rval, CRED()); if (rc != 0) { goto cleanup; } @@ -401,16 +349,16 @@ retry_count: */ switch (ss.ss_family) { case AF_INET: - vp = vp4; + rc = ksocket_ioctl(so4, SIOCGLIFFLAGS, (intptr_t)lp, + &rval, CRED()); break; case AF_INET6: - vp = vp6; + rc = ksocket_ioctl(so6, SIOCGLIFFLAGS, (intptr_t)lp, + &rval, CRED()); break; default: continue; } - rc = VOP_IOCTL(vp, SIOCGLIFFLAGS, (intptr_t)lp, FKIOCTL, kcred, - &rval, NULL); if (rc == 0) { /* * If we got the flags, skip uninteresting @@ -468,7 +416,7 @@ cleanup: } int -idm_sorecv(struct sonode *so, void *msg, size_t len) +idm_sorecv(ksocket_t 
so, void *msg, size_t len) { iovec_t iov; @@ -495,13 +443,13 @@ idm_sorecv(struct sonode *so, void *msg, size_t len) * -1 if sosendmsg returns success but uio_resid != 0 */ int -idm_sosendto(struct sonode *so, void *buff, size_t len, +idm_sosendto(ksocket_t so, void *buff, size_t len, struct sockaddr *name, socklen_t namelen) { struct msghdr msg; - struct uio uio; struct iovec iov[1]; int error; + size_t sent = 0; iov[0].iov_base = buff; iov[0].iov_len = len; @@ -510,19 +458,12 @@ idm_sosendto(struct sonode *so, void *buff, size_t len, bzero(&msg, sizeof (msg)); msg.msg_iov = iov; msg.msg_iovlen = 1; - - /* Initialization of the uio structure. */ - uio.uio_iov = iov; - uio.uio_iovcnt = 1; - uio.uio_segflg = UIO_SYSSPACE; - uio.uio_resid = len; - msg.msg_name = name; msg.msg_namelen = namelen; - if ((error = sosendmsg(so, &msg, &uio)) == 0) { + if ((error = ksocket_sendmsg(so, &msg, 0, &sent, CRED())) == 0) { /* Data sent */ - if (uio.uio_resid == 0) { + if (sent == len) { /* All data sent. Success. */ return (0); } else { @@ -546,11 +487,11 @@ idm_sosendto(struct sonode *so, void *buff, size_t len, * -1 if sosendmsg returns success but uio_resid != 0 */ int -idm_iov_sosend(struct sonode *so, iovec_t *iop, int iovlen, size_t total_len) +idm_iov_sosend(ksocket_t so, iovec_t *iop, int iovlen, size_t total_len) { struct msghdr msg; - struct uio uio; int error; + size_t sent = 0; ASSERT(iop != NULL); @@ -559,16 +500,10 @@ idm_iov_sosend(struct sonode *so, iovec_t *iop, int iovlen, size_t total_len) msg.msg_iov = iop; msg.msg_iovlen = iovlen; - /* Initialization of the uio structure. */ - bzero(&uio, sizeof (uio)); - uio.uio_iov = iop; - uio.uio_iovcnt = iovlen; - uio.uio_segflg = UIO_SYSSPACE; - uio.uio_resid = total_len; - - if ((error = sosendmsg(so, &msg, &uio)) == 0) { + if ((error = ksocket_sendmsg(so, &msg, 0, &sent, CRED())) + == 0) { /* Data sent */ - if (uio.uio_resid == 0) { + if (sent == total_len) { /* All data sent. Success. 
*/ return (0); } else { @@ -592,30 +527,25 @@ idm_iov_sosend(struct sonode *so, iovec_t *iop, int iovlen, size_t total_len) * -1 if sorecvmsg returns success but uio_resid != 0 */ int -idm_iov_sorecv(struct sonode *so, iovec_t *iop, int iovlen, size_t total_len) +idm_iov_sorecv(ksocket_t so, iovec_t *iop, int iovlen, size_t total_len) { struct msghdr msg; - struct uio uio; int error; + size_t recv; + int flags; ASSERT(iop != NULL); /* Initialization of the message header. */ bzero(&msg, sizeof (msg)); msg.msg_iov = iop; - msg.msg_flags = MSG_WAITALL; msg.msg_iovlen = iovlen; + flags = MSG_WAITALL; - /* Initialization of the uio structure. */ - bzero(&uio, sizeof (uio)); - uio.uio_iov = iop; - uio.uio_iovcnt = iovlen; - uio.uio_segflg = UIO_SYSSPACE; - uio.uio_resid = total_len; - - if ((error = sorecvmsg(so, &msg, &uio)) == 0) { + if ((error = ksocket_recvmsg(so, &msg, flags, &recv, CRED())) + == 0) { /* Received data */ - if (uio.uio_resid == 0) { + if (recv == total_len) { /* All requested data received. 
Success */ return (0); } else { @@ -639,12 +569,14 @@ idm_set_ini_preconnect_options(idm_so_conn_t *sc) int abort = 30000; /* Pre-connect socket options */ - (void) sosetsockopt(sc->ic_so, IPPROTO_TCP, TCP_CONN_NOTIFY_THRESHOLD, - (char *)&conn_notify, sizeof (int)); - (void) sosetsockopt(sc->ic_so, IPPROTO_TCP, TCP_CONN_ABORT_THRESHOLD, - (char *)&conn_abort, sizeof (int)); - (void) sosetsockopt(sc->ic_so, IPPROTO_TCP, TCP_ABORT_THRESHOLD, - (char *)&abort, sizeof (int)); + (void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP, + TCP_CONN_NOTIFY_THRESHOLD, (char *)&conn_notify, sizeof (int), + CRED()); + (void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP, + TCP_CONN_ABORT_THRESHOLD, (char *)&conn_abort, sizeof (int), + CRED()); + (void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP, TCP_ABORT_THRESHOLD, + (char *)&abort, sizeof (int), CRED()); } static void @@ -655,28 +587,28 @@ idm_set_ini_postconnect_options(idm_so_conn_t *sc) const int on = 1; /* Set postconnect options */ - (void) sosetsockopt(sc->ic_so, IPPROTO_TCP, TCP_NODELAY, - (char *)&on, sizeof (int)); - (void) sosetsockopt(sc->ic_so, SOL_SOCKET, SO_RCVBUF, - (char *)&rcvbuf, sizeof (int)); - (void) sosetsockopt(sc->ic_so, SOL_SOCKET, SO_SNDBUF, - (char *)&sndbuf, sizeof (int)); + (void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP, TCP_NODELAY, + (char *)&on, sizeof (int), CRED()); + (void) ksocket_setsockopt(sc->ic_so, SOL_SOCKET, SO_RCVBUF, + (char *)&rcvbuf, sizeof (int), CRED()); + (void) ksocket_setsockopt(sc->ic_so, SOL_SOCKET, SO_SNDBUF, + (char *)&sndbuf, sizeof (int), CRED()); } static void -idm_set_tgt_connect_options(struct sonode *sonode) +idm_set_tgt_connect_options(ksocket_t ks) { int32_t rcvbuf = IDM_RCVBUF_SIZE; int32_t sndbuf = IDM_SNDBUF_SIZE; const int on = 1; /* Set connect options */ - (void) sosetsockopt(sonode, SOL_SOCKET, SO_RCVBUF, - (char *)&rcvbuf, sizeof (int)); - (void) sosetsockopt(sonode, SOL_SOCKET, SO_SNDBUF, - (char *)&sndbuf, sizeof (int)); - (void) sosetsockopt(sonode, 
IPPROTO_TCP, TCP_NODELAY, - (char *)&on, sizeof (on)); + (void) ksocket_setsockopt(ks, SOL_SOCKET, SO_RCVBUF, + (char *)&rcvbuf, sizeof (int), CRED()); + (void) ksocket_setsockopt(ks, SOL_SOCKET, SO_SNDBUF, + (char *)&sndbuf, sizeof (int), CRED()); + (void) ksocket_setsockopt(ks, IPPROTO_TCP, TCP_NODELAY, + (char *)&on, sizeof (on), CRED()); } static uint32_t @@ -777,7 +709,7 @@ idm_sorecvhdr(idm_conn_t *ic, idm_pdu_t *pdu) static idm_status_t idm_so_ini_conn_create(idm_conn_req_t *cr, idm_conn_t *ic) { - struct sonode *so; + ksocket_t so; idm_so_conn_t *so_conn; idm_status_t idmrc; @@ -789,8 +721,8 @@ idm_so_ini_conn_create(idm_conn_req_t *cr, idm_conn_t *ic) /* Bind the socket if configured to do so */ if (cr->cr_bound) { - if (sobind(so, &cr->cr_bound_addr.sin, - SIZEOF_SOCKADDR(&cr->cr_bound_addr.sin), 0, 0) != 0) { + if (ksocket_bind(so, &cr->cr_bound_addr.sin, + SIZEOF_SOCKADDR(&cr->cr_bound_addr.sin), CRED()) != 0) { idm_sodestroy(so); return (IDM_STATUS_FAIL); } @@ -832,8 +764,8 @@ idm_so_ini_conn_connect(idm_conn_t *ic) so_conn = ic->ic_transport_private; - if (soconnect(so_conn->ic_so, &ic->ic_ini_dst_addr.sin, - (SIZEOF_SOCKADDR(&ic->ic_ini_dst_addr.sin)), 0, 0) != 0) { + if (ksocket_connect(so_conn->ic_so, &ic->ic_ini_dst_addr.sin, + (SIZEOF_SOCKADDR(&ic->ic_ini_dst_addr.sin)), CRED()) != 0) { idm_soshutdown(so_conn->ic_so); return (IDM_STATUS_FAIL); } @@ -846,7 +778,7 @@ idm_so_ini_conn_connect(idm_conn_t *ic) } idm_status_t -idm_so_tgt_conn_create(idm_conn_t *ic, struct sonode *new_so) +idm_so_tgt_conn_create(idm_conn_t *ic, ksocket_t new_so) { idm_status_t idmrc; @@ -875,7 +807,7 @@ idm_so_tgt_conn_connect(idm_conn_t *ic) } static idm_status_t -idm_so_conn_create_common(idm_conn_t *ic, struct sonode *new_so) +idm_so_conn_create_common(idm_conn_t *ic, ksocket_t new_so) { idm_so_conn_t *so_conn; @@ -917,18 +849,20 @@ static void idm_so_conn_connect_common(idm_conn_t *ic) { idm_so_conn_t *so_conn; + struct sockaddr_in6 t_addr; + socklen_t t_addrlen = 0; 
so_conn = ic->ic_transport_private; - - SOP_GETSOCKNAME(so_conn->ic_so); + bzero(&t_addr, sizeof (struct sockaddr_in6)); + t_addrlen = sizeof (struct sockaddr_in6); /* Set the local and remote addresses in the idm conn handle */ - mutex_enter(&so_conn->ic_so->so_lock); - bcopy(so_conn->ic_so->so_laddr_sa, &ic->ic_laddr, - so_conn->ic_so->so_laddr_len); - bcopy(so_conn->ic_so->so_faddr_sa, &ic->ic_raddr, - so_conn->ic_so->so_faddr_len); - mutex_exit(&so_conn->ic_so->so_lock); + ksocket_getsockname(so_conn->ic_so, (struct sockaddr *)&t_addr, + &t_addrlen, CRED()); + bcopy(&t_addr, &ic->ic_laddr, t_addrlen); + ksocket_getpeername(so_conn->ic_so, (struct sockaddr *)&t_addr, + &t_addrlen, CRED()); + bcopy(&t_addr, &ic->ic_raddr, t_addrlen); mutex_enter(&ic->ic_mutex); so_conn->ic_tx_thread = thread_create(NULL, 0, idm_sotx_thread, ic, 0, @@ -1027,16 +961,16 @@ idm_so_tgt_svc_online(idm_svc_t *is) sin6_ip.sin6_port = htons(sr->sr_port); sin6_ip.sin6_addr = in6addr_any; - (void) sosetsockopt(so_svc->is_so, SOL_SOCKET, SO_REUSEADDR, - (char *)&on, sizeof (on)); + (void) ksocket_setsockopt(so_svc->is_so, SOL_SOCKET, + SO_REUSEADDR, (char *)&on, sizeof (on), CRED()); /* * Turn off SO_MAC_EXEMPT so future sobinds succeed */ - (void) sosetsockopt(so_svc->is_so, SOL_SOCKET, SO_MAC_EXEMPT, - (char *)&off, sizeof (off)); + (void) ksocket_setsockopt(so_svc->is_so, SOL_SOCKET, + SO_MAC_EXEMPT, (char *)&off, sizeof (off), CRED()); - if (sobind(so_svc->is_so, (struct sockaddr *)&sin6_ip, - sizeof (sin6_ip), 0, 0) != 0) { + if (ksocket_bind(so_svc->is_so, (struct sockaddr *)&sin6_ip, + sizeof (sin6_ip), CRED()) != 0) { mutex_exit(&is->is_mutex); idm_sodestroy(so_svc->is_so); return (IDM_STATUS_FAIL); @@ -1045,7 +979,7 @@ idm_so_tgt_svc_online(idm_svc_t *is) idm_set_tgt_connect_options(so_svc->is_so); - if (solisten(so_svc->is_so, 5) != 0) { + if (ksocket_listen(so_svc->is_so, 5, CRED()) != 0) { mutex_exit(&is->is_mutex); idm_soshutdown(so_svc->is_so); idm_sodestroy(so_svc->is_so); @@ 
-1063,7 +997,7 @@ idm_so_tgt_svc_online(idm_svc_t *is) idm_sodestroy(so_svc->is_so); return (IDM_STATUS_FAIL); } - + ksocket_hold(so_svc->is_so); /* Wait for the port watcher thread to start */ while (!so_svc->is_thread_running) cv_wait(&is->is_cv, &is->is_mutex); @@ -1081,33 +1015,20 @@ static void idm_so_tgt_svc_offline(idm_svc_t *is) { idm_so_svc_t *so_svc; - mutex_enter(&is->is_mutex); so_svc = (idm_so_svc_t *)is->is_so_svc; so_svc->is_thread_running = B_FALSE; mutex_exit(&is->is_mutex); /* - * When called from the kernel, soaccept blocks and cannot be woken - * up via the sockfs API. soclose does not work like you would - * hope. When the Volo project is available we can switch to that - * API which should address this issue. For now, we will poke at - * the socket to wake it up. + * Teardown socket */ - mutex_enter(&so_svc->is_so->so_lock); - so_svc->is_so->so_error = EINTR; - cv_signal(&so_svc->is_so->so_connind_cv); - mutex_exit(&so_svc->is_so->so_lock); + idm_sodestroy(so_svc->is_so); /* * Now we expect the port watcher thread to terminate */ thread_join(so_svc->is_thread_did); - - /* - * Teardown socket - */ - idm_sodestroy(so_svc->is_so); } /* @@ -1117,13 +1038,17 @@ void idm_so_svc_port_watcher(void *arg) { idm_svc_t *svc = arg; - struct sonode *new_so; + ksocket_t new_so; idm_conn_t *ic; idm_status_t idmrc; idm_so_svc_t *so_svc; int rc; const uint32_t off = 0; + struct sockaddr_in6 t_addr; + socklen_t t_addrlen; + bzero(&t_addr, sizeof (struct sockaddr_in6)); + t_addrlen = sizeof (struct sockaddr_in6); mutex_enter(&svc->is_mutex); so_svc = svc->is_so_svc; @@ -1138,7 +1063,9 @@ idm_so_svc_port_watcher(void *arg) while (so_svc->is_thread_running) { mutex_exit(&svc->is_mutex); - if ((rc = soaccept(so_svc->is_so, 0, &new_so)) != 0) { + if ((rc = ksocket_accept(so_svc->is_so, + (struct sockaddr *)&t_addr, &t_addrlen, + &new_so, CRED())) != 0) { mutex_enter(&svc->is_mutex); if (rc == ECONNABORTED) continue; @@ -1148,8 +1075,8 @@ idm_so_svc_port_watcher(void 
*arg) /* * Turn off SO_MAC_EXEMPT so future sobinds succeed */ - (void) sosetsockopt(new_so, SOL_SOCKET, SO_MAC_EXEMPT, - (char *)&off, sizeof (off)); + (void) ksocket_setsockopt(new_so, SOL_SOCKET, SO_MAC_EXEMPT, + (char *)&off, sizeof (off), CRED()); idmrc = idm_svc_conn_create(svc, IDM_TRANSPORT_TYPE_SOCKETS, &ic); @@ -1178,7 +1105,7 @@ idm_so_svc_port_watcher(void *arg) mutex_enter(&svc->is_mutex); } - + ksocket_rele(so_svc->is_so); so_svc->is_thread_running = B_FALSE; mutex_exit(&svc->is_mutex); diff --git a/usr/src/uts/common/io/ksocket/ksocket.c b/usr/src/uts/common/io/ksocket/ksocket.c new file mode 100644 index 0000000000..512cab56c0 --- /dev/null +++ b/usr/src/uts/common/io/ksocket/ksocket.c @@ -0,0 +1,733 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include <sys/file.h> +#include <sys/stropts.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sysmacros.h> +#include <sys/filio.h> /* FIO* ioctls */ +#include <sys/sockio.h> /* SIOC* ioctls */ +#include <sys/cmn_err.h> +#include <sys/ksocket.h> +#include <io/ksocket/ksocket_impl.h> +#include <fs/sockfs/sockcommon.h> + +#define SOCKETMOD_TCP "tcp" +#define SOCKETMOD_UDP "udp" +/* + * Kernel Sockets + * + * Mostly a wrapper around the private socket_* functions. + */ +int +ksocket_socket(ksocket_t *ksp, int domain, int type, int protocol, int flags, + struct cred *cr) +{ + static const int version = SOV_DEFAULT; + int error = 0; + struct sonode *so; + *ksp = NULL; + + if (domain == AF_NCA || domain == AF_UNIX) + return (EAFNOSUPPORT); + + ASSERT(flags == KSOCKET_SLEEP || flags == KSOCKET_NOSLEEP); + so = socket_create(domain, type, protocol, NULL, NULL, version, flags, + cr, &error); + if (so == NULL) { + if (error == EAFNOSUPPORT) { + char *mod = NULL; + + /* + * Could be that root file sytem is not loaded or + * soconfig has not run yet. 
+ */ + if (type == SOCK_STREAM && (domain == AF_INET || + domain == AF_INET6) && (protocol == 0 || + protocol == IPPROTO_TCP)) { + mod = SOCKETMOD_TCP; + } else if (type == SOCK_DGRAM && (domain == AF_INET || + domain == AF_INET6) && (protocol == 0 || + protocol == IPPROTO_UDP)) { + mod = SOCKETMOD_UDP; + } else { + return (EAFNOSUPPORT); + } + + so = socket_create(domain, type, protocol, NULL, + mod, version, flags, cr, &error); + if (so == NULL) + return (error); + } else { + return (error); + } + } + + so->so_mode |= SM_KERNEL; + + *ksp = SOTOKS(so); + + return (0); +} +int +ksocket_bind(ksocket_t ks, struct sockaddr *addr, socklen_t addrlen, + struct cred *cr) +{ + int error; + + if (!KSOCKET_VALID(ks)) + return (ENOTSOCK); + + error = socket_bind(KSTOSO(ks), addr, addrlen, _SOBIND_SOCKBSD, cr); + + return (error); +} + +int +ksocket_listen(ksocket_t ks, int backlog, struct cred *cr) +{ + if (!KSOCKET_VALID(ks)) + return (ENOTSOCK); + + return (socket_listen(KSTOSO(ks), backlog, cr)); +} + +int +ksocket_accept(ksocket_t ks, struct sockaddr *addr, + socklen_t *addrlenp, ksocket_t *nks, struct cred *cr) +{ + int error; + struct sonode *nso = NULL; + + *nks = NULL; + + if (!KSOCKET_VALID(ks)) + return (ENOTSOCK); + + if (addr != NULL && addrlenp == NULL) + return (EFAULT); + + error = socket_accept(KSTOSO(ks), KSOCKET_FMODE(ks), cr, &nso); + if (error != 0) + return (error); + + ASSERT(nso != NULL); + + nso->so_mode |= SM_KERNEL; + + if (addr != NULL && addrlenp != NULL) { + error = socket_getpeername(nso, addr, addrlenp, B_TRUE, cr); + if (error != 0) { + (void) socket_close(nso, 0, cr); + socket_destroy(nso); + return ((error == ENOTCONN) ? 
ECONNABORTED : error); + } + } + + *nks = SOTOKS(nso); + + return (error); +} + +int +ksocket_connect(ksocket_t ks, const struct sockaddr *addr, socklen_t addrlen, + struct cred *cr) +{ + if (!KSOCKET_VALID(ks)) + return (ENOTSOCK); + + return (socket_connect(KSTOSO(ks), addr, addrlen, + KSOCKET_FMODE(ks), 0, cr)); +} + +int +ksocket_send(ksocket_t ks, void *msg, size_t msglen, int flags, + size_t *sent, struct cred *cr) +{ + int error; + struct nmsghdr msghdr; + struct uio auio; + struct iovec iov; + + if (!KSOCKET_VALID(ks)) { + if (sent != NULL) + *sent = 0; + return (ENOTSOCK); + } + + iov.iov_base = msg; + iov.iov_len = msglen; + + bzero(&auio, sizeof (struct uio)); + auio.uio_loffset = 0; + auio.uio_iov = &iov; + auio.uio_iovcnt = 1; + auio.uio_resid = msglen; + if (flags & MSG_USERSPACE) + auio.uio_segflg = UIO_USERSPACE; + else + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_extflg = UIO_COPY_DEFAULT; + auio.uio_limit = 0; + auio.uio_fmode = KSOCKET_FMODE(ks); + + msghdr.msg_name = NULL; + msghdr.msg_namelen = 0; + msghdr.msg_control = NULL; + msghdr.msg_controllen = 0; + msghdr.msg_flags = flags | MSG_EOR; + + error = socket_sendmsg(KSTOSO(ks), &msghdr, &auio, cr); + if (error != 0) { + if (sent != NULL) + *sent = 0; + return (error); + } + + if (sent != NULL) + *sent = msglen - auio.uio_resid; + return (0); +} + +int +ksocket_sendto(ksocket_t ks, void *msg, size_t msglen, int flags, + struct sockaddr *name, socklen_t namelen, size_t *sent, struct cred *cr) +{ + int error; + struct nmsghdr msghdr; + struct uio auio; + struct iovec iov; + + if (!KSOCKET_VALID(ks)) { + if (sent != NULL) + *sent = 0; + return (ENOTSOCK); + } + + iov.iov_base = msg; + iov.iov_len = msglen; + + bzero(&auio, sizeof (struct uio)); + auio.uio_loffset = 0; + auio.uio_iov = &iov; + auio.uio_iovcnt = 1; + auio.uio_resid = msglen; + if (flags & MSG_USERSPACE) + auio.uio_segflg = UIO_USERSPACE; + else + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_extflg = UIO_COPY_DEFAULT; + auio.uio_limit 
= 0; + auio.uio_fmode = KSOCKET_FMODE(ks); + + msghdr.msg_iov = &iov; + msghdr.msg_iovlen = 1; + msghdr.msg_name = (char *)name; + msghdr.msg_namelen = namelen; + msghdr.msg_control = NULL; + msghdr.msg_controllen = 0; + msghdr.msg_flags = flags | MSG_EOR; + + error = socket_sendmsg(KSTOSO(ks), &msghdr, &auio, cr); + if (error != 0) { + if (sent != NULL) + *sent = 0; + return (error); + } + if (sent != NULL) + *sent = msglen - auio.uio_resid; + return (0); +} + +int +ksocket_sendmsg(ksocket_t ks, struct nmsghdr *msg, int flags, + size_t *sent, struct cred *cr) +{ + int error; + ssize_t len; + int i; + struct uio auio; + + if (!KSOCKET_VALID(ks)) { + if (sent != NULL) + *sent = 0; + return (ENOTSOCK); + } + + bzero(&auio, sizeof (struct uio)); + auio.uio_loffset = 0; + auio.uio_iov = msg->msg_iov; + auio.uio_iovcnt = msg->msg_iovlen; + if (flags & MSG_USERSPACE) + auio.uio_segflg = UIO_USERSPACE; + else + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_extflg = UIO_COPY_DEFAULT; + auio.uio_limit = 0; + auio.uio_fmode = KSOCKET_FMODE(ks); + len = 0; + for (i = 0; i < msg->msg_iovlen; i++) { + ssize_t iovlen; + iovlen = (msg->msg_iov)[i].iov_len; + len += iovlen; + if (len < 0 || iovlen < 0) + return (EINVAL); + } + auio.uio_resid = len; + + msg->msg_flags = flags | MSG_EOR; + + error = socket_sendmsg(KSTOSO(ks), msg, &auio, cr); + if (error != 0) { + if (sent != NULL) + *sent = 0; + return (error); + } + + if (sent != NULL) + *sent = len - auio.uio_resid; + return (0); +} + + +int +ksocket_recv(ksocket_t ks, void *msg, size_t msglen, int flags, + size_t *recv, struct cred *cr) +{ + int error; + struct nmsghdr msghdr; + struct uio auio; + struct iovec iov; + + if (!KSOCKET_VALID(ks)) { + if (recv != NULL) + *recv = 0; + return (ENOTSOCK); + } + + iov.iov_base = msg; + iov.iov_len = msglen; + + bzero(&auio, sizeof (struct uio)); + auio.uio_loffset = 0; + auio.uio_iov = &iov; + auio.uio_iovcnt = 1; + auio.uio_resid = msglen; + if (flags & MSG_USERSPACE) + auio.uio_segflg = 
UIO_USERSPACE; + else + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_extflg = UIO_COPY_DEFAULT; + auio.uio_limit = 0; + auio.uio_fmode = KSOCKET_FMODE(ks); + + msghdr.msg_name = NULL; + msghdr.msg_namelen = 0; + msghdr.msg_control = NULL; + msghdr.msg_controllen = 0; + msghdr.msg_flags = flags & (MSG_OOB | MSG_PEEK | MSG_WAITALL | + MSG_DONTWAIT | MSG_USERSPACE); + + error = socket_recvmsg(KSTOSO(ks), &msghdr, &auio, cr); + if (error != 0) { + if (recv != NULL) + *recv = 0; + return (error); + } + + if (recv != NULL) + *recv = msglen - auio.uio_resid; + return (0); +} + +int +ksocket_recvfrom(ksocket_t ks, void *msg, size_t msglen, int flags, + struct sockaddr *name, socklen_t *namelen, size_t *recv, struct cred *cr) +{ + int error; + struct nmsghdr msghdr; + struct uio auio; + struct iovec iov; + + if (!KSOCKET_VALID(ks)) { + if (recv != NULL) + *recv = 0; + return (ENOTSOCK); + } + + iov.iov_base = msg; + iov.iov_len = msglen; + + bzero(&auio, sizeof (struct uio)); + auio.uio_loffset = 0; + auio.uio_iov = &iov; + auio.uio_iovcnt = 1; + auio.uio_resid = msglen; + if (flags & MSG_USERSPACE) + auio.uio_segflg = UIO_USERSPACE; + else + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_extflg = UIO_COPY_DEFAULT; + auio.uio_limit = 0; + auio.uio_fmode = KSOCKET_FMODE(ks); + + msghdr.msg_name = (char *)name; + msghdr.msg_namelen = *namelen; + msghdr.msg_control = NULL; + msghdr.msg_controllen = 0; + msghdr.msg_flags = flags & (MSG_OOB | MSG_PEEK | MSG_WAITALL | + MSG_DONTWAIT | MSG_USERSPACE); + + error = socket_recvmsg(KSTOSO(ks), &msghdr, &auio, cr); + if (error != 0) { + if (recv != NULL) + *recv = 0; + return (error); + } + if (recv != NULL) + *recv = msglen - auio.uio_resid; + + bcopy(msghdr.msg_name, name, msghdr.msg_namelen); + bcopy(&msghdr.msg_namelen, namelen, sizeof (msghdr.msg_namelen)); + return (0); +} + +int +ksocket_recvmsg(ksocket_t ks, struct nmsghdr *msg, int flags, size_t *recv, + struct cred *cr) +{ + int error; + ssize_t len; + int i; + struct uio auio; + + 
if (!KSOCKET_VALID(ks)) { + if (recv != NULL) + *recv = 0; + return (ENOTSOCK); + } + + bzero(&auio, sizeof (struct uio)); + auio.uio_loffset = 0; + auio.uio_iov = msg->msg_iov; + auio.uio_iovcnt = msg->msg_iovlen; + if (msg->msg_flags & MSG_USERSPACE) + auio.uio_segflg = UIO_USERSPACE; + else + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_extflg = UIO_COPY_DEFAULT; + auio.uio_limit = 0; + auio.uio_fmode = KSOCKET_FMODE(ks); + len = 0; + + for (i = 0; i < msg->msg_iovlen; i++) { + ssize_t iovlen; + iovlen = (msg->msg_iov)[i].iov_len; + len += iovlen; + if (len < 0 || iovlen < 0) + return (EINVAL); + } + auio.uio_resid = len; + + msg->msg_flags = flags & (MSG_OOB | MSG_PEEK | MSG_WAITALL | + MSG_DONTWAIT | MSG_USERSPACE); + + error = socket_recvmsg(KSTOSO(ks), msg, &auio, cr); + if (error != 0) { + if (recv != NULL) + *recv = 0; + return (error); + } + if (recv != NULL) + *recv = len - auio.uio_resid; + return (0); + +} + +int +ksocket_shutdown(ksocket_t ks, int how, struct cred *cr) +{ + struct sonode *so; + + if (!KSOCKET_VALID(ks)) + return (ENOTSOCK); + + so = KSTOSO(ks); + + return (socket_shutdown(so, how, cr)); +} + +int +ksocket_close(ksocket_t ks, struct cred *cr) +{ + struct sonode *so; + so = KSTOSO(ks); + + mutex_enter(&so->so_lock); + + if (!KSOCKET_VALID(ks)) { + mutex_exit(&so->so_lock); + return (ENOTSOCK); + } + + so->so_state |= SS_CLOSING; + + if (so->so_count > 1) { + mutex_enter(&so->so_acceptq_lock); + cv_broadcast(&so->so_acceptq_cv); + mutex_exit(&so->so_acceptq_lock); + cv_broadcast(&so->so_rcv_cv); + cv_broadcast(&so->so_state_cv); + cv_broadcast(&so->so_want_cv); + cv_broadcast(&so->so_snd_cv); + cv_broadcast(&so->so_copy_cv); + } + while (so->so_count > 1) + cv_wait(&so->so_closing_cv, &so->so_lock); + + mutex_exit(&so->so_lock); + /* Remove callbacks, if any */ + (void) ksocket_setcallbacks(ks, NULL, NULL, cr); + + (void) socket_close(so, 0, cr); + socket_destroy(so); + + return (0); +} + +int +ksocket_getsockname(ksocket_t ks, struct 
sockaddr *addr, socklen_t *addrlen, + struct cred *cr) +{ + struct sonode *so; + + if (!KSOCKET_VALID(ks)) + return (ENOTSOCK); + + so = KSTOSO(ks); + + if (addrlen == NULL || (addr == NULL && *addrlen != 0)) + return (EFAULT); + + return (socket_getsockname(so, addr, addrlen, cr)); +} + +int +ksocket_getpeername(ksocket_t ks, struct sockaddr *addr, socklen_t *addrlen, + struct cred *cr) +{ + struct sonode *so; + + if (!KSOCKET_VALID(ks)) + return (ENOTSOCK); + + so = KSTOSO(ks); + + if (addrlen == NULL || (addr == NULL && *addrlen != 0)) + return (EFAULT); + + return (socket_getpeername(so, addr, addrlen, B_FALSE, cr)); +} + +int +ksocket_getsockopt(ksocket_t ks, int level, int optname, void *optval, + int *optlen, struct cred *cr) +{ + struct sonode *so; + + if (!KSOCKET_VALID(ks)) + return (ENOTSOCK); + + so = KSTOSO(ks); + + if (optlen == NULL) + return (EFAULT); + if (*optlen > SO_MAXARGSIZE) + return (EINVAL); + + return (socket_getsockopt(so, level, optname, optval, + (socklen_t *)optlen, 0, cr)); +} + +int +ksocket_setsockopt(ksocket_t ks, int level, int optname, const void *optval, + int optlen, struct cred *cr) +{ + struct sonode *so; + + if (!KSOCKET_VALID(ks)) + return (ENOTSOCK); + + so = KSTOSO(ks); + + if (optval == NULL) + optlen = 0; + + return (socket_setsockopt(so, level, optname, optval, + (t_uscalar_t)optlen, cr)); +} + +/* ARGSUSED */ +int +ksocket_setcallbacks(ksocket_t ks, ksocket_callbacks_t *cb, void *arg, + struct cred *cr) +{ + struct sonode *so; + + if (!KSOCKET_VALID(ks)) + return (ENOTSOCK); + + so = KSTOSO(ks); + + if (cb == NULL && arg != NULL) + return (EFAULT); + if (cb == NULL) { + mutex_enter(&so->so_lock); + bzero(&(so->so_ksock_callbacks), sizeof (ksocket_callbacks_t)); + so->so_ksock_cb_arg = NULL; + mutex_exit(&so->so_lock); + } else { + mutex_enter(&so->so_lock); + SETCALLBACK(so, cb, connected, KSOCKET_CB_CONNECTED) + SETCALLBACK(so, cb, connectfailed, KSOCKET_CB_CONNECTFAILED) + SETCALLBACK(so, cb, disconnected, 
KSOCKET_CB_DISCONNECTED) + SETCALLBACK(so, cb, newdata, KSOCKET_CB_NEWDATA) + SETCALLBACK(so, cb, newconn, KSOCKET_CB_NEWCONN) + SETCALLBACK(so, cb, cansend, KSOCKET_CB_CANSEND) + SETCALLBACK(so, cb, oobdata, KSOCKET_CB_OOBDATA) + SETCALLBACK(so, cb, cantsendmore, KSOCKET_CB_CANTSENDMORE) + SETCALLBACK(so, cb, cantrecvmore, KSOCKET_CB_CANTRECVMORE) + so->so_ksock_cb_arg = arg; + mutex_exit(&so->so_lock); + } + return (0); +} + +int +ksocket_ioctl(ksocket_t ks, int cmd, intptr_t arg, int *rvalp, struct cred *cr) +{ + struct sonode *so; + int rval; + + if (!KSOCKET_VALID(ks)) + return (ENOTSOCK); + + so = KSTOSO(ks); + + switch (cmd) { + default: + /* STREAM iotcls are not supported */ + if ((cmd & 0xffffff00U) == STR) { + rval = EOPNOTSUPP; + } else { + rval = socket_ioctl(so, cmd, arg, + KSOCKET_FMODE(ks) | FKIOCTL, cr, rvalp); + } + break; + case FIOASYNC: + case SIOCSPGRP: + case FIOSETOWN: + case SIOCGPGRP: + case FIOGETOWN: + rval = EOPNOTSUPP; + break; + } + + return (rval); +} + +int +ksocket_sendmblk(ksocket_t ks, struct nmsghdr *msg, int flags, + mblk_t **mpp, cred_t *cr) +{ + struct sonode *so; + int i_val; + socklen_t val_len; + mblk_t *mp = *mpp; + int error; + + if (!KSOCKET_VALID(ks)) + return (ENOTSOCK); + + so = KSTOSO(ks); + + if (flags & MSG_MBLK_QUICKRELE) { + error = socket_getsockopt(so, SOL_SOCKET, SO_SND_COPYAVOID, + &i_val, &val_len, 0, CRED()); + if (error != 0) + return (error); + + /* Zero copy is not enable */ + if (i_val == 0) + return (ECANCELED); + + for (; mp != NULL; mp = mp->b_cont) + mp->b_datap->db_struioflag |= STRUIO_ZC; + } + + error = socket_sendmblk(so, msg, flags, cr, mpp); + + return (error); +} + + +void +ksocket_hold(ksocket_t ks) +{ + struct sonode *so; + so = KSTOSO(ks); + + if (!mutex_owned(&so->so_lock)) { + mutex_enter(&so->so_lock); + so->so_count++; + mutex_exit(&so->so_lock); + } else + so->so_count++; +} + +void +ksocket_rele(ksocket_t ks) +{ + struct sonode *so; + + so = KSTOSO(ks); + /* + * When so_count equals 
1 means no thread working on this ksocket + */ + if (so->so_count < 2) + cmn_err(CE_PANIC, "ksocket_rele: sonode ref count 0 or 1"); + + if (!mutex_owned(&so->so_lock)) { + mutex_enter(&so->so_lock); + if (--so->so_count == 1) + cv_signal(&so->so_closing_cv); + mutex_exit(&so->so_lock); + } else { + if (--so->so_count == 1) + cv_signal(&so->so_closing_cv); + } +} diff --git a/usr/src/uts/common/io/ksocket/ksocket_impl.h b/usr/src/uts/common/io/ksocket/ksocket_impl.h new file mode 100644 index 0000000000..ac5251540f --- /dev/null +++ b/usr/src/uts/common/io/ksocket/ksocket_impl.h @@ -0,0 +1,74 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _INET_KSOCKET_KSOCKET_IMPL_H +#define _INET_KSOCKET_KSOCKET_IMPL_H + +#define KSTOSO(ks) ((struct sonode *)(ks)) +#define SOTOKS(so) ((ksocket_t)(uintptr_t)(so)) + +#define IS_KERNEL_SOCKET(so) ((so)->so_mode & SM_KERNEL) + +#define KSOCKET_MOD_VERSION "kernel socket module" + +#define __KSOCKET_EV_connected KSOCKET_EV_CONNECTED +#define __KSOCKET_EV_connectfailed KSOCKET_EV_CONNECTFAILED +#define __KSOCKET_EV_disconnected KSOCKET_EV_DISCONNECTED +#define __KSOCKET_EV_oobdata KSOCKET_EV_OOBDATA +#define __KSOCKET_EV_newdata KSOCKET_EV_NEWDATA +#define __KSOCKET_EV_newconn KSOCKET_EV_NEWCONN +#define __KSOCKET_EV_cansend KSOCKET_EV_CANSEND +#define __KSOCKET_EV_cantsendmore KSOCKET_EV_CANTSENDMORE +#define __KSOCKET_EV_cantrecvmore KSOCKET_EV_CANTRECVMORE +#define __KSOCKET_EV_error KSOCKET_EV_ERROR + +#define KSOCKET_CALLBACK(so, cbfn, arg) \ + if ((so)->so_ksock_callbacks.ksock_cb_##cbfn != NULL) { \ + (*(so)->so_ksock_callbacks.ksock_cb_##cbfn)(SOTOKS(so), \ + __KSOCKET_EV_##cbfn, (so)->so_ksock_cb_arg, (arg)); \ + } + +#define KSOCKET_FMODE(so) FREAD|FWRITE| \ + ((KSTOSO(so)->so_state & (SS_NDELAY|SS_NONBLOCK)) ? 
FNDELAY : 0) + +#define KSOCKET_VALID(ks) \ + ((ks) != NULL && (KSTOSO(ks))->so_mode & SM_KERNEL && \ + !((KSTOSO(ks))->so_state & SS_CLOSING)) + +#define SETCALLBACK(so, cb, cbfn, cbflg) \ + if ((cb)->ksock_cb_flags & (cbflg)) { \ + (so)->so_ksock_callbacks.ksock_cb_##cbfn \ + = (cb)->ksock_cb_##cbfn; \ + if ((cb)->ksock_cb_##cbfn == NULL) \ + (so)->so_ksock_callbacks.ksock_cb_flags \ + &= ~(cbflg); \ + else \ + (so)->so_ksock_callbacks.ksock_cb_flags \ + |= (cbflg); \ + } + + +#endif /* _INET_KSOCKET_KSOCKET_IMPL_H */ diff --git a/usr/src/uts/common/io/ksocket/ksocket_mod.c b/usr/src/uts/common/io/ksocket/ksocket_mod.c new file mode 100644 index 0000000000..da3b4091a5 --- /dev/null +++ b/usr/src/uts/common/io/ksocket/ksocket_mod.c @@ -0,0 +1,57 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include <sys/types.h> +#include <sys/sunddi.h> +#include <sys/errno.h> +#include <sys/modctl.h> + +#include <io/ksocket/ksocket_impl.h> + +static struct modlmisc modlmisc = { + &mod_miscops, KSOCKET_MOD_VERSION +}; + +static struct modlinkage modlinkage = { + MODREV_1, (void *)&modlmisc, NULL +}; + +int +_init(void) +{ + return (mod_install(&modlinkage)); +} + +int +_fini(void) +{ + return (mod_remove(&modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&modlinkage, modinfop)); +} diff --git a/usr/src/uts/common/io/scsi/adapters/iscsi/iscsi.h b/usr/src/uts/common/io/scsi/adapters/iscsi/iscsi.h index cc42247897..6d59ce3810 100644 --- a/usr/src/uts/common/io/scsi/adapters/iscsi/iscsi.h +++ b/usr/src/uts/common/io/scsi/adapters/iscsi/iscsi.h @@ -546,7 +546,7 @@ typedef struct iscsi_conn { kcondvar_t conn_state_change; boolean_t conn_state_destroy; - struct sonode *conn_socket; /* aka. kernel net. socket */ + void *conn_socket; /* kernel socket */ /* base connection information */ iscsi_sockaddr_t conn_base_addr; @@ -846,7 +846,7 @@ typedef struct iscsi_network { int (*connect)(void *, struct sockaddr *, int, int, int); int (*listen)(void *, int); void* (*accept)(void *, struct sockaddr *, int *); - int (*getsockname)(void *); + int (*getsockname)(void *, struct sockaddr *, socklen_t *); int (*getsockopt)(void *, int, int, void *, int *, int); int (*setsockopt)(void *, int, int, void *, int); int (*shutdown)(void *, int); diff --git a/usr/src/uts/common/io/scsi/adapters/iscsi/iscsi_ioctl.c b/usr/src/uts/common/io/scsi/adapters/iscsi/iscsi_ioctl.c index e5967dab8c..611b2bc967 100644 --- a/usr/src/uts/common/io/scsi/adapters/iscsi/iscsi_ioctl.c +++ b/usr/src/uts/common/io/scsi/adapters/iscsi/iscsi_ioctl.c @@ -237,12 +237,16 @@ iscsi_ioctl_conn_props_get(iscsi_hba_t *ihp, iscsi_conn_props_t *cp) iscsi_sess_t *isp; iscsi_conn_t *icp; boolean_t rtn; + struct sockaddr_in6 t_addr; + socklen_t t_addrlen; /* Let's check the version. 
*/ if (cp->cp_vers != ISCSI_INTERFACE_VERSION) { return (B_FALSE); } + bzero(&t_addr, sizeof (struct sockaddr_in6)); + t_addrlen = sizeof (struct sockaddr_in6); /* Let's find the session. */ rw_enter(&ihp->hba_sess_list_rwlock, RW_READER); if (iscsi_sess_get(cp->cp_sess_oid, ihp, &isp) != 0) { @@ -263,18 +267,15 @@ iscsi_ioctl_conn_props_get(iscsi_hba_t *ihp, iscsi_conn_props_t *cp) ASSERT(icp->conn_sig == ISCSI_SIG_CONN); if (icp->conn_oid == cp->cp_oid) { - - if (icp->conn_socket->so_laddr.soa_len <= - sizeof (cp->cp_local)) { - bcopy(icp->conn_socket->so_laddr.soa_sa, - &cp->cp_local, - icp->conn_socket->so_laddr.soa_len); + iscsi_net->getsockname(icp->conn_socket, + (struct sockaddr *)&t_addr, &t_addrlen); + if (t_addrlen <= sizeof (cp->cp_local)) { + bcopy(&t_addr, &cp->cp_local, t_addrlen); } - if (icp->conn_socket->so_faddr.soa_len <= - sizeof (cp->cp_peer)) { - bcopy(icp->conn_socket->so_faddr.soa_sa, - &cp->cp_peer, - icp->conn_socket->so_faddr.soa_len); + ksocket_getpeername((ksocket_t)(icp->conn_socket), + (struct sockaddr *)&t_addr, &t_addrlen, CRED()); + if (t_addrlen <= sizeof (cp->cp_peer)) { + bcopy(&t_addr, &cp->cp_peer, t_addrlen); } if (icp->conn_state == ISCSI_CONN_STATE_LOGGED_IN) { diff --git a/usr/src/uts/common/io/scsi/adapters/iscsi/iscsi_login.c b/usr/src/uts/common/io/scsi/adapters/iscsi/iscsi_login.c index 8a1c1914b4..c1a201f73c 100644 --- a/usr/src/uts/common/io/scsi/adapters/iscsi/iscsi_login.c +++ b/usr/src/uts/common/io/scsi/adapters/iscsi/iscsi_login.c @@ -1934,10 +1934,12 @@ iscsi_login_failure_str(uchar_t status_class, uchar_t status_detail) static iscsi_status_t iscsi_login_connect(iscsi_conn_t *icp) { - iscsi_hba_t *ihp; - iscsi_sess_t *isp; - struct sockaddr *addr; - struct sonode *so = NULL; + iscsi_hba_t *ihp; + iscsi_sess_t *isp; + struct sockaddr *addr; + struct sockaddr_in6 t_addr; + struct sonode *so = NULL; + socklen_t t_addrlen; ASSERT(icp != NULL); isp = icp->conn_sess; @@ -1946,6 +1948,8 @@ 
iscsi_login_connect(iscsi_conn_t *icp) ASSERT(ihp != NULL); addr = &icp->conn_curr_addr.sin; + t_addrlen = sizeof (struct sockaddr_in6); + bzero(&t_addr, sizeof (struct sockaddr_in6)); so = iscsi_net->socket(addr->sa_family, SOCK_STREAM, 0); if (so == NULL) { cmn_err(CE_WARN, "iscsi connection(%u) unable " @@ -1982,7 +1986,8 @@ iscsi_login_connect(iscsi_conn_t *icp) } icp->conn_socket = so; - if (iscsi_net->getsockname(icp->conn_socket) != 0) { + if (iscsi_net->getsockname(icp->conn_socket, + (struct sockaddr *)&t_addr, &t_addrlen) != 0) { cmn_err(CE_NOTE, "iscsi connection(%u) failed to get " "socket information", icp->conn_oid); } diff --git a/usr/src/uts/common/io/scsi/adapters/iscsi/iscsi_net.c b/usr/src/uts/common/io/scsi/adapters/iscsi/iscsi_net.c index 23e64684a1..1f06106bf2 100644 --- a/usr/src/uts/common/io/scsi/adapters/iscsi/iscsi_net.c +++ b/usr/src/uts/common/io/scsi/adapters/iscsi/iscsi_net.c @@ -34,8 +34,9 @@ #include <sys/fs/dv_node.h> /* declares: devfs_lookupname */ #include <sys/bootconf.h> #include <sys/bootprops.h> - +#include <netinet/in.h> #include "iscsi.h" +#include <sys/ksocket.h> /* * This is a high level description of the default @@ -60,42 +61,42 @@ * The following listing describes the iscsi_net * entry points: * - * socket - Creates TCP/IP socket connection. In the - * default implementation creates a sonode - * via the sockfs kernel layer. - * bind - Performs standard TCP/IP BSD operation. In - * the default implementation this only act - * as a soft binding based on the IP and routing - * tables. It would be preferred if this was - * a hard binding but that is currently not - * possible with Solaris's networking stack. - * connect - Performs standard TCP/IP BSD operation. This - * establishes the TCP SYN to the peer IP address. - * listen - Performs standard TCP/IP BSD operation. This - * listens for incoming peer connections. - * accept - Performs standard TCP/IP BSD operation. This - * accepts incoming peer connections. 
- * shutdown - This disconnects the TCP/IP connection while - * maintaining the resources. - * close - This disconnects the TCP/IP connection and - * releases the resources. + * socket - Creates TCP/IP socket connection. In the + * default implementation creates a sonode + * via the sockfs kernel layer. + * bind - Performs standard TCP/IP BSD operation. In + * the default implementation this only act + * as a soft binding based on the IP and routing + * tables. It would be preferred if this was + * a hard binding but that is currently not + * possible with Solaris's networking stack. + * connect - Performs standard TCP/IP BSD operation. This + * establishes the TCP SYN to the peer IP address. + * listen - Performs standard TCP/IP BSD operation. This + * listens for incoming peer connections. + * accept - Performs standard TCP/IP BSD operation. This + * accepts incoming peer connections. + * shutdown - This disconnects the TCP/IP connection while + * maintaining the resources. + * close - This disconnects the TCP/IP connection and + * releases the resources. * - * getsockopt - Gets socket option for specified socket. - * setsockopt - Sets socket option for specified socket. + * getsockopt - Gets socket option for specified socket. + * setsockopt - Sets socket option for specified socket. * * The current socket options that are used by the initiator * are listed below. * - * TCP_CONN_NOTIFY_THRESHOLD - * TCP_CONN_ABORT_THRESHOLD - * TCP_ABORT_THRESHOLD - * TCP_NODELAY - * SO_RCVBUF - * SO_SNDBUF + * TCP_CONN_NOTIFY_THRESHOLD + * TCP_CONN_ABORT_THRESHOLD + * TCP_ABORT_THRESHOLD + * TCP_NODELAY + * SO_RCVBUF + * SO_SNDBUF * * iscsi_net_poll - Poll socket interface for a specified amount - * of data. If data not received in timeout - * period fail request. + * of data. If data not received in timeout + * period fail request. 
* iscsi_net_sendmsg - Send message on socket connection * iscsi_net_recvmsg - Receive message on socket connection * @@ -109,8 +110,8 @@ * generate or validate the iSCSI * header digest CRC. * ISCSI_NET_DATA_DIGESt - The interface should either - * generate or validate the iSCSI - * data digest CRC. + * generate or validate the iSCSI + * data digest CRC. */ @@ -144,25 +145,18 @@ const int is_incoming_opcode_invalid[256] = { /* 0xEX */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xFX */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, }; -/* - * Define macros to manipulate snode, vnode, and open device flags - */ -#define VTYP_VALID(i) (((i) == VCHR) || ((i) == VBLK)) -#define STYP_VALID(i) (((i) == S_IFCHR) || ((i) == S_IFBLK)) -#define STYP_TO_VTYP(i) (((i) == S_IFCHR) ? VCHR : VBLK) #define IP_4_BITS 32 #define IP_6_BITS 128 extern int modrootloaded; -extern ib_boot_prop_t *iscsiboot_prop; +extern ib_boot_prop_t *iscsiboot_prop; /* prototypes */ /* for iSCSI boot */ static int net_up = 0; static iscsi_status_t iscsi_net_interface(); -static int iscsi_ldi_vp_from_name(char *path, vnode_t **vpp); /* boot prototypes end */ static void * iscsi_net_socket(int domain, int type, int protocol); @@ -173,7 +167,7 @@ static int iscsi_net_connect(void *socket, struct sockaddr * static int iscsi_net_listen(void *socket, int backlog); static void * iscsi_net_accept(void *socket, struct sockaddr *addr, int *addr_len); -static int iscsi_net_getsockname(void *socket); +static int iscsi_net_getsockname(void *socket, struct sockaddr *, socklen_t *); static int iscsi_net_getsockopt(void *socket, int level, int option_name, void *option_val, int *option_len, int flags); static int iscsi_net_setsockopt(void *socket, int level, @@ -198,7 +192,7 @@ static void iscsi_net_set_postconnect_options(void *socket); /* * +--------------------------------------------------------------------+ - * | network interface registration functions | + * | network interface registration functions | * 
+--------------------------------------------------------------------+ */ @@ -287,7 +281,7 @@ iscsi_net_set_postconnect_options(void *socket) /* * +--------------------------------------------------------------------+ - * | register network interfaces | + * | register network interfaces | * +--------------------------------------------------------------------+ */ @@ -297,93 +291,53 @@ iscsi_net_set_postconnect_options(void *socket) static void * iscsi_net_socket(int domain, int type, int protocol) { - vnode_t *dvp = NULL, - *vp = NULL; - struct snode *csp = NULL; - int err = 0; - major_t maj; + ksocket_t socket; + int err = 0; if (!modrootloaded && !net_up && iscsiboot_prop) { if (iscsi_net_interface() == ISCSI_STATUS_SUCCESS) net_up = 1; } - /* ---- solookup: start ---- */ - if ((vp = solookup(domain, type, protocol, NULL, &err)) == NULL) { - - /* - * solookup calls sogetvp if the vp is not found in - * the cache. Since the call to sogetvp is hardwired - * to use USERSPACE and declared static we'll do the - * work here instead. - */ - if (!modrootloaded) { - err = iscsi_ldi_vp_from_name("/devices/pseudo/tcp@0:" - "tcp", &vp); - } else { - err = lookupname(type == SOCK_STREAM ? 
"/dev/tcp" : - "/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); - } - if (err) { - return (NULL); - } + err = ksocket_socket(&socket, domain, type, protocol, KSOCKET_SLEEP, + CRED()); + if (!err) + return ((void *)socket); + else + return (NULL); - /* ---- check that it is the correct vnode ---- */ - if (vp->v_type != VCHR) { - VN_RELE(vp); - return (NULL); - } - - csp = VTOS(VTOS(vp)->s_commonvp); - if (!(csp->s_flag & SDIPSET)) { - char *pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP); - err = ddi_dev_pathname(vp->v_rdev, S_IFCHR, - pathname); - if (err == 0) { - err = devfs_lookupname(pathname, NULLVPP, - &dvp); - } - VN_RELE(vp); - kmem_free(pathname, MAXPATHLEN); - if (err != 0) { - return (NULL); - } - vp = dvp; - } - - maj = getmajor(vp->v_rdev); - if (!STREAMSTAB(maj)) { - VN_RELE(vp); - return (NULL); - } - } - /* ---- solookup: end ---- */ - return (socreate(vp, domain, type, protocol, SOV_DEFAULT, NULL, &err)); } /* * iscsi_net_bind - bind socket to a specific sockaddr */ +/* ARGSUSED */ static int iscsi_net_bind(void *socket, struct sockaddr *name, int name_len, int backlog, int flags) { - return (sobind((struct sonode *)socket, name, name_len, - backlog, flags)); + ksocket_t ks = (ksocket_t)socket; + int error; + error = ksocket_bind(ks, name, name_len, CRED()); + if (error == 0 && backlog != 0) + error = ksocket_listen(ks, backlog, CRED()); + + return (error); } /* * iscsi_net_connect - connect socket to peer sockaddr */ +/* ARGSUSED */ static int iscsi_net_connect(void *socket, struct sockaddr *name, int name_len, int fflag, int flags) { + ksocket_t ks = (ksocket_t)socket; int rval; iscsi_net_set_preconnect_options(socket); - rval = soconnect((struct sonode *)socket, name, - name_len, fflag, flags); + rval = ksocket_connect(ks, name, name_len, CRED()); iscsi_net_set_postconnect_options(socket); return (rval); @@ -395,7 +349,8 @@ iscsi_net_connect(void *socket, struct sockaddr *name, int name_len, static int iscsi_net_listen(void *socket, int backlog) { - 
return (solisten((struct sonode *)socket, backlog)); + ksocket_t ks = (ksocket_t)socket; + return (ksocket_listen(ks, backlog, CRED())); } /* @@ -404,41 +359,35 @@ iscsi_net_listen(void *socket, int backlog) static void * iscsi_net_accept(void *socket, struct sockaddr *addr, int *addr_len) { - struct sonode *listening_socket; - - (void) soaccept((struct sonode *)socket, - ((struct sonode *)socket)->so_flag, - &listening_socket); - if (listening_socket != NULL) { - bcopy(listening_socket->so_faddr_sa, addr, - (socklen_t)listening_socket->so_faddr_len); - *addr_len = listening_socket->so_faddr_len; - } else { - *addr_len = 0; - } + ksocket_t listen_ks; + ksocket_t ks = (ksocket_t)socket; - return ((void *)listening_socket); + ksocket_accept(ks, addr, (socklen_t *)addr_len, &listen_ks, CRED()); + + return ((void *)listen_ks); } /* * iscsi_net_getsockname - */ static int -iscsi_net_getsockname(void *socket) +iscsi_net_getsockname(void *socket, struct sockaddr *addr, socklen_t *addrlen) { - return (sogetsockname((struct sonode *)socket)); + ksocket_t ks = (ksocket_t)socket; + return (ksocket_getsockname(ks, addr, addrlen, CRED())); } /* * iscsi_net_getsockopt - get value of option on socket */ +/* ARGSUSED */ static int iscsi_net_getsockopt(void *socket, int level, int option_name, void *option_val, int *option_len, int flags) { - return (sogetsockopt((struct sonode *)socket, level, - option_name, option_val, (socklen_t *)option_len, - flags)); + ksocket_t ks = (ksocket_t)socket; + return (ksocket_getsockopt(ks, level, option_name, option_val, + option_len, CRED())); } /* @@ -448,8 +397,9 @@ static int iscsi_net_setsockopt(void *socket, int level, int option_name, void *option_val, int option_len) { - return (sosetsockopt((struct sonode *)socket, level, - option_name, option_val, option_len)); + ksocket_t ks = (ksocket_t)socket; + return (ksocket_setsockopt(ks, level, option_name, option_val, + option_len, CRED())); } /* @@ -458,7 +408,8 @@ iscsi_net_setsockopt(void 
*socket, int level, int option_name, static int iscsi_net_shutdown(void *socket, int how) { - return (soshutdown((struct sonode *)socket, how)); + ksocket_t ks = (ksocket_t)socket; + return (ksocket_shutdown(ks, how, CRED())); } /* @@ -467,26 +418,32 @@ iscsi_net_shutdown(void *socket, int how) static void iscsi_net_close(void *socket) { - vnode_t *vp = SOTOV((struct sonode *)socket); - (void) soshutdown((struct sonode *)socket, 2); - (void) VOP_CLOSE(vp, 0, 1, 0, kcred, NULL); - VN_RELE(vp); + ksocket_t ks = (ksocket_t)socket; + (void) ksocket_close(ks, CRED()); } /* * iscsi_net_poll - poll socket for data */ +/* ARGSUSED */ static size_t iscsi_net_poll(void *socket, clock_t timeout) { int pflag; - uchar_t pri; - rval_t rval; + char msg[64]; + size_t recv = 0; + struct timeval tl; + ksocket_t ks = (ksocket_t)socket; + /* timeout is millisecond */ + tl.tv_sec = timeout / 1000; + tl.tv_usec = (timeout % 1000) * 1000; + + (void) ksocket_setsockopt(ks, SOL_SOCKET, SO_RCVTIMEO, &tl, + sizeof (struct timeval), CRED()); - pri = 0; pflag = MSG_ANY; - return (kstrgetmsg(SOTOV((struct sonode *)socket), NULL, NULL, - &pri, &pflag, timeout, &rval)); + bzero(msg, sizeof (msg)); + return (ksocket_recv(ks, msg, sizeof (msg), pflag, &recv, CRED())); } /* @@ -496,24 +453,12 @@ iscsi_net_poll(void *socket, clock_t timeout) static size_t iscsi_net_sendmsg(void *socket, struct msghdr *msg) { - int i = 0; - int total_len = 0; - struct uio uio; - - /* Initialization of the uio structure. 
*/ - bzero(&uio, sizeof (uio)); - uio.uio_iov = msg->msg_iov; - uio.uio_iovcnt = msg->msg_iovlen; - uio.uio_segflg = UIO_SYSSPACE; - - for (i = 0; i < msg->msg_iovlen; i++) { - total_len += (msg->msg_iov)[i].iov_len; - } - uio.uio_resid = total_len; - - (void) sosendmsg((struct sonode *)socket, msg, &uio); - DTRACE_PROBE2(sosendmsg, size_t, total_len, size_t, uio.uio_resid); - return (total_len - uio.uio_resid); + ksocket_t ks = (ksocket_t)socket; + size_t sent = 0; + int flag = msg->msg_flags; + (void) ksocket_sendmsg(ks, msg, flag, &sent, CRED()); + DTRACE_PROBE1(ksocket_sendmsg, size_t, sent); + return (sent); } /* @@ -523,80 +468,25 @@ iscsi_net_sendmsg(void *socket, struct msghdr *msg) static size_t iscsi_net_recvmsg(void *socket, struct msghdr *msg, int timeout) { - int idx; - int total_len = 0; - struct uio uio; - uchar_t pri = 0; - int prflag = MSG_ANY; - rval_t rval; - struct sonode *sonode = (struct sonode *)socket; - - /* Initialization of the uio structure. */ - bzero(&uio, sizeof (uio)); - uio.uio_iov = msg->msg_iov; - uio.uio_iovcnt = msg->msg_iovlen; - uio.uio_segflg = UIO_SYSSPACE; - - for (idx = 0; idx < msg->msg_iovlen; idx++) { - total_len += (msg->msg_iov)[idx].iov_len; - } - uio.uio_resid = total_len; - - /* If timeout requested on receive */ - if (timeout > 0) { - boolean_t loopback = B_FALSE; - - /* And this isn't a loopback connection */ - if (sonode->so_laddr.soa_sa->sa_family == AF_INET) { - struct sockaddr_in *lin = - (struct sockaddr_in *)sonode->so_laddr.soa_sa; - struct sockaddr_in *fin = - (struct sockaddr_in *)sonode->so_faddr.soa_sa; - - if ((lin->sin_family == fin->sin_family) && - (bcmp(&lin->sin_addr, &fin->sin_addr, - sizeof (struct in_addr)) == 0)) { - loopback = B_TRUE; - } - } else { - struct sockaddr_in6 *lin6 = - (struct sockaddr_in6 *)sonode->so_laddr.soa_sa; - struct sockaddr_in6 *fin6 = - (struct sockaddr_in6 *)sonode->so_faddr.soa_sa; - - if ((lin6->sin6_family == fin6->sin6_family) && - (bcmp(&lin6->sin6_addr, 
&fin6->sin6_addr, - sizeof (struct in6_addr)) == 0)) { - loopback = B_TRUE; - } - } - - if (loopback == B_FALSE) { - /* - * Then poll device for up to the timeout - * period or the requested data is received. - */ - if (kstrgetmsg(SOTOV(sonode), - NULL, NULL, &pri, &prflag, timeout * 1000, - &rval) == ETIME) { - return (0); - } - } - } - + int prflag = msg->msg_flags; + ksocket_t ks = (ksocket_t)socket; + size_t recv = 0; + struct timeval tl; + + tl.tv_sec = timeout; + tl.tv_usec = 0; + + /* Set recv timeout */ + if (ksocket_setsockopt(ks, SOL_SOCKET, SO_RCVTIMEO, &tl, + sizeof (struct timeval), CRED())) + return (0); /* * Receive the requested data. Block until all - * data is received. - * - * resid occurs only when the connection is - * disconnected. In that case it will return - * the amount of data that was not received. - * In general this is the total amount we - * requested. + * data is received or timeout. */ - (void) sorecvmsg((struct sonode *)socket, msg, &uio); - DTRACE_PROBE2(sorecvmsg, size_t, total_len, size_t, uio.uio_resid); - return (total_len - uio.uio_resid); + ksocket_recvmsg(ks, msg, prflag, &recv, CRED()); + DTRACE_PROBE1(ksocket_recvmsg, size_t, recv); + return (recv); } /* @@ -701,7 +591,7 @@ iscsi_net_sendpdu(void *socket, iscsi_hdr_t *ihp, char *data, int flags) msg.msg_flags = MSG_WAITALL; msg.msg_iovlen = iovlen; - send_len = iscsi_net->sendmsg((struct sonode *)socket, &msg); + send_len = iscsi_net->sendmsg(socket, &msg); DTRACE_PROBE2(sendmsg, size_t, total_len, size_t, send_len); if (total_len != send_len) { return (ISCSI_STATUS_TCP_TX_ERROR); @@ -873,7 +763,6 @@ iscsi_net_recvdata(void *socket, iscsi_hdr_t *ihp, char *data, } if (dlength) { - /* calculate pad */ pad_len = ((ISCSI_PAD_WORD_LEN - (dlength & (ISCSI_PAD_WORD_LEN - 1))) & @@ -1067,83 +956,3 @@ iscsi_net_interface() return (ISCSI_STATUS_SUCCESS); } } - -/* - * vp is needed to create the socket for the time being. 
- */ -static int -iscsi_ldi_vp_from_name(char *path, vnode_t **vpp) -{ - vnode_t *vp = NULL; - int ret; - - /* sanity check required input parameters */ - if ((path == NULL) || (vpp == NULL)) - return (EINVAL); - - if (modrootloaded) { - cred_t *saved_cred = curthread->t_cred; - - /* we don't want lookupname to fail because of credentials */ - curthread->t_cred = kcred; - - /* - * all lookups should be done in the global zone. but - * lookupnameat() won't actually do this if an absolute - * path is passed in. since the ldi interfaces require an - * absolute path we pass lookupnameat() a pointer to - * the character after the leading '/' and tell it to - * start searching at the current system root directory. - */ - ASSERT(*path == '/'); - ret = lookupnameat(path + 1, UIO_SYSSPACE, FOLLOW, NULLVPP, - &vp, rootdir); - - /* restore this threads credentials */ - curthread->t_cred = saved_cred; - - if (ret == 0) { - if (!vn_matchops(vp, spec_getvnodeops()) || - !VTYP_VALID(vp->v_type)) { - VN_RELE(vp); - return (ENXIO); - } - } - } - - if (vp == NULL) { - dev_info_t *dip; - dev_t dev; - int spec_type; - - /* - * Root is not mounted, the minor node is not specified, - * or an OBP path has been specified. - */ - - /* - * Determine if path can be pruned to produce an - * OBP or devfs path for resolve_pathname. - */ - if (strncmp(path, "/devices/", 9) == 0) - path += strlen("/devices"); - - /* - * if no minor node was specified the DEFAULT minor node - * will be returned. if there is no DEFAULT minor node - * one will be fabricated of type S_IFCHR with the minor - * number equal to the instance number. 
- */ - ret = resolve_pathname(path, &dip, &dev, &spec_type); - if (ret != 0) - return (ENODEV); - - ASSERT(STYP_VALID(spec_type)); - vp = makespecvp(dev, STYP_TO_VTYP(spec_type)); - spec_assoc_vp_with_devi(vp, dip); - ddi_release_devi(dip); - } - - *vpp = vp; - return (0); -} diff --git a/usr/src/uts/common/io/scsi/adapters/iscsi/isns_client.c b/usr/src/uts/common/io/scsi/adapters/iscsi/isns_client.c index fd5d226e0f..5ed6acdc2b 100644 --- a/usr/src/uts/common/io/scsi/adapters/iscsi/isns_client.c +++ b/usr/src/uts/common/io/scsi/adapters/iscsi/isns_client.c @@ -1518,7 +1518,11 @@ void struct sockaddr_in6 s_in6; } sa_rsvr = { 0 }; void *so; + struct sockaddr_in6 t_addr; + socklen_t t_addrlen; + bzero(&t_addr, sizeof (struct sockaddr_in6)); + t_addrlen = sizeof (struct sockaddr_in6); if (isns_server_addr->a_addr.i_insize == sizeof (struct in_addr)) { /* IPv4 */ sa_rsvr.s_in4.sin_family = AF_INET; @@ -1555,7 +1559,8 @@ void return (NULL); } - (void) iscsi_net->getsockname(so); + (void) iscsi_net->getsockname(so, (struct sockaddr *)&t_addr, + &t_addrlen); return (so); } @@ -2961,6 +2966,8 @@ isns_service_esi_scn(iscsi_thread_t *thread, void *arg) isns_pdu_t *in_pdu; size_t bytes_received, in_pdu_size = 0; uint8_t *lhba_handle; + struct sockaddr_in6 t_addr; + socklen_t t_addrlen; union { struct sockaddr sin; struct sockaddr_in s_in4; @@ -2978,12 +2985,13 @@ isns_service_esi_scn(iscsi_thread_t *thread, void *arg) /* Done using the argument - free it */ kmem_free(larg, sizeof (*larg)); + bzero(&t_addr, sizeof (struct sockaddr_in6)); + t_addrlen = sizeof (struct sockaddr_in6); - if (((struct sonode *)listening_so)->so_laddr.soa_len <= - sizeof (local_conn_prop)) { - bcopy(((struct sonode *)listening_so)->so_laddr.soa_sa, - &local_conn_prop, - ((struct sonode *)listening_so)->so_laddr.soa_len); + (void) iscsi_net->getsockname(listening_so, + (struct sockaddr *)&t_addr, &t_addrlen); + if (t_addrlen <= sizeof (local_conn_prop)) { + bcopy(&t_addr, &local_conn_prop, t_addrlen); 
} if (iscsi_net->listen(listening_so, 5) < 0) { @@ -2999,8 +3007,7 @@ isns_service_esi_scn(iscsi_thread_t *thread, void *arg) /* Blocking call */ connecting_so = iscsi_net->accept( - (struct sonode *)listening_so, - &clnt_addr.sin, &clnt_len); + listening_so, &clnt_addr.sin, &clnt_len); mutex_enter(&esi_scn_thr_mutex); if (esi_scn_thr_to_shutdown == B_TRUE) { @@ -3092,10 +3099,14 @@ find_local_portal(iscsi_addr_t *isns_server_addr, struct sockaddr_in6 s_in6; } serv_addr = { 0 }; void *so; + struct sockaddr_in6 t_addr; + socklen_t t_addrlen; *local_addr = NULL; *listening_so = NULL; + bzero(&t_addr, sizeof (struct sockaddr_in6)); + t_addrlen = sizeof (struct sockaddr_in6); /* * Determine the local IP address. */ @@ -3104,16 +3115,14 @@ find_local_portal(iscsi_addr_t *isns_server_addr, return (B_FALSE); } - if (((struct sonode *)so)->so_laddr.soa_len > - sizeof (local_conn_prop)) { + iscsi_net->getsockname(so, (struct sockaddr *)&t_addr, &t_addrlen); + if (t_addrlen > sizeof (local_conn_prop)) { iscsi_net->close(so); return (B_FALSE); } - bcopy(((struct sonode *)so)->so_laddr.soa_sa, - &local_conn_prop, - ((struct sonode *)so)->so_laddr.soa_len); - + bcopy(&t_addr, &local_conn_prop, t_addrlen); + t_addrlen = sizeof (struct sockaddr_in6); if (local_conn_prop.soa4.sin_family == AF_INET) { *local_addr = (iscsi_addr_t *)kmem_zalloc(sizeof (iscsi_addr_t), KM_SLEEP); @@ -3160,11 +3169,10 @@ find_local_portal(iscsi_addr_t *isns_server_addr, return (B_FALSE); } - if (((struct sonode *)so)->so_laddr.soa_len <= - sizeof (local_conn_prop)) { - bcopy(((struct sonode *)so)->so_laddr.soa_sa, - &local_conn_prop, - ((struct sonode *)so)->so_laddr.soa_len); + (void) iscsi_net->getsockname(so, (struct sockaddr *)&t_addr, + &t_addrlen); + if (t_addrlen <= sizeof (local_conn_prop)) { + bcopy(&t_addr, &local_conn_prop, t_addrlen); (*local_addr)->a_port = ntohs(local_conn_prop.soa4.sin_port); } else { (*local_addr)->a_port = ISNS_DEFAULT_ESI_SCN_PORT; diff --git 
a/usr/src/uts/common/io/sock_conf.c b/usr/src/uts/common/io/sock_conf.c new file mode 100644 index 0000000000..b6d31de8ea --- /dev/null +++ b/usr/src/uts/common/io/sock_conf.c @@ -0,0 +1,251 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include <sys/sysmacros.h> +#include <sys/atomic.h> +#include <sys/strsubr.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/cmn_err.h> +#include <sys/modctl.h> +#include <sys/sdt.h> + +list_t smod_list; +kmutex_t smod_list_lock; + +so_create_func_t sock_comm_create_function; +so_destroy_func_t sock_comm_destroy_function; + +static smod_info_t *smod_create(const char *); +static void smod_destroy(smod_info_t *); + +extern void smod_add(smod_info_t *); + +void +smod_init(void) +{ + list_create(&smod_list, sizeof (smod_info_t), + offsetof(smod_info_t, smod_node)); + mutex_init(&smod_list_lock, NULL, MUTEX_DEFAULT, NULL); +} + +static smod_info_t * +smod_find(const char *modname) +{ + smod_info_t *smodp; + + ASSERT(MUTEX_HELD(&smod_list_lock)); + + for (smodp = list_head(&smod_list); smodp != NULL; + smodp = list_next(&smod_list, smodp)) + if (strcmp(smodp->smod_name, modname) == 0) + return (smodp); + return (NULL); +} + +/* + * Register the socket module. + */ +int +smod_register(const smod_reg_t *reg) +{ + smod_info_t *smodp; + + /* + * Make sure the socket module does not depend on capabilities + * not available on the system. 
+ */ + if (reg->smod_version != SOCKMOD_VERSION || + reg->smod_dc_version != SOCK_DC_VERSION || + reg->smod_uc_version != SOCK_UC_VERSION) { + cmn_err(CE_WARN, + "Failed to register socket module %s: version mismatch", + reg->smod_name); + return (EINVAL); + } + +#ifdef DEBUG + mutex_enter(&smod_list_lock); + if ((smodp = smod_find(reg->smod_name)) != NULL) { + mutex_exit(&smod_list_lock); + return (EEXIST); + } + mutex_exit(&smod_list_lock); +#endif + + smodp = smod_create(reg->smod_name); + smodp->smod_version = reg->smod_version; + if (strcmp(smodp->smod_name, SOTPI_SMOD_NAME) == 0 || + strcmp(smodp->smod_name, "socksctp") == 0 || + strcmp(smodp->smod_name, "socksdp") == 0) { + ASSERT(smodp->smod_proto_create_func == NULL); + ASSERT(reg->__smod_priv != NULL); + smodp->smod_sock_create_func = + reg->__smod_priv->smodp_sock_create_func; + smodp->smod_sock_destroy_func = + reg->__smod_priv->smodp_sock_destroy_func; + smodp->smod_proto_create_func = NULL; + } else { + if (reg->smod_proto_create_func == NULL || + (reg->__smod_priv != NULL && + (reg->__smod_priv->smodp_sock_create_func != NULL || + reg->__smod_priv->smodp_sock_destroy_func != NULL))) { +#ifdef DEBUG + cmn_err(CE_CONT, "smod_register of %s failed", + smodp->smod_name); +#endif + smod_destroy(smodp); + return (EINVAL); + } + smodp->smod_proto_create_func = reg->smod_proto_create_func; + smodp->smod_sock_create_func = sock_comm_create_function; + smodp->smod_sock_destroy_func = sock_comm_destroy_function; + smodp->smod_uc_version = reg->smod_uc_version; + smodp->smod_dc_version = reg->smod_dc_version; + if (reg->__smod_priv != NULL) { + smodp->smod_proto_fallback_func = + reg->__smod_priv->smodp_proto_fallback_func; + } + } + smod_add(smodp); + return (0); +} + +/* + * Unregister the socket module + */ +int +smod_unregister(const char *mod_name) +{ + smod_info_t *smodp; + + mutex_enter(&smod_list_lock); + if ((smodp = smod_find(mod_name)) != NULL) { + if (smodp->smod_refcnt != 0) { + 
mutex_exit(&smod_list_lock); + return (EBUSY); + } else { + /* + * Delete the entry from the socket module list. + */ + list_remove(&smod_list, smodp); + mutex_exit(&smod_list_lock); + + smod_destroy(smodp); + return (0); + } + } + mutex_exit(&smod_list_lock); + + return (ENXIO); +} + +/* + * Initialize the socket module entry. + */ +static smod_info_t * +smod_create(const char *modname) +{ + smod_info_t *smodp; + int len; + + smodp = kmem_zalloc(sizeof (*smodp), KM_SLEEP); + len = strlen(modname) + 1; + smodp->smod_name = kmem_alloc(len, KM_SLEEP); + bcopy(modname, smodp->smod_name, len); + smodp->smod_name[len - 1] = '\0'; + return (smodp); +} + +/* + * Clean up the socket module part of the sockparams entry. + */ +static void +smod_destroy(smod_info_t *smodp) +{ + ASSERT(smodp->smod_name != NULL); + ASSERT(smodp->smod_refcnt == 0); + ASSERT(!list_link_active(&smodp->smod_node)); + ASSERT(strcmp(smodp->smod_name, "socktpi") != 0); + + kmem_free(smodp->smod_name, strlen(smodp->smod_name) + 1); + smodp->smod_name = NULL; + smodp->smod_proto_create_func = NULL; + smodp->smod_sock_create_func = NULL; + smodp->smod_sock_destroy_func = NULL; + kmem_free(smodp, sizeof (*smodp)); +} + +/* + * Add an entry at the front of the socket module list. + */ +void +smod_add(smod_info_t *smodp) +{ + ASSERT(smodp != NULL); + mutex_enter(&smod_list_lock); + list_insert_head(&smod_list, smodp); + mutex_exit(&smod_list_lock); +} + +/* + * Lookup the socket module table by the socket module name. + * If there is an existing entry, then increase the reference count. + * Otherwise we load the module and in the module register function create + * a new entry and add it to the end of the socket module table. + */ +smod_info_t * +smod_lookup_byname(const char *modname) +{ + smod_info_t *smodp; + int error; + +again: + /* + * If find an entry, increase the reference count and + * return the entry pointer. 
+ */ + mutex_enter(&smod_list_lock); + if ((smodp = smod_find(modname)) != NULL) { + SMOD_INC_REF(smodp); + mutex_exit(&smod_list_lock); + return (smodp); + } + mutex_exit(&smod_list_lock); + + /* + * We have a sockmod, and it is not loaded. + * Load the module into the kernel, modload() will + * take care of the multiple threads. + */ + DTRACE_PROBE1(load__socket__module, char *, modname); + error = modload(SOCKMOD_PATH, modname); + if (error == -1) { + cmn_err(CE_CONT, "modload of %s/%s failed", + SOCKMOD_PATH, modname); + return (NULL); + } + goto again; +} diff --git a/usr/src/uts/common/io/strplumb.c b/usr/src/uts/common/io/strplumb.c index 27b9cc8843..33406bea05 100644 --- a/usr/src/uts/common/io/strplumb.c +++ b/usr/src/uts/common/io/strplumb.c @@ -62,6 +62,7 @@ #include <inet/ip6.h> #include <inet/tcp.h> #include <inet/sctp_ip.h> +#include <inet/udp_impl.h> #include <sys/strlog.h> #include <sys/log.h> diff --git a/usr/src/uts/common/netinet/icmp6.h b/usr/src/uts/common/netinet/icmp6.h index 2d8903d6f1..560b825595 100644 --- a/usr/src/uts/common/netinet/icmp6.h +++ b/usr/src/uts/common/netinet/icmp6.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _NETINET_ICMP6_H #define _NETINET_ICMP6_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -481,6 +478,7 @@ typedef struct icmp6_filter { #define ICMP6_FILTER_WILLBLOCK(type, filterp) \ ((((filterp)->__icmp6_filt[(type) >> 5]) & (1 << ((type) & 31))) == 0) +#define ICMP_IOC_DEFAULT_Q (('I' << 8) + 51) #ifdef __cplusplus } diff --git a/usr/src/uts/common/os/fio.c b/usr/src/uts/common/os/fio.c index 13f592993a..d78f4bbdb0 100644 --- a/usr/src/uts/common/os/fio.c +++ b/usr/src/uts/common/os/fio.c @@ -23,12 +23,10 @@ /* All Rights Reserved */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/sysmacros.h> #include <sys/param.h> @@ -1167,11 +1165,8 @@ f_getfl(int fd, int *flagp) /* * BSD fcntl() FASYNC compatibility. - * - * SCTP doesn't have an associated stream and thus - * doesn't store flags on it. */ - if ((vp->v_type == VSOCK) && (vp->v_stream != NULL)) + if (vp->v_type == VSOCK) flag |= sock_getfasync(vp); *flagp = flag; error = 0; diff --git a/usr/src/uts/common/os/modconf.c b/usr/src/uts/common/os/modconf.c index 7c41975c48..cf25d86183 100644 --- a/usr/src/uts/common/os/modconf.c +++ b/usr/src/uts/common/os/modconf.c @@ -23,8 +23,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/systm.h> #include <sys/param.h> @@ -59,6 +57,7 @@ #include <sys/cpc_pcbe.h> #include <sys/kstat.h> #include <sys/fs/sdev_node.h> +#include <sys/socketvar.h> #include <sys/kiconv.h> extern int moddebug; @@ -186,6 +185,17 @@ struct mod_ops mod_strmodops = { }; /* + * Socket modules. 
+ */ +static int mod_infosockmod(struct modlsockmod *, struct modlinkage *, int *); +static int mod_installsockmod(struct modlsockmod *, struct modlinkage *); +static int mod_removesockmod(struct modlsockmod *, struct modlinkage *); + +struct mod_ops mod_sockmodops = { + mod_installsockmod, mod_removesockmod, mod_infosockmod +}; + +/* * Scheduling classes. */ static int mod_infosched(struct modlsched *, struct modlinkage *, int *); @@ -1178,6 +1188,59 @@ mod_removestrmod(struct modlstrmod *modl, struct modlinkage *modlp) } /* + * Get status of a socket module. + */ +/*ARGSUSED*/ +static int +mod_infosockmod(struct modlsockmod *modl, struct modlinkage *modlp, int *p0) +{ + *p0 = -1; /* no useful info */ + return (0); +} + +/* + * Install a socket module. + */ +/*ARGSUSED*/ +static int +mod_installsockmod(struct modlsockmod *modl, struct modlinkage *modlp) +{ + struct modctl *mcp; + char *mod_name; + + mcp = mod_getctl(modlp); + ASSERT(mcp != NULL); + mod_name = mcp->mod_modname; + if (strcmp(mod_name, modl->sockmod_reg_info->smod_name) != 0) { +#ifdef DEBUG + cmn_err(CE_CONT, "mod_installsockmod: different names" + " %s != %s \n", mod_name, + modl->sockmod_reg_info->smod_name); +#endif + return (EINVAL); + } + + /* + * Register module. + */ + return (smod_register(modl->sockmod_reg_info)); +} + +/* + * Remove a socket module. + */ +/*ARGSUSED*/ +static int +mod_removesockmod(struct modlsockmod *modl, struct modlinkage *modlp) +{ + /* + * unregister from the global socket creation table + * check the refcnt in the lookup table + */ + return (smod_unregister(modl->sockmod_reg_info->smod_name)); +} + +/* * Get status of a scheduling class module. 
*/ /*ARGSUSED1*/ diff --git a/usr/src/uts/common/os/move.c b/usr/src/uts/common/os/move.c index 7e1c2f2d62..d4a127794f 100644 --- a/usr/src/uts/common/os/move.c +++ b/usr/src/uts/common/os/move.c @@ -558,8 +558,6 @@ uioainit(uio_t *uiop, uioa_t *uioap) uioap->uioa_mbytes = 0; - uioap->uioa_mbytes = 0; - /* uio_t/uioa_t uio_t common struct copy */ *((uio_t *)uioap) = *uiop; diff --git a/usr/src/uts/common/os/streamio.c b/usr/src/uts/common/os/streamio.c index 236626a4f0..42d0b8e17c 100644 --- a/usr/src/uts/common/os/streamio.c +++ b/usr/src/uts/common/os/streamio.c @@ -77,6 +77,7 @@ #include <sys/policy.h> #include <sys/dld.h> #include <sys/zone.h> +#include <sys/sodirect.h> /* * This define helps improve the readability of streams code while @@ -1110,50 +1111,7 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first, } bp = getq_noenab(q, rbytes); - if (bp != NULL && (bp->b_datap->db_flags & DBLK_UIOA)) { - /* - * A uioa flaged mblk_t chain, already uio processed, - * add it to the sodirect uioa pending free list. - * - * Note, a b_cont chain headed by a DBLK_UIOA enable - * mblk_t must have all mblk_t(s) DBLK_UIOA enabled. - */ - mblk_t *bpt = sodp->sod_uioaft; - - ASSERT(sodp != NULL); - ASSERT(msgdsize(bp) == sodp->sod_uioa.uioa_mbytes); - - /* - * Add first mblk_t of "bp" chain to current sodirect uioa - * free list tail mblk_t, if any, else empty list so new head. - */ - if (bpt == NULL) - sodp->sod_uioafh = bp; - else - bpt->b_cont = bp; - - /* - * Walk mblk_t "bp" chain to find tail and adjust rptr of - * each to reflect that uioamove() has consumed all data. 
- */ - bpt = bp; - for (;;) { - bpt->b_rptr = bpt->b_wptr; - if (bpt->b_cont == NULL) - break; - bpt = bpt->b_cont; - - ASSERT(bpt->b_datap->db_flags & DBLK_UIOA); - } - /* New sodirect uioa free list tail */ - sodp->sod_uioaft = bpt; - - /* Only 1 strget() with data returned per uioa_t */ - if (sodp->sod_uioa.uioa_state & UIOA_ENABLED) { - sodp->sod_uioa.uioa_state &= UIOA_CLR; - sodp->sod_uioa.uioa_state |= UIOA_FINI; - } - } + sod_uioa_mblk_done(sodp, bp); return (bp); } diff --git a/usr/src/uts/common/os/strsubr.c b/usr/src/uts/common/os/strsubr.c index 442ced2b51..469ef329db 100644 --- a/usr/src/uts/common/os/strsubr.c +++ b/usr/src/uts/common/os/strsubr.c @@ -286,7 +286,6 @@ static void outer_insert(syncq_t *, syncq_t *); static void outer_remove(syncq_t *, syncq_t *); static void write_now(syncq_t *); static void clr_qfull(queue_t *); -static void enable_svc(queue_t *); static void runbufcalls(void); static void sqenable(syncq_t *); static void sqfill_events(syncq_t *, queue_t *, mblk_t *, void (*)()); @@ -8401,6 +8400,21 @@ mblk_setcred(mblk_t *mp, cred_t *cr) } } +/* + * Set the cred and pid for each mblk in the message. It is assumed that + * the message passed in does not already have a cred. 
+ */ +void +msg_setcredpid(mblk_t *mp, cred_t *cr, pid_t pid) +{ + while (mp != NULL) { + ASSERT(DB_CRED(mp) == NULL); + mblk_setcred(mp, cr); + DB_CPID(mp) = pid; + mp = mp->b_cont; + } +} + int hcksum_assoc(mblk_t *mp, multidata_t *mmd, pdesc_t *pd, uint32_t start, uint32_t stuff, uint32_t end, uint32_t value, diff --git a/usr/src/uts/common/smbsrv/smb_kproto.h b/usr/src/uts/common/smbsrv/smb_kproto.h index 2131c88e19..b14005074a 100644 --- a/usr/src/uts/common/smbsrv/smb_kproto.h +++ b/usr/src/uts/common/smbsrv/smb_kproto.h @@ -38,6 +38,7 @@ extern "C" { #include <sys/socket.h> #include <sys/strsubr.h> #include <sys/socketvar.h> +#include <sys/ksocket.h> #include <sys/cred.h> #include <smbsrv/smb_vops.h> #include <smbsrv/smb_xdr.h> @@ -307,19 +308,17 @@ uint32_t smb_decode_sd(struct smb_xa *, smb_sd_t *); /* * Socket functions */ -struct sonode *smb_socreate(int domain, int type, int protocol); -void smb_soshutdown(struct sonode *so); -void smb_sodestroy(struct sonode *so); -int smb_sorecv(struct sonode *so, void *msg, size_t len); -int smb_iov_sorecv(struct sonode *so, iovec_t *iop, int iovlen, - size_t total_len); +ksocket_t smb_socreate(int domain, int type, int protocol); +void smb_soshutdown(ksocket_t so); +void smb_sodestroy(ksocket_t so); +int smb_sorecv(ksocket_t so, void *msg, size_t len); int smb_net_init(void); void smb_net_fini(void); void smb_net_txl_constructor(smb_txlst_t *); void smb_net_txl_destructor(smb_txlst_t *); smb_txreq_t *smb_net_txr_alloc(void); void smb_net_txr_free(smb_txreq_t *); -int smb_net_txr_send(struct sonode *, smb_txlst_t *, smb_txreq_t *); +int smb_net_txr_send(ksocket_t, smb_txlst_t *, smb_txreq_t *); /* * SMB RPC interface @@ -489,7 +488,7 @@ void smb_request_cancel(smb_request_t *sr); /* * session functions (file smb_session.c) */ -smb_session_t *smb_session_create(struct sonode *, uint16_t, smb_server_t *); +smb_session_t *smb_session_create(ksocket_t, uint16_t, smb_server_t *); int smb_session_daemon(smb_session_list_t 
*); void smb_session_reconnection_check(smb_session_list_t *, smb_session_t *); void smb_session_timers(smb_session_list_t *); diff --git a/usr/src/uts/common/smbsrv/smb_ktypes.h b/usr/src/uts/common/smbsrv/smb_ktypes.h index 13f5783116..918746a701 100644 --- a/usr/src/uts/common/smbsrv/smb_ktypes.h +++ b/usr/src/uts/common/smbsrv/smb_ktypes.h @@ -46,6 +46,8 @@ extern "C" { #include <sys/stat.h> #include <sys/vnode.h> #include <sys/cred.h> +#include <netinet/in.h> +#include <sys/ksocket.h> #include <sys/fem.h> #include <sys/door.h> #include <smbsrv/smb.h> @@ -683,7 +685,7 @@ typedef struct smb_session { uint32_t capabilities; struct smb_sign signing; - struct sonode *sock; + ksocket_t sock; smb_slist_t s_req_list; smb_llist_t s_xa_list; @@ -1453,7 +1455,7 @@ typedef struct { typedef struct { kthread_t *ld_kth; kt_did_t ld_ktdid; - struct sonode *ld_so; + ksocket_t ld_so; struct sockaddr_in ld_sin; smb_session_list_t ld_session_list; } smb_listener_daemon_t; diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index cecccf50ab..451ce87f1f 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -329,6 +329,7 @@ CHKHDRS= \ kmem_impl.h \ kobj.h \ kobj_impl.h \ + ksocket.h \ kstat.h \ kstr.h \ ksyms.h \ @@ -503,6 +504,7 @@ CHKHDRS= \ sobject.h \ socket.h \ socket_impl.h \ + socket_proto.h \ socketvar.h \ sockio.h \ sodirect.h \ diff --git a/usr/src/uts/common/sys/idm/idm_so.h b/usr/src/uts/common/sys/idm/idm_so.h index 134896ed4f..42c39c6461 100644 --- a/usr/src/uts/common/sys/idm/idm_so.h +++ b/usr/src/uts/common/sys/idm/idm_so.h @@ -31,7 +31,7 @@ extern "C" { #endif #include <sys/idm/idm_transport.h> - +#include <sys/ksocket.h> /* * Define TCP window size (send and receive buffer sizes) */ @@ -41,7 +41,7 @@ extern "C" { /* sockets-specific portion of idm_svc_t */ typedef struct idm_so_svc_s { - struct sonode *is_so; + ksocket_t is_so; kthread_t *is_thread; kt_did_t is_thread_did; boolean_t is_thread_running; @@ 
-49,7 +49,7 @@ typedef struct idm_so_svc_s { /* sockets-specific portion of idm_conn_t */ typedef struct idm_so_conn_s { - struct sonode *ic_so; + ksocket_t ic_so; kthread_t *ic_tx_thread; kt_did_t ic_tx_thread_did; @@ -68,24 +68,24 @@ void idm_so_fini(); /* Socket functions */ -struct sonode * +ksocket_t idm_socreate(int domain, int type, int protocol); -void idm_soshutdown(struct sonode *so); +void idm_soshutdown(ksocket_t so); -void idm_sodestroy(struct sonode *so); +void idm_sodestroy(ksocket_t so); int idm_get_ipaddr(idm_addr_list_t **); -int idm_sorecv(struct sonode *so, void *msg, size_t len); +int idm_sorecv(ksocket_t so, void *msg, size_t len); -int idm_sosendto(struct sonode *so, void *buff, size_t len, +int idm_sosendto(ksocket_t so, void *buff, size_t len, struct sockaddr *name, socklen_t namelen); -int idm_iov_sosend(struct sonode *so, iovec_t *iop, int iovlen, +int idm_iov_sosend(ksocket_t so, iovec_t *iop, int iovlen, size_t total_len); -int idm_iov_sorecv(struct sonode *so, iovec_t *iop, int iovlen, +int idm_iov_sorecv(ksocket_t so, iovec_t *iop, int iovlen, size_t total_len); void idm_sotx_thread(void *arg); diff --git a/usr/src/uts/common/sys/iscsit/radius_packet.h b/usr/src/uts/common/sys/iscsit/radius_packet.h index bbf96d5cb2..80ee57a202 100644 --- a/usr/src/uts/common/sys/iscsit/radius_packet.h +++ b/usr/src/uts/common/sys/iscsit/radius_packet.h @@ -32,7 +32,7 @@ extern "C" { #include <netinet/in.h> #include <sys/types.h> - +#include <sys/ksocket.h> #include <sys/iscsit/radius_protocol.h> /* A total of RAD_RCV_TIMEOUT * RAD_RETRY_MAX seconds timeout. */ @@ -69,7 +69,7 @@ typedef struct radius_packet_data { * */ int -iscsit_snd_radius_request(void *socket, +iscsit_snd_radius_request(ksocket_t socket, iscsi_ipaddr_t rsvr_ip_addr, uint32_t rsvr_port, radius_packet_data_t *packet_data); @@ -85,7 +85,7 @@ iscsit_snd_radius_request(void *socket, * Return receive status. 
*/ int -iscsit_rcv_radius_response(void *socket, +iscsit_rcv_radius_response(ksocket_t socket, uint8_t *shared_secret, uint32_t shared_secret_len, uint8_t *req_authenticator, diff --git a/usr/src/uts/common/sys/ksocket.h b/usr/src/uts/common/sys/ksocket.h new file mode 100644 index 0000000000..fb834b027f --- /dev/null +++ b/usr/src/uts/common/sys/ksocket.h @@ -0,0 +1,127 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_KSOCKET_H_ +#define _SYS_KSOCKET_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* Opaque kernel socket type */ +typedef struct __ksocket *ksocket_t; +struct nmsghdr; + +/* flag bit for each Callback Event */ +#define KSOCKET_CB_CONNECTED 0x00000001 +#define KSOCKET_CB_CONNECTFAILED 0x00000002 +#define KSOCKET_CB_DISCONNECTED 0x00000004 +#define KSOCKET_CB_NEWDATA 0x00000008 +#define KSOCKET_CB_NEWCONN 0x00000010 +#define KSOCKET_CB_CANSEND 0x00000020 +#define KSOCKET_CB_OOBDATA 0x00000040 +#define KSOCKET_CB_CANTSENDMORE 0x00000080 +#define KSOCKET_CB_CANTRECVMORE 0x00000100 +#define KSOCKET_CB_ERROR 0x00000200 + +/* + * Kernel Socket Callback Events + */ +typedef enum ksocket_event { + KSOCKET_EV_CONNECTED, + KSOCKET_EV_CONNECTFAILED, + KSOCKET_EV_DISCONNECTED, + KSOCKET_EV_OOBDATA, + KSOCKET_EV_NEWDATA, + KSOCKET_EV_NEWCONN, + KSOCKET_EV_CANSEND, + KSOCKET_EV_CANTSENDMORE, + KSOCKET_EV_CANTRECVMORE, + KSOCKET_EV_ERROR +} ksocket_callback_event_t; + +typedef void (*ksocket_callback_t)(ksocket_t, ksocket_callback_event_t, + void *, uintptr_t); + +typedef struct ksocket_callbacks { + uint32_t ksock_cb_flags; + ksocket_callback_t ksock_cb_connected; + ksocket_callback_t ksock_cb_connectfailed; + ksocket_callback_t ksock_cb_disconnected; + ksocket_callback_t ksock_cb_newdata; + ksocket_callback_t ksock_cb_newconn; + ksocket_callback_t ksock_cb_cansend; + ksocket_callback_t ksock_cb_oobdata; + ksocket_callback_t ksock_cb_cantsendmore; + ksocket_callback_t ksock_cb_cantrecvmore; + ksocket_callback_t ksock_cb_error; +} ksocket_callbacks_t; + +#define KSOCKET_SLEEP SOCKET_SLEEP +#define KSOCKET_NOSLEEP SOCKET_NOSLEEP + +extern int ksocket_socket(ksocket_t *, int, int, int, int, struct cred *); +extern int ksocket_bind(ksocket_t, struct sockaddr *, socklen_t, + struct cred *); +extern int ksocket_listen(ksocket_t, int, struct cred *); +extern int ksocket_accept(ksocket_t, struct sockaddr *, socklen_t *, + ksocket_t *, struct cred *); +extern int 
ksocket_connect(ksocket_t, const struct sockaddr *, socklen_t, + struct cred *); +extern int ksocket_send(ksocket_t, void *, size_t, int, size_t *, + struct cred *); +extern int ksocket_sendto(ksocket_t, void *, size_t, int, + struct sockaddr *, socklen_t, size_t *, struct cred *); +extern int ksocket_sendmsg(ksocket_t, struct nmsghdr *, int, size_t *, + struct cred *); +extern int ksocket_sendmblk(ksocket_t, struct nmsghdr *, int, mblk_t **, + struct cred *); +extern int ksocket_recv(ksocket_t, void *, size_t, int, size_t *, + struct cred *); +extern int ksocket_recvfrom(ksocket_t, void *, size_t, int, + struct sockaddr *, socklen_t *, size_t *, struct cred *); +extern int ksocket_recvmsg(ksocket_t, struct nmsghdr *, int, size_t *, + struct cred *); +extern int ksocket_shutdown(ksocket_t, int, struct cred *); +extern int ksocket_setsockopt(ksocket_t, int, int, const void *, int, + struct cred *); +extern int ksocket_getsockopt(ksocket_t, int, int, void *, int *, + struct cred *); +extern int ksocket_getpeername(ksocket_t, struct sockaddr *, socklen_t *, + struct cred *); +extern int ksocket_getsockname(ksocket_t, struct sockaddr *, socklen_t *, + struct cred *); +extern int ksocket_ioctl(ksocket_t, int, intptr_t, int *, struct cred *); +extern int ksocket_setcallbacks(ksocket_t, ksocket_callbacks_t *, void *, + struct cred *); +extern int ksocket_close(ksocket_t, struct cred *); +extern void ksocket_hold(ksocket_t); +extern void ksocket_rele(ksocket_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_KSOCKET_H_ */ diff --git a/usr/src/uts/common/sys/modctl.h b/usr/src/uts/common/sys/modctl.h index 47a83b15d9..ed0811c580 100644 --- a/usr/src/uts/common/sys/modctl.h +++ b/usr/src/uts/common/sys/modctl.h @@ -26,8 +26,6 @@ #ifndef _SYS_MODCTL_H #define _SYS_MODCTL_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * loadable module support. 
*/ @@ -73,6 +71,7 @@ extern struct mod_ops mod_miscops; extern struct mod_ops mod_schedops; extern struct mod_ops mod_strmodops; extern struct mod_ops mod_syscallops; +extern struct mod_ops mod_sockmodops; #ifdef _SYSCALL32_IMPL extern struct mod_ops mod_syscallops32; #endif @@ -191,6 +190,13 @@ struct modldev { struct devname_ops *dev_ops; }; +/* For socket Modules. */ +struct modlsockmod { + struct mod_ops *sockmod_modops; + char *sockmod_linkinfo; + struct smod_reg_s *sockmod_reg_info; +}; + /* For kiconv modules */ struct modlkiconv { struct mod_ops *kiconv_modops; diff --git a/usr/src/uts/common/sys/socket.h b/usr/src/uts/common/sys/socket.h index 0432b529be..593505a426 100644 --- a/usr/src/uts/common/sys/socket.h +++ b/usr/src/uts/common/sys/socket.h @@ -120,6 +120,15 @@ typedef void *_RESTRICT_KYWD Psocklen_t; #ifdef _KERNEL #define SO_SND_COPYAVOID 0x0800 /* Internal: use zero-copy */ +#define SO_SND_BUFINFO 0x1000 /* Internal: get buffer info */ + /* when doing zero-copy */ + +struct so_snd_bufinfo { + ushort_t sbi_wroff; /* Write offset */ + ssize_t sbi_maxblk; /* Max size of a single mblk */ + ssize_t sbi_maxpsz; /* Max total size of a mblk chain */ + ushort_t sbi_tail; /* Extra space available at the end */ +}; #endif /* _KERNEL */ /* @@ -143,6 +152,7 @@ typedef void *_RESTRICT_KYWD Psocklen_t; #define SO_ANON_MLP 0x100a /* create MLP on anonymous bind */ #define SO_MAC_EXEMPT 0x100b /* allow dominated unlabeled peers */ #define SO_DOMAIN 0x100c /* get socket domain */ +#define SO_RCVPSH 0x100d /* receive interval to push data */ /* "Socket"-level control message types: */ #define SCM_RIGHTS 0x1010 /* access rights (array of int) */ @@ -167,6 +177,21 @@ typedef void *_RESTRICT_KYWD Psocklen_t; */ #define SO_ACCEPTOR 0x20000 /* acceptor socket */ #define SO_SOCKSTR 0x40000 /* normal socket stream */ +#define SO_FALLBACK 0x80000 /* fallback to TPI socket */ + +/* + * Flags for socket_create() and socket_newconn() + */ +#define SOCKET_SLEEP KM_SLEEP 
+#define SOCKET_NOSLEEP KM_NOSLEEP + + +/* + * flags used by sockfs when falling back to tpi socket + */ +#define SO_FB_START 0x1 +#define SO_FB_FINISH 0x2 + #endif /* _KERNEL */ /* @@ -340,6 +365,8 @@ struct msghdr32 { #define MSG_CTRUNC 0x10 /* Control data truncated */ #define MSG_TRUNC 0x20 /* Normal data truncated */ #define MSG_WAITALL 0x40 /* Wait for complete recv or error */ +#define MSG_DUPCTRL 0x800 /* Save control message for use with */ + /* left over data */ /* End of XPGv2 compliance */ #define MSG_DONTWAIT 0x80 /* Don't block for this recv */ #define MSG_NOTIFICATION 0x100 /* Notification, not data */ @@ -347,6 +374,18 @@ struct msghdr32 { #define MSG_MAXIOVLEN 16 +#ifdef _KERNEL + +/* + * for kernel socket only + */ +#define MSG_MBLK_QUICKRELE 0x10000000 /* free mblk chain */ + /* in timely manner */ +#define MSG_USERSPACE 0x20000000 /* buffer from user space */ + +#endif /* _KERNEL */ + + /* Added for XPGv2 compliance */ #define SHUT_RD 0 #define SHUT_WR 1 diff --git a/usr/src/uts/common/sys/socket_proto.h b/usr/src/uts/common/sys/socket_proto.h new file mode 100644 index 0000000000..8f60ea9e31 --- /dev/null +++ b/usr/src/uts/common/sys/socket_proto.h @@ -0,0 +1,182 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_SOCKET_PROTO_H_ +#define _SYS_SOCKET_PROTO_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/socket.h> + +/* + * Generation count + */ +typedef uint64_t sock_connid_t; + +#define SOCK_CONNID_INIT(id) { \ + (id) = 0; \ +} +#define SOCK_CONNID_BUMP(id) (++(id)) +#define SOCK_CONNID_LT(id1, id2) ((int64_t)((id1)-(id2)) < 0) + +/* Socket protocol properties */ +struct sock_proto_props { + uint_t sopp_flags; /* options to set */ + ushort_t sopp_wroff; /* write offset */ + ssize_t sopp_txhiwat; /* tx hi water mark */ + ssize_t sopp_txlowat; /* tx lo water mark */ + ssize_t sopp_rxhiwat; /* recv high water mark */ + ssize_t sopp_rxlowat; /* recv low water mark */ + ssize_t sopp_maxblk; /* maximum message block size */ + ssize_t sopp_maxpsz; /* maximum packet size */ + ssize_t sopp_minpsz; /* minimum packet size */ + ushort_t sopp_tail; /* space available at the end */ + uint_t sopp_zcopyflag; /* zero copy flag */ + boolean_t sopp_oobinline; /* OOB inline */ + uint_t sopp_rcvtimer; /* delayed recv notification (time) */ + uint32_t sopp_rcvthresh; /* delayed recv notification (bytes) */ + socklen_t sopp_maxaddrlen; /* maximum size of protocol address */ +}; + +/* flags to determine which socket options are set */ +#define SOCKOPT_WROFF 0x0001 /* set write offset */ +#define SOCKOPT_RCVHIWAT 0x0002 /* set read side high water */ +#define SOCKOPT_RCVLOWAT 0x0004 /* set read side low water */ +#define SOCKOPT_MAXBLK 0x0008 /* set maximum message block size */ +#define SOCKOPT_TAIL 0x0010 /* set the extra allocated space */ +#define SOCKOPT_ZCOPY 0x0020 /* set/unset zero copy for sendfile */ +#define 
SOCKOPT_MAXPSZ 0x0040 /* set maxpsz for protocols */ +#define SOCKOPT_OOBINLINE 0x0080 /* set oob inline processing */ +#define SOCKOPT_RCVTIMER 0x0100 +#define SOCKOPT_RCVTHRESH 0x0200 +#define SOCKOPT_MAXADDRLEN 0x0400 /* set max address length */ +#define SOCKOPT_MINPSZ 0x0800 /* set minpsz for protocols */ + +#define IS_SO_OOB_INLINE(so) ((so)->so_proto_props.sopp_oobinline) + +#ifdef _KERNEL + +struct T_capability_ack; + +typedef struct sock_upcalls_s sock_upcalls_t; +typedef struct sock_downcalls_s sock_downcalls_t; + +/* + * Upcall and downcall handle for sockfs and transport layer. + */ +typedef struct __sock_upper_handle *sock_upper_handle_t; +typedef struct __sock_lower_handle *sock_lower_handle_t; + +struct sock_downcalls_s { + void (*sd_activate)(sock_lower_handle_t, sock_upper_handle_t, + sock_upcalls_t *, int, cred_t *); + int (*sd_accept)(sock_lower_handle_t, sock_lower_handle_t, + sock_upper_handle_t, cred_t *); + int (*sd_bind)(sock_lower_handle_t, struct sockaddr *, socklen_t, + cred_t *); + int (*sd_listen)(sock_lower_handle_t, int, cred_t *); + int (*sd_connect)(sock_lower_handle_t, const struct sockaddr *, + socklen_t, sock_connid_t *, cred_t *); + int (*sd_getpeername)(sock_lower_handle_t, struct sockaddr *, + socklen_t *, cred_t *); + int (*sd_getsockname)(sock_lower_handle_t, struct sockaddr *, + socklen_t *, cred_t *); + int (*sd_getsockopt)(sock_lower_handle_t, int, int, void *, + socklen_t *, cred_t *); + int (*sd_setsockopt)(sock_lower_handle_t, int, int, const void *, + socklen_t, cred_t *); + int (*sd_send)(sock_lower_handle_t, mblk_t *, struct nmsghdr *, + cred_t *); + int (*sd_send_uio)(sock_lower_handle_t, uio_t *, struct nmsghdr *, + cred_t *); + int (*sd_recv_uio)(sock_lower_handle_t, uio_t *, struct nmsghdr *, + cred_t *); + short (*sd_poll)(sock_lower_handle_t, short, int, cred_t *); + int (*sd_shutdown)(sock_lower_handle_t, int, cred_t *); + void (*sd_clr_flowctrl)(sock_lower_handle_t); + int (*sd_ioctl)(sock_lower_handle_t, 
int, intptr_t, int, + int32_t *, cred_t *); + int (*sd_close)(sock_lower_handle_t, int, cred_t *); +}; + +typedef sock_lower_handle_t (*so_proto_create_func_t)(int, int, int, + sock_downcalls_t **, uint_t *, int *, int, cred_t *); + +typedef void (*so_proto_quiesced_cb_t)(sock_upper_handle_t, queue_t *, + struct T_capability_ack *, struct sockaddr *, socklen_t, + struct sockaddr *, socklen_t, short); +typedef void (*so_proto_fallback_func_t)(sock_lower_handle_t, queue_t *, + boolean_t, so_proto_quiesced_cb_t); + +/* + * Upcalls and related information + */ + +/* + * su_opctl() actions + */ +typedef enum sock_opctl_action { + SOCK_OPCTL_ENAB_ACCEPT = 0, + SOCK_OPCTL_SHUT_SEND, + SOCK_OPCTL_SHUT_RECV +} sock_opctl_action_t; + +struct sock_upcalls_s { + sock_upper_handle_t (*su_newconn)(sock_upper_handle_t, + sock_lower_handle_t, sock_downcalls_t *, cred_t *, pid_t, + sock_upcalls_t **); + void (*su_connected)(sock_upper_handle_t, sock_connid_t, cred_t *, + pid_t); + int (*su_disconnected)(sock_upper_handle_t, sock_connid_t, int); + void (*su_opctl)(sock_upper_handle_t, sock_opctl_action_t, + uintptr_t); + ssize_t (*su_recv)(sock_upper_handle_t, mblk_t *, size_t, int, + int *, boolean_t *); + void (*su_set_proto_props)(sock_upper_handle_t, + struct sock_proto_props *); + void (*su_txq_full)(sock_upper_handle_t, boolean_t); + void (*su_signal_oob)(sock_upper_handle_t, ssize_t); + void (*su_zcopy_notify)(sock_upper_handle_t); + void (*su_set_error)(sock_upper_handle_t, int); +}; + +#define SOCK_UC_VERSION sizeof (sock_upcalls_t) +#define SOCK_DC_VERSION sizeof (sock_downcalls_t) + +#define SOCKET_RECVHIWATER (48 * 1024) +#define SOCKET_RECVLOWATER 1024 + +#define SOCKET_NO_RCVTIMER 0 +#define SOCKET_TIMER_INTERVAL 50 + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SOCKET_PROTO_H_ */ diff --git a/usr/src/uts/common/sys/socketvar.h b/usr/src/uts/common/sys/socketvar.h index 37a699345a..510d9445cf 100644 --- a/usr/src/uts/common/sys/socketvar.h 
+++ b/usr/src/uts/common/sys/socketvar.h @@ -48,25 +48,18 @@ #include <sys/file.h> #include <sys/param.h> #include <sys/zone.h> +#include <sys/sdt.h> +#include <sys/modctl.h> +#include <sys/atomic.h> +#include <sys/socket.h> +#include <sys/ksocket.h> #include <sys/sodirect.h> -#include <inet/kssl/ksslapi.h> #ifdef __cplusplus extern "C" { #endif /* - * Internal representation used for addresses. - */ -struct soaddr { - struct sockaddr *soa_sa; /* Actual address */ - t_uscalar_t soa_len; /* Length in bytes for kmem_free */ - t_uscalar_t soa_maxlen; /* Allocated length */ -}; -/* Maximum size address for transports that have ADDR_size == 1 */ -#define SOA_DEFSIZE 128 - -/* * Internal representation of the address used to represent addresses * in the loopback transport for AF_UNIX. While the sockaddr_un is used * as the sockfs layer address for AF_UNIX the pathnames contained in @@ -97,6 +90,10 @@ struct sockaddr_ux { struct so_ux_addr sou_addr; }; +#if defined(_KERNEL) || defined(_KMEMUSER) + +#include <sys/socket_proto.h> + typedef struct sonodeops sonodeops_t; typedef struct sonode sonode_t; @@ -105,236 +102,149 @@ typedef struct sonode sonode_t; * name space and can not be opened using open() - only the socket, socketpair * and accept calls create sonodes. * - * When an AF_UNIX socket is bound to a pathname the sockfs - * creates a VSOCK vnode in the underlying file system. However, the vnodeops - * etc in this VNODE remain those of the underlying file system. - * Sockfs uses the v_stream pointer in the underlying file system VSOCK node - * to find the sonode bound to the pathname. The bound pathname vnode - * is accessed through so_ux_vp. - * - * A socket always corresponds to a VCHR stream representing the transport - * provider (e.g. /dev/tcp). This information is retrieved from the kernel - * socket configuration table and entered into so_accessvp. sockfs uses - * this to perform VOP_ACCESS checks before allowing an open of the transport - * provider. 
+ * The locking of sockfs uses the so_lock mutex plus the SOLOCKED and + * SOREADLOCKED flags in so_flag. The mutex protects all the state in the + * sonode. It is expected that the underlying transport protocol serializes + * socket operations, so sockfs will normally not single-thread + * operations. However, certain sockets, including TPI based ones, can only + * handle one control operation at a time. The SOLOCKED flag is used to + * single-thread operations from sockfs users to prevent e.g. multiple bind() + * calls to operate on the same sonode concurrently. The SOREADLOCKED flag is + * used to ensure that only one thread sleeps in kstrgetmsg for a given + * sonode. This is needed to ensure atomic operation for things like + * MSG_WAITALL. * - * The locking of sockfs uses the so_lock mutex plus the SOLOCKED - * and SOREADLOCKED flags in so_flag. The mutex protects all the state - * in the sonode. The SOLOCKED flag is used to single-thread operations from - * sockfs users to prevent e.g. multiple bind() calls to operate on the - * same sonode concurrently. The SOREADLOCKED flag is used to ensure that - * only one thread sleeps in kstrgetmsg for a given sonode. This is needed - * to ensure atomic operation for things like MSG_WAITALL. + * The so_fallback_rwlock is used to ensure that for sockets that can + * fall back to TPI, the fallback is not initiated until all pending + * operations have completed. * * Note that so_lock is sometimes held across calls that might go to sleep * (kmem_alloc and soallocproto*). This implies that no other lock in * the system should be held when calling into sockfs; from the system call - * side or from strrput. If locks are held while calling into sockfs - * the system might hang when running low on memory. + * side or from strrput (in case of TPI based sockets). If locks are held + * while calling into sockfs the system might hang when running low on memory. 
*/ struct sonode { struct vnode *so_vnode; /* vnode associated with this sonode */ - sonodeops_t *so_ops; /* operations vector for this sonode */ - - /* - * These fields are initialized once. - */ - dev_t so_dev; /* device the sonode represents */ - struct vnode *so_accessvp; /* vnode for the /dev entry */ + sonodeops_t *so_ops; /* operations vector for this sonode */ + void *so_priv; /* sonode private data */ - /* The locks themselves */ + krwlock_t so_fallback_rwlock; kmutex_t so_lock; /* protects sonode fields */ - kmutex_t so_plumb_lock; /* serializes plumbs, and the related */ - /* fields so_version and so_pushcnt */ + kcondvar_t so_state_cv; /* synchronize state changes */ - kcondvar_t so_ack_cv; /* wait for TPI acks */ - kcondvar_t so_connind_cv; /* wait for T_CONN_IND */ kcondvar_t so_want_cv; /* wait due to SOLOCKED */ /* These fields are protected by so_lock */ - uint_t so_state; /* internal state flags SS_*, below */ - uint_t so_mode; /* characteristics on socket. SM_* */ - mblk_t *so_ack_mp; /* TPI ack received from below */ - mblk_t *so_conn_ind_head; /* b_next list of T_CONN_IND */ - mblk_t *so_conn_ind_tail; - mblk_t *so_unbind_mp; /* Preallocated T_UNBIND_REQ message */ + uint_t so_state; /* internal state flags SS_*, below */ + uint_t so_mode; /* characteristics on socket. 
SM_* */ + ushort_t so_flag; /* flags, see below */ + int so_count; /* count of opened references */ + + sock_connid_t so_proto_connid; /* protocol generation number */ - ushort_t so_flag; /* flags, see below */ - dev_t so_fsid; /* file system identifier */ - time_t so_atime; /* time of last access */ - time_t so_mtime; /* time of last modification */ - time_t so_ctime; /* time of last attributes change */ - int so_count; /* count of opened references */ + ushort_t so_error; /* error affecting connection */ + struct sockparams *so_sockparams; /* vnode or socket module */ /* Needed to recreate the same socket for accept */ short so_family; short so_type; short so_protocol; short so_version; /* From so_socket call */ - short so_pushcnt; /* Number of modules above "sockmod" */ + + /* Accept queue */ + kmutex_t so_acceptq_lock; /* protects accept queue */ + struct sonode *so_acceptq_next; /* acceptq list node */ + struct sonode *so_acceptq_head; + struct sonode **so_acceptq_tail; + unsigned int so_acceptq_len; + unsigned int so_backlog; /* Listen backlog */ + kcondvar_t so_acceptq_cv; /* wait for new conn. 
*/ /* Options */ short so_options; /* From socket call, see socket.h */ struct linger so_linger; /* SO_LINGER value */ - int so_sndbuf; /* SO_SNDBUF value */ - int so_rcvbuf; /* SO_RCVBUF value */ - int so_sndlowat; /* send low water mark */ - int so_rcvlowat; /* receive low water mark */ -#ifdef notyet - int so_sndtimeo; /* Not yet implemented */ - int so_rcvtimeo; /* Not yet implemented */ -#endif /* notyet */ - ushort_t so_error; /* error affecting connection */ - ushort_t so_delayed_error; /* From T_uderror_ind */ - int so_backlog; /* Listen backlog */ +#define so_sndbuf so_proto_props.sopp_txhiwat /* SO_SNDBUF value */ +#define so_sndlowat so_proto_props.sopp_txlowat /* tx low water mark */ +#define so_rcvbuf so_proto_props.sopp_rxhiwat /* SO_RCVBUF value */ +#define so_rcvlowat so_proto_props.sopp_rxlowat /* rx low water mark */ +#define so_max_addr_len so_proto_props.sopp_maxaddrlen +#define so_minpsz so_proto_props.sopp_minpsz +#define so_maxpsz so_proto_props.sopp_maxpsz + + clock_t so_sndtimeo; /* send timeout */ + clock_t so_rcvtimeo; /* recv timeout */ - /* - * The counts (so_oobcnt and so_oobsigcnt) track the number of - * urgent indicates that are (logically) queued on the stream head - * read queue. The urgent data is queued on the stream head - * as follows. - * - * In the normal case the SIGURG is not generated until - * the T_EXDATA_IND arrives at the stream head. However, transports - * that have an early indication that urgent data is pending - * (e.g. TCP receiving a "new" urgent pointer value) can send up - * an M_PCPROTO/SIGURG message to generate the signal early. - * - * The mark is indicated by either: - * - a T_EXDATA_IND (with no M_DATA b_cont) with MSGMARK set. - * When this message is consumed by sorecvmsg the socket layer - * sets SS_RCVATMARK until data has been consumed past the mark. - * - a message with MSGMARKNEXT set (indicating that the - * first byte of the next message constitutes the mark). 
When - * the last byte of the MSGMARKNEXT message is consumed in - * the stream head the stream head sets STRATMARK. This flag - * is cleared when at least one byte is read. (Note that - * the MSGMARKNEXT messages can be of zero length when there - * is no previous data to which the marknext can be attached.) - * - * While the T_EXDATA_IND method is the common case which is used - * with all TPI transports, the MSGMARKNEXT method is needed to - * indicate the mark when e.g. the TCP urgent byte has not been - * received yet but the TCP urgent pointer has made TCP generate - * the M_PCSIG/SIGURG. - * - * The signal (the M_PCSIG carrying the SIGURG) and the mark - * indication can not be delivered as a single message, since - * the signal should be delivered as high priority and any mark - * indication must flow with the data. This implies that immediately - * when the SIGURG has been delivered if the stream head queue is - * empty it is impossible to determine if this will be the position - * of the mark. This race condition is resolved by using MSGNOTMARKNEXT - * messages and the STRNOTATMARK flag in the stream head. The - * SIOCATMARK code calls the stream head to wait for either a - * non-empty queue or one of the STR*ATMARK flags being set. - * This implies that any transport that is sending M_PCSIG(SIGURG) - * should send the appropriate MSGNOTMARKNEXT message (which can be - * zero length) after sending an M_PCSIG to prevent SIOCATMARK - * from sleeping unnecessarily. 
- */ mblk_t *so_oobmsg; /* outofline oob data */ - uint_t so_oobsigcnt; /* Number of SIGURG generated */ - uint_t so_oobcnt; /* Number of T_EXDATA_IND queued */ + ssize_t so_oobmark; /* offset of the oob data */ + pid_t so_pgrp; /* pgrp for signals */ - /* From T_info_ack */ - t_uscalar_t so_tsdu_size; - t_uscalar_t so_etsdu_size; - t_scalar_t so_addr_size; - t_uscalar_t so_opt_size; - t_uscalar_t so_tidu_size; - t_scalar_t so_serv_type; + cred_t *so_peercred; /* connected socket peer cred */ + pid_t so_cpid; /* connected socket peer cached pid */ + zoneid_t so_zoneid; /* opener's zoneid */ - /* From T_capability_ack */ - t_uscalar_t so_acceptor_id; + struct pollhead so_poll_list; /* common pollhead */ + short so_pollev; /* events that should be generated */ - /* Internal provider information */ - struct tpi_provinfo *so_provinfo; + /* Receive */ + unsigned int so_rcv_queued; + mblk_t *so_rcv_q_head; + mblk_t *so_rcv_q_last_head; + mblk_t *so_rcv_head; /* 1st mblk in the list */ + mblk_t *so_rcv_last_head; /* last mblk in b_next chain */ + kcondvar_t so_rcv_cv; + uint_t so_rcv_wanted; /* # of bytes wanted by app */ + timeout_id_t so_rcv_timer_tid; - /* - * The local and remote addresses have multiple purposes - * but one of the key reasons for their existence and careful - * tracking in sockfs is to support getsockname and getpeername - * when the transport does not handle the TI_GET*NAME ioctls - * and caching when it does (signaled by valid bits in so_state). - * When all transports support the new TPI (with T_ADDR_REQ) - * we can revisit this code. - * The other usage of so_faddr is to keep the "connected to" - * address for datagram sockets. - * Finally, for AF_UNIX both local and remote addresses are used - * to record the sockaddr_un since we use a separate namespace - * in the loopback transport. 
- */ - struct soaddr so_laddr; /* Local address */ - struct soaddr so_faddr; /* Peer address */ -#define so_laddr_sa so_laddr.soa_sa -#define so_faddr_sa so_faddr.soa_sa -#define so_laddr_len so_laddr.soa_len -#define so_faddr_len so_faddr.soa_len -#define so_laddr_maxlen so_laddr.soa_maxlen -#define so_faddr_maxlen so_faddr.soa_maxlen - mblk_t *so_eaddr_mp; /* for so_delayed_error */ +#define so_rcv_thresh so_proto_props.sopp_rcvthresh +#define so_rcv_timer_interval so_proto_props.sopp_rcvtimer - /* - * For AF_UNIX sockets: - * so_ux_laddr/faddr records the internal addresses used with the - * transport. - * so_ux_vp and v_stream->sd_vnode form the cross- - * linkage between the underlying fs vnode corresponding to - * the bound sockaddr_un and the socket node. - */ - struct so_ux_addr so_ux_laddr; /* laddr bound with the transport */ - struct so_ux_addr so_ux_faddr; /* temporary peer address */ - struct vnode *so_ux_bound_vp; /* bound AF_UNIX file system vnode */ - struct sonode *so_next; /* next sonode on socklist */ - struct sonode *so_prev; /* previous sonode on socklist */ - mblk_t *so_discon_ind_mp; /* T_DISCON_IND received from below */ - - /* put here for delayed processing */ - void *so_priv; /* sonode private data */ - cred_t *so_peercred; /* connected socket peer cred */ - pid_t so_cpid; /* connected socket peer cached pid */ - zoneid_t so_zoneid; /* opener's zoneid */ + /* Send */ + boolean_t so_snd_qfull; /* Transmit full */ + kcondvar_t so_snd_cv; - kmem_cache_t *so_cache; /* object cache of this "sonode". */ - void *so_obj; /* object to free */ + boolean_t so_rcv_wakeup; + boolean_t so_snd_wakeup; - /* - * For NL7C sockets: - * - * so_nl7c_flags the NL7C state of URL processing. - * - * so_nl7c_rcv_mp mblk_t chain of already received data to be - * passed up to the app after NL7C gives up on - * a socket. - * - * so_nl7c_rcv_rval returned rval for last mblk_t from above. - * - * so_nl7c_uri the URI currently being processed. 
- * - * so_nl7c_rtime URI request gethrestime_sec(). - * - * so_nl7c_addr pointer returned by nl7c_addr_lookup(). - */ - uint64_t so_nl7c_flags; - mblk_t *so_nl7c_rcv_mp; - int64_t so_nl7c_rcv_rval; - void *so_nl7c_uri; - time_t so_nl7c_rtime; - void *so_nl7c_addr; - - /* For sockets acting as an in-kernel SSL proxy */ - kssl_endpt_type_t so_kssl_type; /* is proxy/is proxied/none */ - kssl_ent_t so_kssl_ent; /* SSL config entry */ - kssl_ctx_t so_kssl_ctx; /* SSL session context */ + /* Communication channel with protocol */ + sock_lower_handle_t so_proto_handle; + sock_downcalls_t *so_downcalls; + + struct sock_proto_props so_proto_props; /* protocol settings */ + boolean_t so_flowctrld; /* Flow controlled */ + uint_t so_copyflag; /* Copy related flag */ + kcondvar_t so_copy_cv; /* Copy cond variable */ + + /* kernel sockets */ + ksocket_callbacks_t so_ksock_callbacks; + void *so_ksock_cb_arg; /* callback argument */ + kcondvar_t so_closing_cv; /* != NULL for sodirect_t enabled socket */ - sodirect_t *so_direct; + sodirect_t *so_direct; }; +/* + * We do an initial check for events without holding locks. However, + * if there are no event available, then we redo the check for POLLIN + * events under the lock. + */ +#define SO_HAVE_DATA(so) \ + ((so)->so_rcv_timer_tid == 0 && (so->so_rcv_queued > 0)) || \ + ((so)->so_rcv_queued > (so)->so_rcv_thresh) || \ + ((so)->so_state & SS_CANTRCVMORE) + +/* + * Events handled by the protocol (in case sd_poll is set) + */ +#define SO_PROTO_POLLEV (POLLIN|POLLRDNORM|POLLRDBAND) + + +#endif /* _KERNEL || _KMEMUSER */ + /* flags */ #define SOMOD 0x0001 /* update socket modification time */ #define SOACC 0x0002 /* update socket access time */ @@ -345,6 +255,8 @@ struct sonode { #define SOCLONE 0x0080 /* child of clone driver */ #define SOASYNC_UNBIND 0x0100 /* wait for ACK of async unbind */ +#define SOCK_IS_NONSTR(so) ((so)->so_vnode->v_stream == NULL) + /* * Socket state bits. 
*/ @@ -360,31 +272,59 @@ struct sonode { #define SS_ASYNC 0x00000100 /* async i/o notify */ #define SS_ACCEPTCONN 0x00000200 /* listen done */ -#define SS_HASCONNIND 0x00000400 /* T_CONN_IND for poll */ +/* unused 0x00000400 */ /* was SS_HASCONNIND */ #define SS_SAVEDEOR 0x00000800 /* Saved MSG_EOR rcv side state */ #define SS_RCVATMARK 0x00001000 /* at mark on input */ #define SS_OOBPEND 0x00002000 /* OOB pending or present - poll */ #define SS_HAVEOOBDATA 0x00004000 /* OOB data present */ #define SS_HADOOBDATA 0x00008000 /* OOB data consumed */ +#define SS_CLOSING 0x00010000 /* in process of closing */ -#define SS_FADDR_NOXLATE 0x00020000 /* No xlation of faddr for AF_UNIX */ - -#define SS_HASDATA 0x00040000 /* NCAfs: data available */ -#define SS_DONEREAD 0x00080000 /* NCAfs: all data read */ -#define SS_MOREDATA 0x00100000 /* NCAfs: NCA has more data */ +/* unused 0x00020000 */ /* was SS_FADDR_NOXLATE */ +/* unused 0x00040000 */ /* was SS_HASDATA */ +/* unused 0x00080000 */ /* was SS_DONEREAD */ +/* unused 0x00100000 */ /* was SS_MOREDATA */ +/* unused 0x00200000 */ /* was SS_DIRECT */ -#define SS_DIRECT 0x00200000 /* transport is directly below */ #define SS_SODIRECT 0x00400000 /* transport supports sodirect */ -#define SS_LADDR_VALID 0x01000000 /* so_laddr valid for user */ -#define SS_FADDR_VALID 0x02000000 /* so_faddr valid for user */ +/* unused 0x01000000 */ /* was SS_LADDR_VALID */ +/* unused 0x02000000 */ /* was SS_FADDR_VALID */ + +#define SS_SENTLASTREADSIG 0x10000000 /* last rx signal has been sent */ +#define SS_SENTLASTWRITESIG 0x20000000 /* last tx signal has been sent */ + +#define SS_FALLBACK_PENDING 0x40000000 +#define SS_FALLBACK_COMP 0x80000000 + /* Set of states when the socket can't be rebound */ #define SS_CANTREBIND (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING|\ SS_CANTSENDMORE|SS_CANTRCVMORE|SS_ACCEPTCONN) /* + * Sockets that can fall back to TPI must ensure that fall back is not + * initiated while a thread is using a socket. 
+ */ +#define SO_BLOCK_FALLBACK(so, fn) { \ + ASSERT(MUTEX_NOT_HELD(&(so)->so_lock)); \ + rw_enter(&(so)->so_fallback_rwlock, RW_READER); \ + if ((so)->so_state & SS_FALLBACK_COMP) { \ + rw_exit(&(so)->so_fallback_rwlock); \ + return (fn); \ + } \ +} + +#define SO_UNBLOCK_FALLBACK(so) { \ + rw_exit(&(so)->so_fallback_rwlock); \ +} + +/* Poll events */ +#define SO_POLLEV_IN 0x1 /* POLLIN wakeup needed */ +#define SO_POLLEV_ALWAYS 0x2 /* wakeups */ + +/* * Characteristics of sockets. Not changed after the socket is created. */ #define SM_PRIV 0x001 /* privileged for broadcast, raw... */ @@ -399,6 +339,10 @@ struct sonode { #define SM_ACCEPTOR_ID 0x100 /* so_acceptor_id is valid */ +#define SM_KERNEL 0x200 /* kernel socket */ + +#define SM_ACCEPTSUPP 0x400 /* can handle accept() */ + /* * Socket versions. Used by the socket library when calling _so_socket(). */ @@ -409,21 +353,177 @@ struct sonode { #define SOV_XPG4_2 4 /* Xnet socket */ #if defined(_KERNEL) || defined(_KMEMUSER) + +/* + * sonode create and destroy functions. + */ +typedef struct sonode *(*so_create_func_t)(struct sockparams *, + int, int, int, int, int, int *, cred_t *); +typedef void (*so_destroy_func_t)(struct sonode *); + +/* STREAM device information */ +typedef struct sdev_info { + char *sd_devpath; + int sd_devpathlen; /* Is 0 if sd_devpath is a static string */ + vnode_t *sd_vnode; +} sdev_info_t; + +#define SOCKMOD_VERSION 1 +/* name of the TPI pseudo socket module */ +#define SOTPI_SMOD_NAME "socktpi" + +typedef struct __smod_priv_s { + so_create_func_t smodp_sock_create_func; + so_destroy_func_t smodp_sock_destroy_func; + so_proto_fallback_func_t smodp_proto_fallback_func; +} __smod_priv_t; + /* - * Used for mapping family/type/protocol to vnode. - * Defined here so that crash can use it. 
+ * Socket module register information + */ +typedef struct smod_reg_s { + int smod_version; + char *smod_name; + size_t smod_uc_version; + size_t smod_dc_version; + so_proto_create_func_t smod_proto_create_func; + + /* __smod_priv must be NULL */ + __smod_priv_t *__smod_priv; +} smod_reg_t; + +/* + * Socket module information + */ +typedef struct smod_info { + int smod_version; + char *smod_name; + uint_t smod_refcnt; /* # of entries */ + size_t smod_uc_version; /* upcall version */ + size_t smod_dc_version; /* down call version */ + so_proto_create_func_t smod_proto_create_func; + so_proto_fallback_func_t smod_proto_fallback_func; + so_create_func_t smod_sock_create_func; + so_destroy_func_t smod_sock_destroy_func; + list_node_t smod_node; +} smod_info_t; + +/* + * sockparams + * + * Used for mapping family/type/protocol to module */ struct sockparams { - int sp_domain; - int sp_type; - int sp_protocol; - char *sp_devpath; - int sp_devpathlen; /* Is 0 if sp_devpath is a static string */ - vnode_t *sp_vnode; - struct sockparams *sp_next; + /* + * The family, type, protocol, sdev_info and smod_info are + * set when the entry is created, and they will never change + * thereafter. + */ + int sp_family; + int sp_type; + int sp_protocol; + + sdev_info_t sp_sdev_info; /* STREAM device */ + char *sp_smod_name; /* socket module name */ + smod_info_t *sp_smod_info; /* socket module */ + + kmutex_t sp_lock; /* lock for refcnt */ + uint64_t sp_refcnt; /* entry reference count */ + + /* + * The entries below are only modified while holding + * splist_lock as a writer. + */ + int sp_flags; /* see below */ + list_node_t sp_node; }; -extern struct sockparams *sphead; + +/* + * sockparams flags + */ +#define SOCKPARAMS_EPHEMERAL 0x1 /* temp.
entry, not on global list */ + +extern void sockparams_init(void); +extern struct sockparams *sockparams_hold_ephemeral_bydev(int, int, int, + const char *, int, int *); +extern struct sockparams *sockparams_hold_ephemeral_bymod(int, int, int, + const char *, int, int *); +extern void sockparams_ephemeral_drop_last_ref(struct sockparams *); + +extern void smod_init(void); +extern void smod_add(smod_info_t *); +extern int smod_register(const smod_reg_t *); +extern int smod_unregister(const char *); +extern smod_info_t *smod_lookup_byname(const char *); + +#define SOCKPARAMS_HAS_DEVICE(sp) \ + ((sp)->sp_sdev_info.sd_devpath != NULL) + +/* Increase the smod_info_t reference count */ +#define SMOD_INC_REF(smodp) { \ + ASSERT((smodp) != NULL); \ + DTRACE_PROBE1(smodinfo__inc__ref, struct smod_info *, (smodp)); \ + atomic_inc_uint(&(smodp)->smod_refcnt); \ +} + +/* + * Decrease the socket module entry reference count. + * When no one is mapping to the entry, we try to unload the module from the + * kernel. If the module can't unload, just leave the module entry with + * a zero refcnt. + */ +#define SMOD_DEC_REF(sp, smodp) { \ + ASSERT((smodp) != NULL); \ + ASSERT((smodp)->smod_refcnt != 0); \ + atomic_dec_uint(&(smodp)->smod_refcnt); \ + /* \ + * No need to atomically check the return value because the \ + * socket module framework will verify that no one is using \ + * the module before unloading. Worst thing that can happen \ + * here is multiple calls to mod_remove_by_name(), which is OK. \ + */ \ + if ((smodp)->smod_refcnt == 0) \ + (void) mod_remove_by_name((sp)->sp_smod_name); \ +} + +/* Increase the reference count */ +#define SOCKPARAMS_INC_REF(sp) { \ + ASSERT((sp) != NULL); \ + DTRACE_PROBE1(sockparams__inc__ref, struct sockparams *, (sp)); \ + mutex_enter(&(sp)->sp_lock); \ + (sp)->sp_refcnt++; \ + ASSERT((sp)->sp_refcnt != 0); \ + mutex_exit(&(sp)->sp_lock); \ +} + +/* + * Decrease the reference count.
+ * + * If the sockparams is ephemeral, then the thread dropping the last ref + * count will destroy the entry. + */ +#define SOCKPARAMS_DEC_REF(sp) { \ + ASSERT((sp) != NULL); \ + DTRACE_PROBE1(sockparams__dec__ref, struct sockparams *, (sp)); \ + mutex_enter(&(sp)->sp_lock); \ + ASSERT((sp)->sp_refcnt > 0); \ + if ((sp)->sp_refcnt == 1) { \ + if ((sp)->sp_flags & SOCKPARAMS_EPHEMERAL) { \ + mutex_exit(&(sp)->sp_lock); \ + sockparams_ephemeral_drop_last_ref((sp)); \ + } else { \ + (sp)->sp_refcnt--; \ + if ((sp)->sp_smod_info != NULL) \ + SMOD_DEC_REF(sp, (sp)->sp_smod_info); \ + (sp)->sp_smod_info = NULL; \ + mutex_exit(&(sp)->sp_lock); \ + } \ + } else { \ + (sp)->sp_refcnt--; \ + mutex_exit(&(sp)->sp_lock); \ + } \ +} /* * Used to traverse the list of AF_UNIX sockets to construct the kstat @@ -490,49 +590,71 @@ struct sendfile_queue { /* Socket network operations switch */ struct sonodeops { - int (*sop_accept)(struct sonode *, int, struct sonode **); - int (*sop_bind)(struct sonode *, struct sockaddr *, socklen_t, + int (*sop_init)(struct sonode *, struct sonode *, cred_t *, int); - int (*sop_listen)(struct sonode *, int); + int (*sop_accept)(struct sonode *, int, cred_t *, struct sonode **); + int (*sop_bind)(struct sonode *, struct sockaddr *, socklen_t, + int, cred_t *); + int (*sop_listen)(struct sonode *, int, cred_t *); int (*sop_connect)(struct sonode *, const struct sockaddr *, - socklen_t, int, int); + socklen_t, int, int, cred_t *); int (*sop_recvmsg)(struct sonode *, struct msghdr *, - struct uio *); + struct uio *, cred_t *); int (*sop_sendmsg)(struct sonode *, struct msghdr *, - struct uio *); - int (*sop_getpeername)(struct sonode *); - int (*sop_getsockname)(struct sonode *); - int (*sop_shutdown)(struct sonode *, int); + struct uio *, cred_t *); + int (*sop_sendmblk)(struct sonode *, struct msghdr *, int, + cred_t *, mblk_t **); + int (*sop_getpeername)(struct sonode *, struct sockaddr *, + socklen_t *, boolean_t, cred_t *); + int 
(*sop_getsockname)(struct sonode *, struct sockaddr *, + socklen_t *, cred_t *); + int (*sop_shutdown)(struct sonode *, int, cred_t *); int (*sop_getsockopt)(struct sonode *, int, int, void *, - socklen_t *, int); + socklen_t *, int, cred_t *); int (*sop_setsockopt)(struct sonode *, int, int, const void *, - socklen_t); + socklen_t, cred_t *); + int (*sop_ioctl)(struct sonode *, int, intptr_t, int, + cred_t *, int32_t *); + int (*sop_poll)(struct sonode *, short, int, short *, + struct pollhead **); + int (*sop_close)(struct sonode *, int, cred_t *); }; -#define SOP_ACCEPT(so, fflag, nsop) \ - ((so)->so_ops->sop_accept((so), (fflag), (nsop))) -#define SOP_BIND(so, name, namelen, flags) \ - ((so)->so_ops->sop_bind((so), (name), (namelen), (flags))) -#define SOP_LISTEN(so, backlog) \ - ((so)->so_ops->sop_listen((so), (backlog))) -#define SOP_CONNECT(so, name, namelen, fflag, flags) \ - ((so)->so_ops->sop_connect((so), (name), (namelen), (fflag), (flags))) -#define SOP_RECVMSG(so, msg, uiop) \ - ((so)->so_ops->sop_recvmsg((so), (msg), (uiop))) -#define SOP_SENDMSG(so, msg, uiop) \ - ((so)->so_ops->sop_sendmsg((so), (msg), (uiop))) -#define SOP_GETPEERNAME(so) \ - ((so)->so_ops->sop_getpeername((so))) -#define SOP_GETSOCKNAME(so) \ - ((so)->so_ops->sop_getsockname((so))) -#define SOP_SHUTDOWN(so, how) \ - ((so)->so_ops->sop_shutdown((so), (how))) -#define SOP_GETSOCKOPT(so, level, optionname, optval, optlenp, flags) \ +#define SOP_INIT(so, flag, cr, flags) \ + ((so)->so_ops->sop_init((so), (flag), (cr), (flags))) +#define SOP_ACCEPT(so, fflag, cr, nsop) \ + ((so)->so_ops->sop_accept((so), (fflag), (cr), (nsop))) +#define SOP_BIND(so, name, namelen, flags, cr) \ + ((so)->so_ops->sop_bind((so), (name), (namelen), (flags), (cr))) +#define SOP_LISTEN(so, backlog, cr) \ + ((so)->so_ops->sop_listen((so), (backlog), (cr))) +#define SOP_CONNECT(so, name, namelen, fflag, flags, cr) \ + ((so)->so_ops->sop_connect((so), (name), (namelen), (fflag), (flags), \ + (cr))) +#define 
SOP_RECVMSG(so, msg, uiop, cr) \ + ((so)->so_ops->sop_recvmsg((so), (msg), (uiop), (cr))) +#define SOP_SENDMSG(so, msg, uiop, cr) \ + ((so)->so_ops->sop_sendmsg((so), (msg), (uiop), (cr))) +#define SOP_SENDMBLK(so, msg, size, cr, mpp) \ + ((so)->so_ops->sop_sendmblk((so), (msg), (size), (cr), (mpp))) +#define SOP_GETPEERNAME(so, addr, addrlen, accept, cr) \ + ((so)->so_ops->sop_getpeername((so), (addr), (addrlen), (accept), (cr))) +#define SOP_GETSOCKNAME(so, addr, addrlen, cr) \ + ((so)->so_ops->sop_getsockname((so), (addr), (addrlen), (cr))) +#define SOP_SHUTDOWN(so, how, cr) \ + ((so)->so_ops->sop_shutdown((so), (how), (cr))) +#define SOP_GETSOCKOPT(so, level, optionname, optval, optlenp, flags, cr) \ ((so)->so_ops->sop_getsockopt((so), (level), (optionname), \ - (optval), (optlenp), (flags))) -#define SOP_SETSOCKOPT(so, level, optionname, optval, optlen) \ + (optval), (optlenp), (flags), (cr))) +#define SOP_SETSOCKOPT(so, level, optionname, optval, optlen, cr) \ ((so)->so_ops->sop_setsockopt((so), (level), (optionname), \ - (optval), (optlen))) + (optval), (optlen), (cr))) +#define SOP_IOCTL(so, cmd, arg, mode, cr, rvalp) \ + ((so)->so_ops->sop_ioctl((so), (cmd), (arg), (mode), (cr), (rvalp))) +#define SOP_POLL(so, events, anyyet, reventsp, phpp) \ + ((so)->so_ops->sop_poll((so), (events), (anyyet), (reventsp), (phpp))) +#define SOP_CLOSE(so, flag, cr) \ + ((so)->so_ops->sop_close((so), (flag), (cr))) #endif /* defined(_KERNEL) || defined(_KMEMUSER) */ @@ -544,6 +666,8 @@ struct sonodeops { #define ROUNDUP_cmsglen(len) \ (((len) + _CMSG_HDR_ALIGNMENT - 1) & ~(_CMSG_HDR_ALIGNMENT - 1)) +#define IS_NON_STREAM_SOCK(vp) \ + ((vp)->v_type == VSOCK && (vp)->v_stream == NULL) /* * Macros that operate on struct cmsghdr. * Used in parsing msg_control. 
@@ -686,10 +810,8 @@ extern int sockprinterr; #endif /* defined(DEBUG) */ extern struct vfsops sock_vfsops; -extern struct vnodeops *socktpi_vnodeops; -extern const struct fs_operation_def socktpi_vnodeops_template[]; - -extern sonodeops_t sotpi_sonodeops; +extern struct vnodeops *socket_vnodeops; +extern const struct fs_operation_def socket_vnodeops_template[]; extern dev_t sockdev; @@ -700,20 +822,10 @@ extern int sock_getmsg(vnode_t *, struct strbuf *, struct strbuf *, uchar_t *, int *, int, rval_t *); extern int sock_putmsg(vnode_t *, struct strbuf *, struct strbuf *, uchar_t, int, int); -struct sonode *sotpi_create(vnode_t *, int, int, int, int, struct sonode *, - int *); -extern int socktpi_open(struct vnode **, int, struct cred *, - caller_context_t *); -extern int so_sock2stream(struct sonode *); -extern void so_stream2sock(struct sonode *); +extern int sogetvp(char *, vnode_t **, int); extern int sockinit(int, char *); -extern struct vnode - *makesockvp(struct vnode *, int, int, int); -extern void sockfree(struct sonode *); -extern void so_update_attrs(struct sonode *, int); -extern int soconfig(int, int, int, char *, int); -extern struct vnode - *solookup(int, int, int, char *, int *); +extern int soconfig(int, int, int, char *, int, char *); +extern int solookup(int, int, int, struct sockparams **); extern void so_lock_single(struct sonode *); extern void so_unlock_single(struct sonode *, int); extern int so_lock_read(struct sonode *, int); @@ -723,10 +835,6 @@ extern void *sogetoff(mblk_t *, t_uscalar_t, t_uscalar_t, uint_t); extern void so_getopt_srcaddr(void *, t_uscalar_t, void **, t_uscalar_t *); extern int so_getopt_unix_close(void *, t_uscalar_t); -extern int so_addr_verify(struct sonode *, const struct sockaddr *, - socklen_t); -extern int so_ux_addr_xlate(struct sonode *, struct sockaddr *, - socklen_t, int, void **, socklen_t *); extern void fdbuf_free(struct fdbuf *); extern mblk_t *fdbuf_allocmsg(int, struct fdbuf *); extern int 
fdbuf_create(void *, int, struct fdbuf **); @@ -744,55 +852,13 @@ extern void soisdisconnected(struct sonode *, int); extern void socantsendmore(struct sonode *); extern void socantrcvmore(struct sonode *); extern void soseterror(struct sonode *, int); -extern int sogeterr(struct sonode *); -extern int sogetrderr(vnode_t *, int, int *); -extern int sogetwrerr(vnode_t *, int, int *); -extern void so_unix_close(struct sonode *); -extern mblk_t *soallocproto(size_t, int); -extern mblk_t *soallocproto1(const void *, ssize_t, ssize_t, int); -extern void soappendmsg(mblk_t *, const void *, ssize_t); -extern mblk_t *soallocproto2(const void *, ssize_t, const void *, ssize_t, - ssize_t, int); -extern mblk_t *soallocproto3(const void *, ssize_t, const void *, ssize_t, - const void *, ssize_t, ssize_t, int); -extern int sowaitprim(struct sonode *, t_scalar_t, t_scalar_t, - t_uscalar_t, mblk_t **, clock_t); -extern int sowaitokack(struct sonode *, t_scalar_t); -extern int sowaitack(struct sonode *, mblk_t **, clock_t); -extern void soqueueack(struct sonode *, mblk_t *); -extern int sowaitconnind(struct sonode *, int, mblk_t **); -extern void soqueueconnind(struct sonode *, mblk_t *); -extern int soflushconnind(struct sonode *, t_scalar_t); -extern void so_drain_discon_ind(struct sonode *); -extern void so_flush_discon_ind(struct sonode *); +extern int sogeterr(struct sonode *, boolean_t); extern int sowaitconnected(struct sonode *, int, int); -extern int sostream_direct(struct sonode *, struct uio *, - mblk_t *, cred_t *); -extern int sosend_dgram(struct sonode *, struct sockaddr *, - socklen_t, struct uio *, int); -extern int sosend_svc(struct sonode *, struct uio *, t_scalar_t, int, int); -extern void so_installhooks(struct sonode *); -extern int so_strinit(struct sonode *, struct sonode *); -extern int sotpi_recvmsg(struct sonode *, struct nmsghdr *, - struct uio *); -extern int sotpi_getpeername(struct sonode *); -extern int sotpi_getsockopt(struct sonode *, int, int, 
void *, - socklen_t *, int); -extern int sotpi_setsockopt(struct sonode *, int, int, const void *, - socklen_t); -extern int socktpi_ioctl(struct vnode *, int, intptr_t, int, - struct cred *, int *, caller_context_t *); -extern int sodisconnect(struct sonode *, t_scalar_t, int); extern ssize_t soreadfile(file_t *, uchar_t *, u_offset_t, int *, size_t); -extern int so_set_asyncsigs(vnode_t *, pid_t, int, int, cred_t *); -extern int so_set_events(struct sonode *, vnode_t *, cred_t *); -extern int so_flip_async(struct sonode *, vnode_t *, int, cred_t *); -extern int so_set_siggrp(struct sonode *, vnode_t *, pid_t, int, cred_t *); extern void *sock_kstat_init(zoneid_t); extern void sock_kstat_fini(zoneid_t, void *); extern struct sonode *getsonode(int, int *, file_t **); - /* * Function wrappers (mostly around the sonode switch) for * backward compatibility. @@ -805,44 +871,18 @@ extern int soconnect(struct sonode *, const struct sockaddr *, socklen_t, int, int); extern int sorecvmsg(struct sonode *, struct nmsghdr *, struct uio *); extern int sosendmsg(struct sonode *, struct nmsghdr *, struct uio *); -extern int sogetpeername(struct sonode *); -extern int sogetsockname(struct sonode *); extern int soshutdown(struct sonode *, int); extern int sogetsockopt(struct sonode *, int, int, void *, socklen_t *, int); extern int sosetsockopt(struct sonode *, int, int, const void *, t_uscalar_t); -extern struct sonode *socreate(vnode_t *, int, int, int, int, - struct sonode *, int *); +extern struct sonode *socreate(struct sockparams *, int, int, int, int, + int *); extern int so_copyin(const void *, void *, size_t, int); extern int so_copyout(const void *, void *, size_t, int); -extern int socktpi_access(struct vnode *, int, int, struct cred *, - caller_context_t *); -extern int socktpi_fid(struct vnode *, struct fid *, caller_context_t *); -extern int socktpi_fsync(struct vnode *, int, struct cred *, - caller_context_t *); -extern int socktpi_getattr(struct vnode *, struct 
vattr *, int, - struct cred *, caller_context_t *); -extern int socktpi_seek(struct vnode *, offset_t, offset_t *, - caller_context_t *); -extern int socktpi_setattr(struct vnode *, struct vattr *, int, - struct cred *, caller_context_t *); -extern int socktpi_setfl(vnode_t *, int, int, cred_t *, - caller_context_t *); - -/* SCTP sockfs */ -extern struct sonode *sosctp_create(vnode_t *, int, int, int, int, - struct sonode *, int *); -extern int sosctp_init(void); - -/* SDP sockfs */ -extern struct sonode *sosdp_create(vnode_t *, int, int, int, int, - struct sonode *, int *); -extern int sosdp_init(void); - #endif /* @@ -865,9 +905,11 @@ struct sockinfo { uint16_t si_faddr_family; char si_laddr_sun_path[MAXPATHLEN + 1]; /* NULL terminated */ char si_faddr_sun_path[MAXPATHLEN + 1]; + boolean_t si_faddr_noxlate; zoneid_t si_szoneid; }; +#define SOCKMOD_PATH "socketmod" /* dir where sockmods are stored */ #ifdef __cplusplus } diff --git a/usr/src/uts/common/sys/sockio.h b/usr/src/uts/common/sys/sockio.h index 012e7f3061..9e107ff3ef 100644 --- a/usr/src/uts/common/sys/sockio.h +++ b/usr/src/uts/common/sys/sockio.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -39,8 +39,6 @@ #ifndef _SYS_SOCKIO_H #define _SYS_SOCKIO_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * General socket ioctl definitions. 
*/ @@ -316,7 +314,9 @@ extern "C" { #define SIOCSIPMPFAILBACK _IOW('i', 182, int) /* enable/disable */ /* FAILBACK */ -#define SIOCSENABLESDP _IOWR('i', 183, int) /* Enable SDP */ +#define SIOCSENABLESDP _IOWR('i', 183, int) /* Enable SDP */ + +#define SIOCSQPTR _IOWR('i', 184, int) /* set q_ptr of stream */ #ifdef __cplusplus } diff --git a/usr/src/uts/common/sys/sodirect.h b/usr/src/uts/common/sys/sodirect.h index c8acfcea44..f87d010f56 100644 --- a/usr/src/uts/common/sys/sodirect.h +++ b/usr/src/uts/common/sys/sodirect.h @@ -52,12 +52,15 @@ extern "C" { #endif +typedef int (*sod_enq_func)(); +typedef void (*sod_wakeup_func)(); + typedef struct sodirect_s { uint32_t sod_state; /* State bits */ uint32_t sod_want; /* Pending read byte count or 0 */ queue_t *sod_q; /* Socket Q */ - int (*sod_enqueue)(); /* Call to enqueue an mblk_t */ - void (*sod_wakeup)(); /* Call to awkake a read()er, if any */ + sod_enq_func sod_enqueue; /* Call to enqueue an mblk_t */ + sod_wakeup_func sod_wakeup; /* Call to awkake a read()er, if any */ mblk_t *sod_uioafh; /* To be freed list head, or NULL */ mblk_t *sod_uioaft; /* To be freed list tail */ kmutex_t *sod_lockp; /* Pointer to the lock needed */ @@ -107,10 +110,36 @@ typedef struct sodirect_s { #define SOD_QFULL(p) ((p)->sod_q->q_flag & QFULL) #define SOD_QCNT(p) ((p)->sod_q->q_count) -#define SOD_DISABLE(p) (p)->sod_state &= ~SOD_ENABLED +#define SOD_DISABLE(p) { \ + if ((p) != NULL) \ + (p)->sod_state &= ~SOD_ENABLED; \ +} #define SOD_QTOSODP(q) (q)->q_stream->sd_sodirect +#define SOD_SOTOSODP(so) ((sonode_t *)so)->so_direct + +#define SOD_UIOAFINI(sodp) { \ + if ((sodp) && (sodp)->sod_uioa.uioa_state & UIOA_ENABLED) { \ + (sodp)->sod_uioa.uioa_state &= UIOA_CLR; \ + (sodp)->sod_uioa.uioa_state |= UIOA_FINI; \ + } \ +} + +struct sonode; +struct sodirect_s; + +extern uio_t *sod_rcv_init(struct sonode *, int, struct uio **); +extern int sod_rcv_done(struct sonode *, struct uio *, struct uio *); + +extern mblk_t 
*sod_uioa_mblk_init(struct sodirect_s *, mblk_t *, size_t); +extern void sod_uioa_so_init(struct sonode *, struct sodirect_s *, + struct uio *); +extern ssize_t sod_uioa_mblk(struct sonode *, mblk_t *); +extern void sod_uioa_mblk_done(struct sodirect_s *, mblk_t *); +extern void sod_init(); +extern void sod_sock_init(struct sonode *, struct stdata *, sod_enq_func, + sod_wakeup_func, kmutex_t *); #ifdef __cplusplus } diff --git a/usr/src/uts/common/sys/squeue.h b/usr/src/uts/common/sys/squeue.h index ec09b3a88b..e14ded203a 100644 --- a/usr/src/uts/common/sys/squeue.h +++ b/usr/src/uts/common/sys/squeue.h @@ -85,6 +85,9 @@ extern void squeue_enter(squeue_t *, mblk_t *, mblk_t *, uint32_t, int, uint8_t); extern uintptr_t *squeue_getprivate(squeue_t *, sqprivate_t); +extern int squeue_synch_enter(squeue_t *, void *, uint8_t); +extern void squeue_synch_exit(squeue_t *, void *); + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/squeue_impl.h b/usr/src/uts/common/sys/squeue_impl.h index 501377e53f..bd934cc0b3 100644 --- a/usr/src/uts/common/sys/squeue_impl.h +++ b/usr/src/uts/common/sys/squeue_impl.h @@ -102,6 +102,7 @@ struct squeue_s { clock_t sq_curr_time; /* Current tick (lbolt) */ kcondvar_t sq_worker_cv; /* cond var. worker thread blocks on */ kcondvar_t sq_poll_cv; /* cond variable poll_thr waits on */ + kcondvar_t sq_synch_cv; /* cond var. 
synch thread waits on */ kcondvar_t sq_ctrlop_done_cv; /* cond variable for ctrl ops */ clock_t sq_wait; /* lbolts to wait after a fill() */ timeout_id_t sq_tid; /* timer id of pending timeout() */ @@ -163,6 +164,7 @@ struct squeue_s { #define SQS_POLL_RESTART_DONE 0x01000000 #define SQS_POLL_THR_QUIESCE 0x02000000 +#define SQS_PAUSE 0x04000000 /* The squeue has been paused */ #define SQS_WORKER_THR_CONTROL \ (SQS_POLL_QUIESCE | SQS_POLL_RESTART | SQS_POLL_CLEANUP) diff --git a/usr/src/uts/common/sys/stream.h b/usr/src/uts/common/sys/stream.h index 41097cab7f..8d1ac458df 100644 --- a/usr/src/uts/common/sys/stream.h +++ b/usr/src/uts/common/sys/stream.h @@ -425,6 +425,7 @@ typedef struct bcache { #define MSGMARKNEXT 0x10 /* Private: first byte of next msg marked */ #define MSGNOTMARKNEXT 0x20 /* Private: ... not marked */ #define MSGHASREF 0x40 /* Private: message has reference to owner */ +#define MSGWAITSYNC 0x80 /* Private: waiting for sync squeue enter */ /* * Streams message types. diff --git a/usr/src/uts/common/sys/strsubr.h b/usr/src/uts/common/sys/strsubr.h index 04c778feaa..33ec38cac5 100644 --- a/usr/src/uts/common/sys/strsubr.h +++ b/usr/src/uts/common/sys/strsubr.h @@ -1126,7 +1126,6 @@ extern void strclean(struct vnode *); extern void str_cn_clean(); /* XXX hook for consoles signal cleanup */ extern int strwrite(struct vnode *, struct uio *, cred_t *); extern int strwrite_common(struct vnode *, struct uio *, cred_t *, int); -extern int kstrwritemp(struct vnode *, mblk_t *, ushort_t); extern int strread(struct vnode *, struct uio *, cred_t *); extern int strioctl(struct vnode *, int, intptr_t, int, int, cred_t *, int *); extern int strrput(queue_t *, mblk_t *); @@ -1151,6 +1150,7 @@ extern int strcopyout(void *, void *, size_t, int); extern void strsignal(struct stdata *, int, int32_t); extern clock_t str_cv_wait(kcondvar_t *, kmutex_t *, clock_t, int); extern void disable_svc(queue_t *); +extern void enable_svc(queue_t *); extern void 
remove_runlist(queue_t *); extern void wait_svc(queue_t *); extern void backenable(queue_t *, uchar_t); @@ -1212,6 +1212,7 @@ extern mblk_t *allocb_cred_wait(size_t, uint_t, int *, cred_t *); extern mblk_t *allocb_tmpl(size_t, const mblk_t *); extern mblk_t *allocb_tryhard(size_t); extern void mblk_setcred(mblk_t *, cred_t *); +extern void msg_setcredpid(mblk_t *, cred_t *, pid_t); extern void strpollwakeup(vnode_t *, short); extern int putnextctl_wait(queue_t *, int); diff --git a/usr/src/uts/common/syscall/sendfile.c b/usr/src/uts/common/syscall/sendfile.c index 16ada25629..13b480a304 100644 --- a/usr/src/uts/common/syscall/sendfile.c +++ b/usr/src/uts/common/syscall/sendfile.c @@ -53,6 +53,8 @@ #include <sys/socket.h> #include <sys/socketvar.h> +#include <fs/sockfs/sockcommon.h> +#include <fs/sockfs/socktpi.h> #include <netinet/in.h> #include <sys/sendfile.h> @@ -71,103 +73,11 @@ extern int nl7c_sendfilev(struct sonode *, u_offset_t *, struct sendfilevec *, int, ssize_t *); extern int snf_segmap(file_t *, vnode_t *, u_offset_t, u_offset_t, ssize_t *, boolean_t); +extern sotpi_info_t *sotpi_sototpi(struct sonode *); #define readflg (V_WRITELOCK_FALSE) #define rwflag (V_WRITELOCK_TRUE) -/* - * kstrwritemp() has very similar semantics as that of strwrite(). - * The main difference is it obtains mblks from the caller and also - * does not do any copy as done in strwrite() from user buffers to - * kernel buffers. - * - * Currently, this routine is used by sendfile to send data allocated - * within the kernel without any copying. This interface does not use the - * synchronous stream interface as synch. stream interface implies - * copying. 
- */ -int -kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode) -{ - struct stdata *stp; - struct queue *wqp; - mblk_t *newmp; - char waitflag; - int tempmode; - int error = 0; - int done = 0; - struct sonode *so; - boolean_t direct; - - ASSERT(vp->v_stream); - stp = vp->v_stream; - - so = VTOSO(vp); - direct = (so->so_state & SS_DIRECT); - - /* - * This is the sockfs direct fast path. canputnext() need - * not be accurate so we don't grab the sd_lock here. If - * we get flow-controlled, we grab sd_lock just before the - * do..while loop below to emulate what strwrite() does. - */ - wqp = stp->sd_wrq; - if (canputnext(wqp) && direct && - !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) { - return (sostream_direct(so, NULL, mp, CRED())); - } else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) { - /* Fast check of flags before acquiring the lock */ - mutex_enter(&stp->sd_lock); - error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0); - mutex_exit(&stp->sd_lock); - if (error != 0) { - if (!(stp->sd_flag & STPLEX) && - (stp->sd_wput_opt & SW_SIGPIPE)) { - tsignal(curthread, SIGPIPE); - error = EPIPE; - } - return (error); - } - } - - waitflag = WRITEWAIT; - if (stp->sd_flag & OLDNDELAY) - tempmode = fmode & ~FNDELAY; - else - tempmode = fmode; - - mutex_enter(&stp->sd_lock); - do { - if (canputnext(wqp)) { - mutex_exit(&stp->sd_lock); - if (stp->sd_wputdatafunc != NULL) { - newmp = (stp->sd_wputdatafunc)(vp, mp, NULL, - NULL, NULL, NULL); - if (newmp == NULL) { - /* The caller will free mp */ - return (ECOMM); - } - mp = newmp; - } - putnext(wqp, mp); - return (0); - } - error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1, - &done); - } while (error == 0 && !done); - - mutex_exit(&stp->sd_lock); - /* - * EAGAIN tells the application to try again. ENOMEM - * is returned only if the memory allocation size - * exceeds the physical limits of the system. ENOMEM - * can't be true here. 
- */ - if (error == ENOMEM) - error = EAGAIN; - return (error); -} - #define SEND_MAX_CHUNK 16 #if defined(_SYSCALL32_IMPL) || defined(_ILP32) @@ -510,6 +420,7 @@ sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, size_t size = total_size; size_t extra; int tail_len; + struct nmsghdr msg; fflag = fp->f_flag; vp = fp->f_vnode; @@ -521,8 +432,17 @@ sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, if (total_size == 0) return (0); - wroff = (int)vp->v_stream->sd_wroff; - tail_len = (int)vp->v_stream->sd_tail; + if (vp->v_stream != NULL) { + wroff = (int)vp->v_stream->sd_wroff; + tail_len = (int)vp->v_stream->sd_tail; + } else { + struct sonode *so; + + so = VTOSO(vp); + wroff = so->so_proto_props.sopp_wroff; + tail_len = so->so_proto_props.sopp_tail; + } + extra = wroff + tail_len; buf_left = MIN(total_size, maxblk); @@ -530,6 +450,7 @@ sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, if (head == NULL) return (ENOMEM); head->b_wptr = head->b_rptr = head->b_rptr + wroff; + bzero(&msg, sizeof (msg)); auio.uio_extflg = UIO_COPY_DEFAULT; for (i = 0; i < copy_cnt; i++) { @@ -738,9 +659,10 @@ sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, } ASSERT(total_size == 0); - error = kstrwritemp(vp, head, fflag); + error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &head); if (error != 0) { - freemsg(head); + if (head != NULL) + freemsg(head); return (error); } ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size; @@ -776,19 +698,28 @@ sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, int maxblk, wroff, tail_len; struct sonode *so; stdata_t *stp; + struct nmsghdr msg; fflag = fp->f_flag; vp = fp->f_vnode; if (vp->v_type == VSOCK) { so = VTOSO(vp); - stp = vp->v_stream; - wroff = (int)stp->sd_wroff; - tail_len = (int)stp->sd_tail; - maxblk = (int)stp->sd_maxblk; + if (vp->v_stream != NULL) { + stp = vp->v_stream; + wroff = (int)stp->sd_wroff; + 
tail_len = (int)stp->sd_tail; + maxblk = (int)stp->sd_maxblk; + } else { + stp = NULL; + wroff = so->so_proto_props.sopp_wroff; + tail_len = so->so_proto_props.sopp_tail; + maxblk = so->so_proto_props.sopp_maxblk; + } extra = wroff + tail_len; } + bzero(&msg, sizeof (msg)); auio.uio_extflg = UIO_COPY_DEFAULT; for (i = 0; i < copy_cnt; i++) { if (ISSIG(curthread, JUSTLOOKING)) @@ -841,7 +772,8 @@ sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, size_t iov_len; iov_len = sfv_len; - if (so->so_kssl_ctx != NULL) + if (!SOCK_IS_NONSTR(so) && + SOTOTPI(so)->sti_kssl_ctx != NULL) iov_len = MIN(iov_len, maxblk); aiov.iov_len = iov_len; @@ -868,9 +800,12 @@ sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, return (error); } dmp->b_wptr += iov_len; - error = kstrwritemp(vp, dmp, fflag); + error = socket_sendmblk(VTOSO(vp), + &msg, fflag, CRED(), &dmp); + if (error != 0) { - freeb(dmp); + if (dmp != NULL) + freeb(dmp); return (error); } ttolwp(curthread)->lwp_ru.ioch += @@ -880,6 +815,9 @@ sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, sfv_off += iov_len; } } else { + ttolwp(curthread)->lwp_ru.ioch += + (ulong_t)sfv_len; + *count += sfv_len; aiov.iov_len = sfv_len; aiov.iov_base = (caddr_t)(uintptr_t)sfv_off; @@ -971,25 +909,30 @@ sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, return (ENOMEM); } } else { + uint_t copyflag; + + copyflag = stp != NULL ? stp->sd_copyflag : + so->so_proto_props.sopp_zcopyflag; /* * For sockets acting as an SSL proxy, we * need to adjust the size to the maximum * SSL record size set in the stream head. 
*/ - if (so->so_kssl_ctx != NULL) + if (!SOCK_IS_NONSTR(so) && + _SOTOTPI(so)->sti_kssl_ctx != NULL) size = MIN(size, maxblk); if (vn_has_flocks(readvp) || readvp->v_flag & VNOMAP || - stp->sd_copyflag & STZCVMUNSAFE) { + copyflag & STZCVMUNSAFE) { segmapit = 0; - } else if (stp->sd_copyflag & STZCVMSAFE) { + } else if (copyflag & STZCVMSAFE) { segmapit = 1; } else { int on = 1; - if (SOP_SETSOCKOPT(VTOSO(vp), + if (socket_setsockopt(VTOSO(vp), SOL_SOCKET, SO_SND_COPYAVOID, - &on, sizeof (on)) == 0) + &on, sizeof (on), CRED()) == 0) segmapit = 1; } } @@ -1085,9 +1028,12 @@ sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv, if (vp->v_type == VSOCK) { dmp->b_wptr = dmp->b_rptr + cnt; - error = kstrwritemp(vp, dmp, fflag); + error = socket_sendmblk(VTOSO(vp), + &msg, fflag, CRED(), &dmp); + if (error != 0) { - freeb(dmp); + if (dmp != NULL) + freeb(dmp); VOP_RWUNLOCK(readvp, readflg, NULL); releasef(sfv->sfv_fd); @@ -1186,45 +1132,11 @@ sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt, switch (vp->v_type) { case VSOCK: so = VTOSO(vp); - /* sendfile not supported for SCTP */ - if (so->so_protocol == IPPROTO_SCTP) { - error = EPROTONOSUPPORT; - goto err; - } is_sock = B_TRUE; - switch (so->so_family) { - case AF_INET: - case AF_INET6: - /* - * Make similar checks done in SOP_WRITE(). 
- */ - if (so->so_state & SS_CANTSENDMORE) { - tsignal(curthread, SIGPIPE); - error = EPIPE; - goto err; - } - if (so->so_type != SOCK_STREAM) { - error = EOPNOTSUPP; - goto err; - } - - if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) != - (SS_ISCONNECTED|SS_ISBOUND)) { - error = ENOTCONN; - goto err; - } - - if ((so->so_state & SS_DIRECT) && - (so->so_priv != NULL) && - (so->so_kssl_ctx == NULL)) { - maxblk = ((tcp_t *)so->so_priv)->tcp_mss; - } else { - maxblk = (int)vp->v_stream->sd_maxblk; - } - break; - default: - error = EAFNOSUPPORT; - goto err; + if (SOCK_IS_NONSTR(so)) { + maxblk = so->so_proto_props.sopp_maxblk; + } else { + maxblk = (int)vp->v_stream->sd_maxblk; } break; case VREG: @@ -1361,21 +1273,18 @@ sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt, * senfilev() function to consume the sfv[]. */ if (is_sock) { - switch (so->so_family) { - case AF_INET: - case AF_INET6: - if (so->so_nl7c_flags != 0) - error = nl7c_sendfilev(so, &fileoff, - sfv, copy_cnt, &count); - else if ((total_size <= (4 * maxblk)) && - error == 0) - error = sendvec_small_chunk(fp, - &fileoff, sfv, copy_cnt, - total_size, maxblk, &count); - else - error = sendvec_chunk(fp, &fileoff, - sfv, copy_cnt, &count); - break; + if (!SOCK_IS_NONSTR(so) && + _SOTOTPI(so)->sti_nl7c_flags != 0) { + error = nl7c_sendfilev(so, &fileoff, + sfv, copy_cnt, &count); + } else if ((total_size <= (4 * maxblk)) && + error == 0) { + error = sendvec_small_chunk(fp, + &fileoff, sfv, copy_cnt, + total_size, maxblk, &count); + } else { + error = sendvec_chunk(fp, &fileoff, + sfv, copy_cnt, &count); } } else { ASSERT(vp->v_type == VREG); diff --git a/usr/src/uts/intel/Makefile.intel.shared b/usr/src/uts/intel/Makefile.intel.shared index 0eba71bc6f..62e23247bf 100644 --- a/usr/src/uts/intel/Makefile.intel.shared +++ b/usr/src/uts/intel/Makefile.intel.shared @@ -565,6 +565,7 @@ MISC_KMODS += kcf MISC_KMODS += kgssapi MISC_KMODS += kmech_dummy MISC_KMODS += kmech_krb5 +MISC_KMODS += 
ksocket MISC_KMODS += mac MISC_KMODS += mixer MISC_KMODS += net80211 @@ -685,6 +686,12 @@ MAC_KMODS += mac_ib DEVNAME_KMODS += sdev_nsconfig_mod # +# socketmod (kernel/socketmod) +# +SOCKET_KMODS += socksctp +SOCKET_KMODS += socksdp + +# # kiconv modules (/kernel/kiconv): # KICONV_KMODS += kiconv_emea kiconv_ja kiconv_ko kiconv_sc kiconv_tc diff --git a/usr/src/uts/intel/ia32/ml/modstubs.s b/usr/src/uts/intel/ia32/ml/modstubs.s index e29afc6c29..0569b9e394 100644 --- a/usr/src/uts/intel/ia32/ml/modstubs.s +++ b/usr/src/uts/intel/ia32/ml/modstubs.s @@ -497,7 +497,10 @@ fcnname/**/_info: \ NO_UNLOAD_STUB(sockfs, snf_segmap, nomod_einval); NO_UNLOAD_STUB(sockfs, sock_getfasync, nomod_zero); NO_UNLOAD_STUB(sockfs, nl7c_sendfilev, nomod_zero); - NO_UNLOAD_STUB(sockfs, sostream_direct, nomod_zero); + NO_UNLOAD_STUB(sockfs, sotpi_sototpi, nomod_zero); + NO_UNLOAD_STUB(sockfs, socket_sendmblk, nomod_zero); + NO_UNLOAD_STUB(sockfs, socket_setsockopt, nomod_zero); + NO_UNLOAD_STUB(sockfs, sod_uioa_mblk_done, nomod_zero); END_MODULE(sockfs); #endif @@ -1278,30 +1281,6 @@ fcnname/**/_info: \ #endif /* - * Stubs for SDP-IB driver. 
- */ -#ifndef SDPIB_MODULE - MODULE(sdpib,drv); - STUB(sdpib, sdp_create, nomod_zero); - STUB(sdpib, sdp_bind, nomod_einval); - STUB(sdpib, sdp_listen, nomod_einval); - STUB(sdpib, sdp_connect, nomod_einval); - STUB(sdpib, sdp_recv, nomod_einval); - STUB(sdpib, sdp_send, nomod_einval); - STUB(sdpib, sdp_getpeername, nomod_einval); - STUB(sdpib, sdp_getsockname, nomod_einval); - STUB(sdpib, sdp_disconnect, nomod_einval); - STUB(sdpib, sdp_shutdown, nomod_einval); - STUB(sdpib, sdp_get_opt, nomod_einval); - STUB(sdpib, sdp_set_opt, nomod_einval); - STUB(sdpib, sdp_close, nomod_void); - STUB(sdpib, sdp_polldata, nomod_zero); - STUB(sdpib, sdp_ioctl, nomod_einval); - END_MODULE(sdpib); -#endif - - -/* * Stubs for kssl, the kernel SSL proxy */ #ifndef KSSL_MODULE @@ -1348,6 +1327,35 @@ fcnname/**/_info: \ END_MODULE(iommulib); #endif +/* + * Stubs for kernel socket, for iscsi + */ +#ifndef KSOCKET_MODULE + MODULE(ksocket, misc); + NO_UNLOAD_STUB(ksocket, ksocket_setsockopt, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_getsockopt, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_getpeername, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_getsockname, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_socket, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_bind, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_listen, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_accept, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_connect, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_recv, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_recvfrom, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_recvmsg, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_send, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_sendto, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_sendmsg, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_ioctl, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_setcallbacks, nomod_minus_one); + 
NO_UNLOAD_STUB(ksocket, ksocket_hold, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_rele, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_shutdown, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_close, nomod_minus_one); + END_MODULE(ksocket); +#endif + / this is just a marker for the area of text that contains stubs ENTRY_NP(stubs_end) diff --git a/usr/src/uts/intel/icmp/Makefile b/usr/src/uts/intel/icmp/Makefile index 25a104ffbb..259530f9dc 100644 --- a/usr/src/uts/intel/icmp/Makefile +++ b/usr/src/uts/intel/icmp/Makefile @@ -21,11 +21,9 @@ # # uts/intel/icmp/Makefile # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" -# # This makefile drives the production of the icmp IP driver # # intel implementation architecture dependent @@ -43,7 +41,7 @@ MODULE = icmp OBJECTS = $(ICMP_OBJS:%=$(OBJS_DIR)/%) LINTS = $(ICMP_OBJS:%.o=$(LINTS_DIR)/%.ln) ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE) -ROOTLINK = $(ROOT_STRMOD_DIR)/$(MODULE) +ROOTLINK = $(ROOT_STRMOD_DIR)/$(MODULE) $(ROOT_SOCK_DIR)/$(MODULE) CONF_SRCDIR = $(UTSBASE)/common/inet/ip # @@ -66,9 +64,9 @@ LINT_TARGET = $(MODULE).lint INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOTLINK) $(ROOT_CONFFILE) # -# depends on ip +# depends on ip and sockfs # -LDFLAGS += -dy -Ndrv/ip +LDFLAGS += -dy -Ndrv/ip -Nfs/sockfs # # For now, disable these lint checks; maintainers should endeavor @@ -100,7 +98,7 @@ clean.lint: $(CLEAN_LINT_DEPS) install: $(INSTALL_DEPS) $(SISCHECK_DEPS) -$(ROOTLINK): $(ROOT_STRMOD_DIR) $(ROOTMODULE) +$(ROOTLINK): $(ROOT_STRMOD_DIR) $(ROOT_SOCK_DIR) $(ROOTMODULE) -$(RM) $@; ln $(ROOTMODULE) $@ # diff --git a/usr/src/uts/intel/icmp/icmp.global-objs.debug64 b/usr/src/uts/intel/icmp/icmp.global-objs.debug64 index ba041c7e17..eeeeedc77e 100644 --- a/usr/src/uts/intel/icmp/icmp.global-objs.debug64 +++ b/usr/src/uts/intel/icmp/icmp.global-objs.debug64 @@ -19,10 
+19,9 @@ # CDDL HEADER END # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" cb_inet_devops fsw @@ -30,5 +29,8 @@ inet_dev_info inet_devops modldrv modlinkage +modlsockmod modlstrmod netdev_privs +smodpriv +smodreg diff --git a/usr/src/uts/intel/idm/Makefile b/usr/src/uts/intel/idm/Makefile index 463a8be02a..870fc039ed 100644 --- a/usr/src/uts/intel/idm/Makefile +++ b/usr/src/uts/intel/idm/Makefile @@ -60,7 +60,7 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) # DEBUG_FLGS = DEBUG_DEFS += $(DEBUG_FLGS) -LDFLAGS += -dy -Nfs/sockfs +LDFLAGS += -dy -Nfs/sockfs -Nmisc/ksocket # # Default build targets. diff --git a/usr/src/uts/intel/ip/ip.global-objs.debug64 b/usr/src/uts/intel/ip/ip.global-objs.debug64 index f4bcb8ab0c..2e501f8abc 100644 --- a/usr/src/uts/intel/ip/ip.global-objs.debug64 +++ b/usr/src/uts/intel/ip/ip.global-objs.debug64 @@ -64,6 +64,7 @@ gcgrp4_hash gcgrp6_hash gcgrp_hash_size gcgrp_lock +icmp_fallback_sock_winit icmp_frag_size_table icmp_g_t_info_ack icmp_ipha @@ -104,6 +105,10 @@ ip_cgtp_filter_rev ip_conn_cache ip_debug ip_g_all_ones +ip_helper_stream_cache +ip_helper_stream_info +ip_helper_stream_rinit +ip_helper_stream_winit ip_ioctl_ftbl ip_ire_cleanup_cnt ip_ire_cpu_ratio @@ -140,6 +145,7 @@ ip_squeue_worker_wait ip_thread_data ip_thread_list ip_thread_rwlock +ip_use_helper_cache ip_wput_frag_mdt_min ipcl_bind_fanout_size ipcl_conn_hash_maxsize @@ -251,6 +257,10 @@ sendq_loop_cnt sin6_null sin_null skip_sctp_cksum +sock_tcp_downcalls +sock_rts_downcalls +sock_rawip_downcalls +sock_udp_downcalls sqset_global_list sqset_global_size sqset_lock @@ -264,6 +274,7 @@ tcp_acceptor_winit tcp_conn_cache tcp_conn_hash_size tcp_drop_ack_unsent_cnt +tcp_fallback_sock_winit tcp_free_list_max_cnt tcp_fusion_rcv_unread_min tcp_g_kstat @@ -303,10 +314,12 @@ tcp_winit tcp_outbound_squeue_switch tcpinfov4 tcpinfov6 +tli_errs 
tsol_strict_error tun_spd_hashsize udp_bind_fanout_size udp_conn_cache +udp_fallback_sock_winit udp_g_t_info_ack_ipv4 udp_g_t_info_ack_ipv6 udp_lrinit diff --git a/usr/src/uts/intel/ip/ip.global-objs.obj64 b/usr/src/uts/intel/ip/ip.global-objs.obj64 index 3866432363..b773f8a5e0 100644 --- a/usr/src/uts/intel/ip/ip.global-objs.obj64 +++ b/usr/src/uts/intel/ip/ip.global-objs.obj64 @@ -64,6 +64,7 @@ gcgrp4_hash gcgrp6_hash gcgrp_hash_size gcgrp_lock +icmp_fallback_sock_winit icmp_frag_size_table icmp_g_t_info_ack icmp_ipha @@ -104,6 +105,10 @@ ip_cgtp_filter_rev ip_conn_cache ip_debug ip_g_all_ones +ip_helper_stream_cache +ip_helper_stream_info +ip_helper_stream_rinit +ip_helper_stream_winit ip_ioctl_ftbl ip_ire_cleanup_cnt ip_ire_cpu_ratio @@ -140,6 +145,7 @@ ip_squeue_worker_wait ip_thread_data ip_thread_list ip_thread_rwlock +ip_use_helper_cache ip_wput_frag_mdt_min ipcl_bind_fanout_size ipcl_conn_hash_maxsize @@ -243,6 +249,10 @@ sctprinit sctpwinit sin6_null sin_null +sock_tcp_downcalls +sock_rts_downcalls +sock_rawip_downcalls +sock_udp_downcalls sqset_global_list sqset_global_size sqset_lock @@ -256,6 +266,7 @@ tcp_acceptor_winit tcp_conn_cache tcp_conn_hash_size tcp_drop_ack_unsent_cnt +tcp_fallback_sock_winit tcp_free_list_max_cnt tcp_fusion_rcv_unread_min tcp_g_kstat @@ -295,10 +306,12 @@ tcp_winit tcp_outbound_squeue_switch tcpinfov4 tcpinfov6 +tli_errs tsol_strict_error tun_spd_hashsize udp_bind_fanout_size udp_conn_cache +udp_fallback_sock_winit udp_g_t_info_ack_ipv4 udp_g_t_info_ack_ipv6 udp_lrinit diff --git a/usr/src/uts/intel/iscsi/Makefile b/usr/src/uts/intel/iscsi/Makefile index 480f9caffa..efff98b964 100644 --- a/usr/src/uts/intel/iscsi/Makefile +++ b/usr/src/uts/intel/iscsi/Makefile @@ -61,7 +61,7 @@ INC_PATH += -I$(UTSBASE)/common/io/scsi/adapters/iscsi # # Note dependancy on misc/scsi. 
# -LDFLAGS += -dy -N"misc/scsi" -N"fs/sockfs" -N"sys/doorfs" -Nmisc/md5 +LDFLAGS += -dy -N"misc/scsi" -N"fs/sockfs" -N"sys/doorfs" -Nmisc/md5 -Nmisc/ksocket LINTFLAGS += -a -erroff=E_BAD_PTR_CAST_ALIGN -erroff=E_PTRDIFF_OVERFLOW LINTFLAGS64 += -a -erroff=E_BAD_PTR_CAST_ALIGN -erroff=E_PTRDIFF_OVERFLOW diff --git a/usr/src/uts/intel/iscsit/Makefile b/usr/src/uts/intel/iscsit/Makefile index 1df1235747..7ecd8be223 100644 --- a/usr/src/uts/intel/iscsit/Makefile +++ b/usr/src/uts/intel/iscsit/Makefile @@ -59,7 +59,7 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) # Overrides and depends_on # MODSTUBS_DIR = $(OBJS_DIR) -LDFLAGS += -dy -Ndrv/stmf -Nmisc/idm -Nfs/sockfs -Nmisc/md5 +LDFLAGS += -dy -Ndrv/stmf -Nmisc/idm -Nfs/sockfs -Nmisc/md5 -Nmisc/ksocket INC_PATH += -I$(UTSBASE)/common/io/comstar/port/iscsit diff --git a/usr/src/uts/intel/ksocket/Makefile b/usr/src/uts/intel/ksocket/Makefile new file mode 100644 index 0000000000..288c777b46 --- /dev/null +++ b/usr/src/uts/intel/ksocket/Makefile @@ -0,0 +1,84 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+# +# +# This makefile drives the production of the kernel socket module +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = ksocket +OBJECTS = $(KSOCKET_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(KSOCKET_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_MISC_DIR)/$(MODULE) + +# +# Include common rules. +# +include $(UTSBASE)/intel/Makefile.intel + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +# +# Overrides. +# +CFLAGS += $(CCVERBOSE) +LDFLAGS += -dy -Nfs/sockfs + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/intel/Makefile.targ diff --git a/usr/src/uts/intel/rts/Makefile b/usr/src/uts/intel/rts/Makefile index 2247001290..8e8ec349a5 100644 --- a/usr/src/uts/intel/rts/Makefile +++ b/usr/src/uts/intel/rts/Makefile @@ -21,11 +21,9 @@ # # uts/intel/rts/Makefile # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# -# ident "%Z%%M% %I% %E% SMI" -# # This makefile drives the production of the rts IP driver # # intel implementation architecture dependent @@ -43,6 +41,7 @@ MODULE = rts OBJECTS = $(RTS_OBJS:%=$(OBJS_DIR)/%) LINTS = $(RTS_OBJS:%.o=$(LINTS_DIR)/%.ln) ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE) +ROOTLINK = $(ROOT_SOCK_DIR)/$(MODULE) CONF_SRCDIR = $(UTSBASE)/common/inet/ip # @@ -65,9 +64,9 @@ LINT_TARGET = $(MODULE).lint INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOTLINK) $(ROOT_CONFFILE) # -# depends on ip +# depends on ip and sockfs # -LDFLAGS += -dy -Ndrv/ip +LDFLAGS += -dy -Ndrv/ip -Nfs/sockfs # # For now, disable these lint checks; maintainers should endeavor @@ -99,7 +98,7 @@ clean.lint: $(CLEAN_LINT_DEPS) install: $(INSTALL_DEPS) $(SISCHECK_DEPS) -$(ROOTLINK): $(ROOT_STRMOD_DIR) $(ROOTMODULE) +$(ROOTLINK): $(ROOT_STRMOD_DIR) $(ROOT_SOCK_DIR) $(ROOTMODULE) -$(RM) $@; ln $(ROOTMODULE) $@ # diff --git a/usr/src/uts/intel/rts/rts.global-objs.debug64 b/usr/src/uts/intel/rts/rts.global-objs.debug64 index 4c699f6410..75b422acf6 100644 --- a/usr/src/uts/intel/rts/rts.global-objs.debug64 +++ b/usr/src/uts/intel/rts/rts.global-objs.debug64 @@ -19,14 +19,15 @@ # CDDL HEADER END # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" cb_inet_devops inet_dev_info inet_devops modldrv modlinkage +modlsockmod netdev_privs +smodreg diff --git a/usr/src/uts/intel/smbsrv/Makefile b/usr/src/uts/intel/smbsrv/Makefile index f8482ba8ce..77ef7351ba 100644 --- a/usr/src/uts/intel/smbsrv/Makefile +++ b/usr/src/uts/intel/smbsrv/Makefile @@ -19,11 +19,9 @@ # CDDL HEADER END # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" -# # This makefile drives the production of the cifs server file system # kernel module. 
# @@ -53,7 +51,8 @@ include $(UTSBASE)/intel/Makefile.intel # Module dependencies # # -LDFLAGS += -dy -Nfs/sockfs -Ndrv/ip -Nstrmod/rpcmod -Nsys/doorfs -Nmisc/kcf +LDFLAGS += -dy -Nfs/sockfs -Nmisc/ksocket -Ndrv/ip -Nstrmod/rpcmod -Nsys/doorfs +LDFLAGS += -Nmisc/kcf # # Define targets diff --git a/usr/src/uts/intel/socksctp/Makefile b/usr/src/uts/intel/socksctp/Makefile new file mode 100644 index 0000000000..fa316464ad --- /dev/null +++ b/usr/src/uts/intel/socksctp/Makefile @@ -0,0 +1,95 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# This makefile drives the production of the socksctp socket +# kernel module. +# +# intel architecture dependent +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = socksctp +OBJECTS = $(SCTP_SOCK_MOD_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(SCTP_SOCK_MOD_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_SOCK_DIR)/$(MODULE) + +# +# Include common rules. 
+# +include $(UTSBASE)/intel/Makefile.intel + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +# +# lint pass one enforcement and OS version +# +CFLAGS += $(CCVERBOSE) + +LDFLAGS += -dy -Nfs/sockfs -Ndrv/ip + +# +# For now, disable these lint checks; maintainers should endeavor +# to investigate and remove these for maximum lint coverage. +# Please do not carry these forward to new Makefiles. +# +LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN +LINTTAGS += -erroff=E_PTRDIFF_OVERFLOW + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/intel/Makefile.targ diff --git a/usr/src/uts/intel/socksdp/Makefile b/usr/src/uts/intel/socksdp/Makefile new file mode 100644 index 0000000000..966b436fce --- /dev/null +++ b/usr/src/uts/intel/socksdp/Makefile @@ -0,0 +1,87 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. 
+# Use is subject to license terms. +# +# This makefile drives the production of the socksdp socket +# kernel module. +# +# intel architecture dependent +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = socksdp +OBJECTS = $(SDP_SOCK_MOD_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(SDP_SOCK_MOD_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_SOCK_DIR)/$(MODULE) + +# +# Include common rules. +# +include $(UTSBASE)/intel/Makefile.intel + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +# +# lint pass one enforcement and OS version +# +CFLAGS += $(CCVERBOSE) + +LDFLAGS += -dy -Nfs/sockfs -Ndrv/ip -Ndrv/sdpib + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/intel/Makefile.targ diff --git a/usr/src/uts/intel/tcp/Makefile b/usr/src/uts/intel/tcp/Makefile index 5bd267f765..d083460646 100644 --- a/usr/src/uts/intel/tcp/Makefile +++ b/usr/src/uts/intel/tcp/Makefile @@ -2,9 +2,8 @@ # CDDL HEADER START # # The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. @@ -22,10 +21,9 @@ # # uts/intel/tcp/Makefile # -# Copyright 2004 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. 
# Use is subject to license terms. # -#pragma ident "%Z%%M% %I% %E% SMI" # # This makefile drives the production of the tcp driver kernel module. # @@ -44,7 +42,7 @@ MODULE = tcp OBJECTS = $(TCP_OBJS:%=$(OBJS_DIR)/%) LINTS = $(TCP_OBJS:%.o=$(LINTS_DIR)/%.ln) ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE) -ROOTLINK = $(ROOT_STRMOD_DIR)/$(MODULE) +ROOTLINK = $(ROOT_STRMOD_DIR)/$(MODULE) $(ROOT_SOCK_DIR)/$(MODULE) CONF_SRCDIR = $(UTSBASE)/common/inet/tcp # @@ -75,9 +73,9 @@ CINLINES = -xinline=tcp_set_ws_value,tcp_fill_header CFLAGS += $(CINLINES) # -# depends on ip and md5 +# depends on ip, md5 and sockfs # -LDFLAGS += -dy -Ndrv/ip -Ncrypto/md5 +LDFLAGS += -dy -Ndrv/ip -Ncrypto/md5 -Nfs/sockfs # # Default build targets. @@ -100,7 +98,7 @@ clean.lint: $(CLEAN_LINT_DEPS) install: $(INSTALL_DEPS) -$(ROOTLINK): $(ROOT_STRMOD_DIR) $(ROOTMODULE) +$(ROOTLINK): $(ROOT_STRMOD_DIR) $(ROOT_SOCK_DIR) $(ROOTMODULE) -$(RM) $@; ln $(ROOTMODULE) $@ # diff --git a/usr/src/uts/intel/udp/Makefile b/usr/src/uts/intel/udp/Makefile index dad550d3cf..c6238ebd8c 100644 --- a/usr/src/uts/intel/udp/Makefile +++ b/usr/src/uts/intel/udp/Makefile @@ -2,9 +2,8 @@ # CDDL HEADER START # # The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. @@ -22,11 +21,9 @@ # # uts/intel/udp/Makefile # -# Copyright 2004 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#pragma ident "%Z%%M% %I% %E% SMI" -# # This makefile drives the production of the udp driver kernel module. 
# # intel implementation architecture dependent @@ -44,7 +41,7 @@ MODULE = udp OBJECTS = $(UDP_OBJS:%=$(OBJS_DIR)/%) LINTS = $(UDP_OBJS:%.o=$(LINTS_DIR)/%.ln) ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE) -ROOTLINK = $(ROOT_STRMOD_DIR)/$(MODULE) +ROOTLINK = $(ROOT_STRMOD_DIR)/$(MODULE) $(ROOT_SOCK_DIR)/$(MODULE) CONF_SRCDIR = $(UTSBASE)/common/inet/udp # @@ -67,9 +64,9 @@ LINT_TARGET = $(MODULE).lint INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOTLINK) $(ROOT_CONFFILE) # -# depends on ip +# depends on ip and sockfs # -LDFLAGS += -dy -Ndrv/ip +LDFLAGS += -dy -Ndrv/ip -Nfs/sockfs # # Default build targets. @@ -92,7 +89,7 @@ clean.lint: $(CLEAN_LINT_DEPS) install: $(INSTALL_DEPS) -$(ROOTLINK): $(ROOT_STRMOD_DIR) $(ROOTMODULE) +$(ROOTLINK): $(ROOT_STRMOD_DIR) $(ROOT_SOCK_DIR) $(ROOTMODULE) -$(RM) $@; ln $(ROOTMODULE) $@ # diff --git a/usr/src/uts/sparc/Makefile.sparc.shared b/usr/src/uts/sparc/Makefile.sparc.shared index 80a188f75a..061befa7e3 100644 --- a/usr/src/uts/sparc/Makefile.sparc.shared +++ b/usr/src/uts/sparc/Makefile.sparc.shared @@ -385,6 +385,7 @@ MISC_KMODS += s1394 MISC_KMODS += hpcsvc pcihp pciehpc pcishpc MISC_KMODS += rsmops MISC_KMODS += kcf +MISC_KMODS += ksocket MISC_KMODS += ibcm MISC_KMODS += ibdm MISC_KMODS += ibmf @@ -486,6 +487,12 @@ MAC_KMODS += mac_ib DEVNAME_KMODS += sdev_nsconfig_mod # +# socketmod (kernel/socketmod) +# +SOCKET_KMODS += socksctp +SOCKET_KMODS += socksdp + +# # kiconv modules (/kernel/kiconv): # KICONV_KMODS += kiconv_emea kiconv_ja kiconv_ko kiconv_sc kiconv_tc diff --git a/usr/src/uts/sparc/icmp/Makefile b/usr/src/uts/sparc/icmp/Makefile index 5fd067b116..55c11a1ea0 100644 --- a/usr/src/uts/sparc/icmp/Makefile +++ b/usr/src/uts/sparc/icmp/Makefile @@ -20,11 +20,9 @@ # # # uts/sparc/icmp/Makefile -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# -#ident "%Z%%M% %I% %E% SMI" -# # This makefile drives the production of the icmp IP driver # # sparc architecture dependent @@ -42,7 +40,7 @@ MODULE = icmp OBJECTS = $(ICMP_OBJS:%=$(OBJS_DIR)/%) LINTS = $(ICMP_OBJS:%.o=$(LINTS_DIR)/%.ln) ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE) -ROOTLINK = $(ROOT_STRMOD_DIR)/$(MODULE) +ROOTLINK = $(ROOT_STRMOD_DIR)/$(MODULE) $(ROOT_SOCK_DIR)/$(MODULE) CONF_SRCDIR = $(UTSBASE)/common/inet/ip # @@ -70,9 +68,9 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOTLINK) $(ROOT_CONFFILE) CFLAGS += $(CCVERBOSE) # -# depends on ip +# depends on ip and sockfs # -LDFLAGS += -dy -Ndrv/ip +LDFLAGS += -dy -Ndrv/ip -Nfs/sockfs # # For now, disable these lint checks; maintainers should endeavor @@ -104,7 +102,7 @@ clean.lint: $(CLEAN_LINT_DEPS) install: $(INSTALL_DEPS) $(SISCHECK_DEPS) -$(ROOTLINK): $(ROOT_STRMOD_DIR) $(ROOTMODULE) +$(ROOTLINK): $(ROOT_STRMOD_DIR) $(ROOT_SOCK_DIR) $(ROOTMODULE) -$(RM) $@; ln $(ROOTMODULE) $@ # diff --git a/usr/src/uts/sparc/icmp/icmp.global-objs.debug64 b/usr/src/uts/sparc/icmp/icmp.global-objs.debug64 index ba041c7e17..eeeeedc77e 100644 --- a/usr/src/uts/sparc/icmp/icmp.global-objs.debug64 +++ b/usr/src/uts/sparc/icmp/icmp.global-objs.debug64 @@ -19,10 +19,9 @@ # CDDL HEADER END # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" cb_inet_devops fsw @@ -30,5 +29,8 @@ inet_dev_info inet_devops modldrv modlinkage +modlsockmod modlstrmod netdev_privs +smodpriv +smodreg diff --git a/usr/src/uts/sparc/idm/Makefile b/usr/src/uts/sparc/idm/Makefile index 6b03fb56df..27535cf198 100644 --- a/usr/src/uts/sparc/idm/Makefile +++ b/usr/src/uts/sparc/idm/Makefile @@ -58,7 +58,7 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) # DEBUG_FLGS = DEBUG_DEFS += $(DEBUG_FLGS) -LDFLAGS += -dy -Nfs/sockfs +LDFLAGS += -dy -Nfs/sockfs -Nmisc/ksocket # # Default build targets. 
diff --git a/usr/src/uts/sparc/ip/ip.global-objs.debug64 b/usr/src/uts/sparc/ip/ip.global-objs.debug64 index f4bcb8ab0c..fabffbc5f5 100644 --- a/usr/src/uts/sparc/ip/ip.global-objs.debug64 +++ b/usr/src/uts/sparc/ip/ip.global-objs.debug64 @@ -64,6 +64,7 @@ gcgrp4_hash gcgrp6_hash gcgrp_hash_size gcgrp_lock +icmp_fallback_sock_winit icmp_frag_size_table icmp_g_t_info_ack icmp_ipha @@ -104,6 +105,10 @@ ip_cgtp_filter_rev ip_conn_cache ip_debug ip_g_all_ones +ip_helper_stream_cache +ip_helper_stream_info +ip_helper_stream_rinit +ip_helper_stream_winit ip_ioctl_ftbl ip_ire_cleanup_cnt ip_ire_cpu_ratio @@ -140,6 +145,7 @@ ip_squeue_worker_wait ip_thread_data ip_thread_list ip_thread_rwlock +ip_use_helper_cache ip_wput_frag_mdt_min ipcl_bind_fanout_size ipcl_conn_hash_maxsize @@ -251,6 +257,10 @@ sendq_loop_cnt sin6_null sin_null skip_sctp_cksum +sock_rawip_downcalls +sock_rts_downcalls +sock_tcp_downcalls +sock_udp_downcalls sqset_global_list sqset_global_size sqset_lock @@ -264,6 +274,7 @@ tcp_acceptor_winit tcp_conn_cache tcp_conn_hash_size tcp_drop_ack_unsent_cnt +tcp_fallback_sock_winit tcp_free_list_max_cnt tcp_fusion_rcv_unread_min tcp_g_kstat @@ -303,10 +314,12 @@ tcp_winit tcp_outbound_squeue_switch tcpinfov4 tcpinfov6 +tli_errs tsol_strict_error tun_spd_hashsize udp_bind_fanout_size udp_conn_cache +udp_fallback_sock_winit udp_g_t_info_ack_ipv4 udp_g_t_info_ack_ipv6 udp_lrinit diff --git a/usr/src/uts/sparc/ip/ip.global-objs.obj64 b/usr/src/uts/sparc/ip/ip.global-objs.obj64 index 3866432363..c7fb907f8c 100644 --- a/usr/src/uts/sparc/ip/ip.global-objs.obj64 +++ b/usr/src/uts/sparc/ip/ip.global-objs.obj64 @@ -64,6 +64,7 @@ gcgrp4_hash gcgrp6_hash gcgrp_hash_size gcgrp_lock +icmp_fallback_sock_winit icmp_frag_size_table icmp_g_t_info_ack icmp_ipha @@ -104,6 +105,10 @@ ip_cgtp_filter_rev ip_conn_cache ip_debug ip_g_all_ones +ip_helper_stream_cache +ip_helper_stream_info +ip_helper_stream_rinit +ip_helper_stream_winit ip_ioctl_ftbl ip_ire_cleanup_cnt ip_ire_cpu_ratio 
@@ -140,6 +145,7 @@ ip_squeue_worker_wait ip_thread_data ip_thread_list ip_thread_rwlock +ip_use_helper_cache ip_wput_frag_mdt_min ipcl_bind_fanout_size ipcl_conn_hash_maxsize @@ -243,6 +249,10 @@ sctprinit sctpwinit sin6_null sin_null +sock_rawip_downcalls +sock_rts_downcalls +sock_tcp_downcalls +sock_udp_downcalls sqset_global_list sqset_global_size sqset_lock @@ -256,6 +266,7 @@ tcp_acceptor_winit tcp_conn_cache tcp_conn_hash_size tcp_drop_ack_unsent_cnt +tcp_fallback_sock_winit tcp_free_list_max_cnt tcp_fusion_rcv_unread_min tcp_g_kstat @@ -295,10 +306,12 @@ tcp_winit tcp_outbound_squeue_switch tcpinfov4 tcpinfov6 +tli_errs tsol_strict_error tun_spd_hashsize udp_bind_fanout_size udp_conn_cache +udp_fallback_sock_winit udp_g_t_info_ack_ipv4 udp_g_t_info_ack_ipv6 udp_lrinit diff --git a/usr/src/uts/sparc/iscsi/Makefile b/usr/src/uts/sparc/iscsi/Makefile index 0e35ba9d0d..437d9b5838 100644 --- a/usr/src/uts/sparc/iscsi/Makefile +++ b/usr/src/uts/sparc/iscsi/Makefile @@ -61,7 +61,7 @@ INC_PATH += -I$(UTSBASE)/common/io/scsi/adapters/iscsi # # Note dependancy on misc/scsi. 
# -LDFLAGS += -dy -N"misc/scsi" -N"fs/sockfs" -N"sys/doorfs" -N"misc/md5" +LDFLAGS += -dy -N"misc/scsi" -N"fs/sockfs" -N"sys/doorfs" -N"misc/md5" -Nmisc/ksocket LINTFLAGS += -a -erroff=E_BAD_PTR_CAST_ALIGN -erroff=E_PTRDIFF_OVERFLOW LINTFLAGS64 += -a -erroff=E_BAD_PTR_CAST_ALIGN -erroff=E_PTRDIFF_OVERFLOW diff --git a/usr/src/uts/sparc/iscsit/Makefile b/usr/src/uts/sparc/iscsit/Makefile index 1df1235747..7ecd8be223 100644 --- a/usr/src/uts/sparc/iscsit/Makefile +++ b/usr/src/uts/sparc/iscsit/Makefile @@ -59,7 +59,7 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) # Overrides and depends_on # MODSTUBS_DIR = $(OBJS_DIR) -LDFLAGS += -dy -Ndrv/stmf -Nmisc/idm -Nfs/sockfs -Nmisc/md5 +LDFLAGS += -dy -Ndrv/stmf -Nmisc/idm -Nfs/sockfs -Nmisc/md5 -Nmisc/ksocket INC_PATH += -I$(UTSBASE)/common/io/comstar/port/iscsit diff --git a/usr/src/uts/sparc/ksocket/Makefile b/usr/src/uts/sparc/ksocket/Makefile new file mode 100644 index 0000000000..287a7cfda6 --- /dev/null +++ b/usr/src/uts/sparc/ksocket/Makefile @@ -0,0 +1,84 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+ +# +# This makefile drives the production of the kernel socket module +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = ksocket +OBJECTS = $(KSOCKET_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(KSOCKET_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_MISC_DIR)/$(MODULE) + +# +# Include common rules. +# +include $(UTSBASE)/sparc/Makefile.sparc + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +# +# Overrides. +# +CFLAGS += $(CCVERBOSE) +LDFLAGS += -dy -Nfs/sockfs + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/intel/Makefile.targ diff --git a/usr/src/uts/sparc/ml/modstubs.s b/usr/src/uts/sparc/ml/modstubs.s index e315c9857c..e3379799a7 100644 --- a/usr/src/uts/sparc/ml/modstubs.s +++ b/usr/src/uts/sparc/ml/modstubs.s @@ -385,7 +385,10 @@ stubs_base: NO_UNLOAD_STUB(sockfs, snf_segmap, nomod_einval); NO_UNLOAD_STUB(sockfs, sock_getfasync, nomod_zero); NO_UNLOAD_STUB(sockfs, nl7c_sendfilev, nomod_zero); - NO_UNLOAD_STUB(sockfs, sostream_direct, nomod_zero); + NO_UNLOAD_STUB(sockfs, sotpi_sototpi, nomod_zero); + NO_UNLOAD_STUB(sockfs, socket_sendmblk, nomod_zero); + NO_UNLOAD_STUB(sockfs, socket_setsockopt, nomod_zero); + NO_UNLOAD_STUB(sockfs, sod_uioa_mblk_done, nomod_zero); END_MODULE(sockfs); #endif @@ -1232,27 +1235,6 @@ stubs_base: END_MODULE(softmac); #endif -#ifndef SDPIB_MODULE - MODULE(sdpib,drv); - STUB(sdpib, sdp_create, nomod_zero); - STUB(sdpib, sdp_bind, nomod_einval); - STUB(sdpib, sdp_listen, nomod_einval); - STUB(sdpib, sdp_connect, nomod_einval); - STUB(sdpib, sdp_recv, nomod_einval); - STUB(sdpib, sdp_send, 
nomod_einval); - STUB(sdpib, sdp_getpeername, nomod_einval); - STUB(sdpib, sdp_getsockname, nomod_einval); - STUB(sdpib, sdp_disconnect, nomod_einval); - STUB(sdpib, sdp_shutdown, nomod_einval); - STUB(sdpib, sdp_get_opt, nomod_einval); - STUB(sdpib, sdp_set_opt, nomod_einval); - STUB(sdpib, sdp_close, nomod_void); - STUB(sdpib, sdp_polldata, nomod_zero); - STUB(sdpib, sdp_ioctl, nomod_einval); - END_MODULE(sdpib); -#endif - - /* * Stubs for kssl, the kernel SSL proxy */ @@ -1294,6 +1276,35 @@ stubs_base: END_MODULE(ipnet); #endif +/* + * Stubs for kernel socket, for iscsi + */ +#ifndef KSOCKET_MODULE + MODULE(ksocket, misc); + NO_UNLOAD_STUB(ksocket, ksocket_setsockopt, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_getsockopt, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_getpeername, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_getsockname, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_socket, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_bind, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_listen, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_accept, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_connect, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_recv, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_recvfrom, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_recvmsg, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_send, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_sendto, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_sendmsg, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_ioctl, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_setcallbacks, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_hold, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_rele, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_shutdown, nomod_minus_one); + NO_UNLOAD_STUB(ksocket, ksocket_close, nomod_minus_one); + END_MODULE(ksocket); +#endif + ! 
this is just a marker for the area of text that contains stubs .seg ".text" .global stubs_end diff --git a/usr/src/uts/sparc/rts/Makefile b/usr/src/uts/sparc/rts/Makefile index ff635303bc..4078c24237 100644 --- a/usr/src/uts/sparc/rts/Makefile +++ b/usr/src/uts/sparc/rts/Makefile @@ -20,11 +20,9 @@ # # # uts/sparc/rts/Makefile -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" -# # This makefile drives the production of the rts IP driver # # sparc architecture dependent @@ -42,6 +40,7 @@ MODULE = rts OBJECTS = $(RTS_OBJS:%=$(OBJS_DIR)/%) LINTS = $(RTS_OBJS:%.o=$(LINTS_DIR)/%.ln) ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE) +ROOTLINK = $(ROOT_SOCK_DIR)/$(MODULE) CONF_SRCDIR = $(UTSBASE)/common/inet/ip # @@ -69,9 +68,9 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOTLINK) $(ROOT_CONFFILE) CFLAGS += $(CCVERBOSE) # -# depends on tun +# depends on ip and sockfs # -LDFLAGS += -dy -Ndrv/ip +LDFLAGS += -dy -Ndrv/ip -Nfs/sockfs # # For now, disable these lint checks; maintainers should endeavor @@ -103,7 +102,7 @@ clean.lint: $(CLEAN_LINT_DEPS) install: $(INSTALL_DEPS) $(SISCHECK_DEPS) -$(ROOTLINK): $(ROOT_STRMOD_DIR) $(ROOTMODULE) +$(ROOTLINK): $(ROOT_STRMOD_DIR) $(ROOT_SOCK_DIR) $(ROOTMODULE) -$(RM) $@; ln $(ROOTMODULE) $@ # diff --git a/usr/src/uts/sparc/rts/rts.global-objs.debug64 b/usr/src/uts/sparc/rts/rts.global-objs.debug64 index 4c699f6410..75b422acf6 100644 --- a/usr/src/uts/sparc/rts/rts.global-objs.debug64 +++ b/usr/src/uts/sparc/rts/rts.global-objs.debug64 @@ -19,14 +19,15 @@ # CDDL HEADER END # # -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. 
# -# ident "%Z%%M% %I% %E% SMI" cb_inet_devops inet_dev_info inet_devops modldrv modlinkage +modlsockmod netdev_privs +smodreg diff --git a/usr/src/uts/sparc/smbsrv/Makefile b/usr/src/uts/sparc/smbsrv/Makefile index 71c4cc5398..023d1c1cd5 100644 --- a/usr/src/uts/sparc/smbsrv/Makefile +++ b/usr/src/uts/sparc/smbsrv/Makefile @@ -19,11 +19,8 @@ # CDDL HEADER END # # -# -# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. -# -#ident "%Z%%M% %I% %E% SMI" # # This makefile drives the production of the cifs server file system @@ -52,7 +49,8 @@ include $(UTSBASE)/sparc/Makefile.sparc # # Module dependencies # -LDFLAGS += -dy -Nfs/sockfs -Ndrv/ip -Nstrmod/rpcmod -Nsys/doorfs -Nmisc/kcf +LDFLAGS += -dy -Nfs/sockfs -Nmisc/ksocket -Ndrv/ip -Nstrmod/rpcmod -Nsys/doorfs +LDFLAGS += -Nmisc/kcf # # Define targets diff --git a/usr/src/uts/sparc/socksctp/Makefile b/usr/src/uts/sparc/socksctp/Makefile new file mode 100644 index 0000000000..5acab4cfb1 --- /dev/null +++ b/usr/src/uts/sparc/socksctp/Makefile @@ -0,0 +1,96 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2008 Sun Microsystems, Inc. 
All rights reserved. +# Use is subject to license terms. + +# +# This makefile drives the production of the socksctp socket +# kernel module. +# +# sparc architecture dependent +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = socksctp +OBJECTS = $(SCTP_SOCK_MOD_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(SCTP_SOCK_MOD_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_SOCK_DIR)/$(MODULE) + +# +# Include common rules. +# +include $(UTSBASE)/sparc/Makefile.sparc + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +# +# lint pass one enforcement and OS version +# +CFLAGS += $(CCVERBOSE) + +LDFLAGS += -dy -Nfs/sockfs -Ndrv/ip + +# +# For now, disable these lint checks; maintainers should endeavor +# to investigate and remove these for maximum lint coverage. +# Please do not carry these forward to new Makefiles. +# +LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN +LINTTAGS += -erroff=E_PTRDIFF_OVERFLOW + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. +# +include $(UTSBASE)/sparc/Makefile.targ diff --git a/usr/src/uts/sparc/socksdp/Makefile b/usr/src/uts/sparc/socksdp/Makefile new file mode 100644 index 0000000000..6970c44faf --- /dev/null +++ b/usr/src/uts/sparc/socksdp/Makefile @@ -0,0 +1,88 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. 
+# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. + +# +# This makefile drives the production of the socksdp socket +# kernel module. +# +# sparc architecture dependent +# + +# +# Path to the base of the uts directory tree (usually /usr/src/uts). +# +UTSBASE = ../.. + +# +# Define the module and object file sets. +# +MODULE = socksdp +OBJECTS = $(SDP_SOCK_MOD_OBJS:%=$(OBJS_DIR)/%) +LINTS = $(SDP_SOCK_MOD_OBJS:%.o=$(LINTS_DIR)/%.ln) +ROOTMODULE = $(ROOT_SOCK_DIR)/$(MODULE) + +# +# Include common rules. +# +include $(UTSBASE)/sparc/Makefile.sparc + +# +# Define targets +# +ALL_TARGET = $(BINARY) +LINT_TARGET = $(MODULE).lint +INSTALL_TARGET = $(BINARY) $(ROOTMODULE) + +# +# lint pass one enforcement and OS version +# +CFLAGS += $(CCVERBOSE) + +LDFLAGS += -dy -Nfs/sockfs -Ndrv/ip + +# +# Default build targets. +# +.KEEP_STATE: + +def: $(DEF_DEPS) + +all: $(ALL_DEPS) + +clean: $(CLEAN_DEPS) + +clobber: $(CLOBBER_DEPS) + +lint: $(LINT_DEPS) + +modlintlib: $(MODLINTLIB_DEPS) + +clean.lint: $(CLEAN_LINT_DEPS) + +install: $(INSTALL_DEPS) + +# +# Include common targets. 
+# +include $(UTSBASE)/sparc/Makefile.targ diff --git a/usr/src/uts/sparc/tcp/Makefile b/usr/src/uts/sparc/tcp/Makefile index 192fda758f..7276ecfaeb 100644 --- a/usr/src/uts/sparc/tcp/Makefile +++ b/usr/src/uts/sparc/tcp/Makefile @@ -2,9 +2,8 @@ # CDDL HEADER START # # The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. @@ -21,11 +20,9 @@ # # # uts/sparc/tcp/Makefile -# Copyright 2004 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" -# # This makefile drives the production of the tcp driver kernel module. # # sparc architecture dependent @@ -43,7 +40,7 @@ MODULE = tcp OBJECTS = $(TCP_OBJS:%=$(OBJS_DIR)/%) LINTS = $(TCP_OBJS:%.o=$(LINTS_DIR)/%.ln) ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE) -ROOTLINK = $(ROOT_STRMOD_DIR)/$(MODULE) +ROOTLINK = $(ROOT_STRMOD_DIR)/$(MODULE) $(ROOT_SOCK_DIR)/$(MODULE) CONF_SRCDIR = $(UTSBASE)/common/inet/tcp # @@ -77,9 +74,9 @@ CFLAGS += $(CCVERBOSE) CFLAGS += -xinline=tcp_set_ws_value,tcp_fill_header # -# depends on ip and md5 +# depends on ip, md5 and sockfs # -LDFLAGS += -dy -Ndrv/ip -Ncrypto/md5 +LDFLAGS += -dy -Ndrv/ip -Ncrypto/md5 -Nfs/sockfs # # Default build targets. 
@@ -102,7 +99,7 @@ clean.lint: $(CLEAN_LINT_DEPS) install: $(INSTALL_DEPS) -$(ROOTLINK): $(ROOT_STRMOD_DIR) $(ROOTMODULE) +$(ROOTLINK): $(ROOT_STRMOD_DIR) $(ROOT_SOCK_DIR) $(ROOTMODULE) -$(RM) $@; ln $(ROOTMODULE) $@ # diff --git a/usr/src/uts/sparc/udp/Makefile b/usr/src/uts/sparc/udp/Makefile index c0deb87087..07a4435112 100644 --- a/usr/src/uts/sparc/udp/Makefile +++ b/usr/src/uts/sparc/udp/Makefile @@ -2,9 +2,8 @@ # CDDL HEADER START # # The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. @@ -21,11 +20,9 @@ # # # uts/sparc/udp/Makefile -# Copyright 2004 Sun Microsystems, Inc. All rights reserved. +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" -# # This makefile drives the production of the udp driver kernel module. # # sparc architecture dependent @@ -43,7 +40,7 @@ MODULE = udp OBJECTS = $(UDP_OBJS:%=$(OBJS_DIR)/%) LINTS = $(UDP_OBJS:%.o=$(LINTS_DIR)/%.ln) ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE) -ROOTLINK = $(ROOT_STRMOD_DIR)/$(MODULE) +ROOTLINK = $(ROOT_STRMOD_DIR)/$(MODULE) $(ROOT_SOCK_DIR)/$(MODULE) CONF_SRCDIR = $(UTSBASE)/common/inet/udp # @@ -71,9 +68,9 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOTLINK) $(ROOT_CONFFILE) CFLAGS += $(CCVERBOSE) # -# depends on ip +# depends on ip and sockfs # -LDFLAGS += -dy -Ndrv/ip +LDFLAGS += -dy -Ndrv/ip -Nfs/sockfs # # Default build targets. 
@@ -96,7 +93,7 @@ clean.lint: $(CLEAN_LINT_DEPS) install: $(INSTALL_DEPS) -$(ROOTLINK): $(ROOT_STRMOD_DIR) $(ROOTMODULE) +$(ROOTLINK): $(ROOT_STRMOD_DIR) $(ROOT_SOCK_DIR) $(ROOTMODULE) -$(RM) $@; ln $(ROOTMODULE) $@ # |