summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Yu Xiangning <Eric.Yu@Sun.COM> 2008-12-11 20:04:13 -0800
committer Yu Xiangning <Eric.Yu@Sun.COM> 2008-12-11 20:04:13 -0800
commit 0f1702c5201310f0529cd5abb77652e5e9b241b6 (patch)
tree 83bbea7ada9d11097f73645900c06c37d8346669
parent 9a9ae70f32271d74856130e37667ca926b27feb4 (diff)
download illumos-joyent-0f1702c5201310f0529cd5abb77652e5e9b241b6.tar.gz
PSARC 2007/587 Volo -- Low Latency Socket Framework
PSARC 2008/694 Volo Interfaces Amendment
6765829 Integration of project Volo PSARC/2007/587
6644935 mblk cred_t reference counting limits scalability
6693633 TCP receive does not scale because of heavy refcounting of cred structures
4764841 connect/accept is slow on Solaris when compared to Linux
5105708 socket creation retains hold on accessvp
4764836 setsockopt is slow on Solaris when compared to Linux
4772191 socket close(2) is slow on Solaris when compared to Linux
--HG--
rename : usr/src/uts/common/fs/sockfs/socksctp.c => usr/src/uts/common/inet/sockmods/socksctp.c
rename : usr/src/uts/common/fs/sockfs/socksctp.h => usr/src/uts/common/inet/sockmods/socksctp.h
rename : usr/src/uts/common/fs/sockfs/socksctpsubr.c => usr/src/uts/common/inet/sockmods/socksctpsubr.c
rename : usr/src/uts/common/fs/sockfs/socksdp.c => usr/src/uts/common/inet/sockmods/socksdp.c
rename : usr/src/uts/common/fs/sockfs/socksdp.h => usr/src/uts/common/inet/sockmods/socksdp.h
-rw-r--r--usr/src/cmd/cmd-inet/etc/sock2path41
-rw-r--r--usr/src/cmd/cmd-inet/usr.bin/netstat/unix.c25
-rw-r--r--usr/src/cmd/cmd-inet/usr.sbin/soconfig.c48
-rw-r--r--usr/src/cmd/mdb/Makefile.common1
-rw-r--r--usr/src/cmd/mdb/common/modules/genunix/net.c48
-rw-r--r--usr/src/cmd/mdb/common/modules/genunix/vfs.c484
-rw-r--r--usr/src/cmd/mdb/common/modules/sockfs/sockfs.c154
-rw-r--r--usr/src/cmd/mdb/intel/amd64/sockfs/Makefile33
-rw-r--r--usr/src/cmd/mdb/intel/ia32/sockfs/Makefile32
-rw-r--r--usr/src/cmd/mdb/sparc/v9/sockfs/Makefile33
-rw-r--r--usr/src/pkgdefs/SUNWckr/prototype_com1
-rw-r--r--usr/src/pkgdefs/SUNWckr/prototype_i38613
-rw-r--r--usr/src/pkgdefs/SUNWckr/prototype_sparc7
-rw-r--r--usr/src/pkgdefs/SUNWhea/prototype_com2
-rw-r--r--usr/src/pkgdefs/SUNWibsdp/postinstall9
-rw-r--r--usr/src/pkgdefs/SUNWibsdp/preremove9
-rw-r--r--usr/src/pkgdefs/SUNWibsdp/prototype_i3868
-rw-r--r--usr/src/pkgdefs/SUNWibsdp/prototype_sparc7
-rw-r--r--usr/src/pkgdefs/SUNWmdb/prototype_i3862
-rw-r--r--usr/src/pkgdefs/SUNWmdb/prototype_sparc1
-rw-r--r--usr/src/pkgdefs/SUNWmdbr/prototype_i38610
-rw-r--r--usr/src/pkgdefs/SUNWmdbr/prototype_sparc12
-rw-r--r--usr/src/pkgdefs/common_files/i.sock2path29
-rw-r--r--usr/src/uts/Makefile.targ20
-rw-r--r--usr/src/uts/Makefile.uts13
-rw-r--r--usr/src/uts/common/Makefile.files18
-rw-r--r--usr/src/uts/common/Makefile.rules14
-rw-r--r--usr/src/uts/common/c2/audit_event.c274
-rw-r--r--usr/src/uts/common/fs/smbsrv/smb_negotiate.c24
-rw-r--r--usr/src/uts/common/fs/smbsrv/smb_net.c160
-rw-r--r--usr/src/uts/common/fs/smbsrv/smb_server.c31
-rw-r--r--usr/src/uts/common/fs/smbsrv/smb_session.c22
-rw-r--r--usr/src/uts/common/fs/sockfs/nl7c.c47
-rw-r--r--usr/src/uts/common/fs/sockfs/nl7c.h19
-rw-r--r--usr/src/uts/common/fs/sockfs/nl7chttp.c35
-rw-r--r--usr/src/uts/common/fs/sockfs/nl7curi.c74
-rw-r--r--usr/src/uts/common/fs/sockfs/sockcommon.c1092
-rw-r--r--usr/src/uts/common/fs/sockfs/sockcommon.h246
-rw-r--r--usr/src/uts/common/fs/sockfs/sockcommon_sops.c1696
-rw-r--r--usr/src/uts/common/fs/sockfs/sockcommon_subr.c1970
-rw-r--r--usr/src/uts/common/fs/sockfs/sockcommon_vnops.c482
-rw-r--r--usr/src/uts/common/fs/sockfs/socknotify.c379
-rw-r--r--usr/src/uts/common/fs/sockfs/sockparams.c723
-rw-r--r--usr/src/uts/common/fs/sockfs/socksctp.c2773
-rw-r--r--usr/src/uts/common/fs/sockfs/socksctpvnops.c875
-rwxr-xr-xusr/src/uts/common/fs/sockfs/socksdp.h85
-rwxr-xr-xusr/src/uts/common/fs/sockfs/socksdpsubr.c214
-rw-r--r--usr/src/uts/common/fs/sockfs/socksdpvnops.c535
-rw-r--r--usr/src/uts/common/fs/sockfs/sockssl.c9
-rw-r--r--usr/src/uts/common/fs/sockfs/sockstr.c744
-rw-r--r--usr/src/uts/common/fs/sockfs/socksubr.c693
-rw-r--r--usr/src/uts/common/fs/sockfs/socksyscalls.c642
-rw-r--r--usr/src/uts/common/fs/sockfs/socktpi.c2735
-rw-r--r--usr/src/uts/common/fs/sockfs/socktpi.h282
-rw-r--r--usr/src/uts/common/fs/sockfs/socktpi_impl.h99
-rw-r--r--usr/src/uts/common/fs/sockfs/sockvnops.c1438
-rw-r--r--usr/src/uts/common/inet/inetddi.c55
-rw-r--r--usr/src/uts/common/inet/ip.h24
-rw-r--r--usr/src/uts/common/inet/ip/icmp.c3232
-rw-r--r--usr/src/uts/common/inet/ip/icmp_opt_data.c16
-rw-r--r--usr/src/uts/common/inet/ip/icmpddi.c6
-rw-r--r--usr/src/uts/common/inet/ip/ip.c487
-rw-r--r--usr/src/uts/common/inet/ip/ip6.c497
-rw-r--r--usr/src/uts/common/inet/ip/ip6_if.c39
-rw-r--r--usr/src/uts/common/inet/ip/ip_helper_stream.c482
-rw-r--r--usr/src/uts/common/inet/ip/ip_if.c50
-rw-r--r--usr/src/uts/common/inet/ip/ip_opt_data.c12
-rw-r--r--usr/src/uts/common/inet/ip/ip_rts.c52
-rw-r--r--usr/src/uts/common/inet/ip/ipclassifier.c79
-rw-r--r--usr/src/uts/common/inet/ip/keysock.c7
-rw-r--r--usr/src/uts/common/inet/ip/rts.c851
-rw-r--r--usr/src/uts/common/inet/ip/rts_opt_data.c17
-rw-r--r--usr/src/uts/common/inet/ip/rtsddi.c12
-rw-r--r--usr/src/uts/common/inet/ip/spdsock.c7
-rw-r--r--usr/src/uts/common/inet/ip6.h11
-rw-r--r--usr/src/uts/common/inet/ip_if.h3
-rw-r--r--usr/src/uts/common/inet/ip_impl.h18
-rw-r--r--usr/src/uts/common/inet/ip_rts.h9
-rw-r--r--usr/src/uts/common/inet/ip_stack.h2
-rw-r--r--usr/src/uts/common/inet/ipclassifier.h38
-rw-r--r--usr/src/uts/common/inet/mi.c94
-rw-r--r--usr/src/uts/common/inet/mi.h17
-rw-r--r--usr/src/uts/common/inet/optcom.c194
-rw-r--r--usr/src/uts/common/inet/optcom.h13
-rw-r--r--usr/src/uts/common/inet/proto_set.c440
-rw-r--r--usr/src/uts/common/inet/proto_set.h58
-rw-r--r--usr/src/uts/common/inet/rawip_impl.h29
-rw-r--r--usr/src/uts/common/inet/rts_impl.h37
-rw-r--r--usr/src/uts/common/inet/sctp/sctp.c23
-rw-r--r--usr/src/uts/common/inet/sctp/sctp_bind.c12
-rw-r--r--usr/src/uts/common/inet/sctp/sctp_common.c11
-rw-r--r--usr/src/uts/common/inet/sctp/sctp_conn.c34
-rw-r--r--usr/src/uts/common/inet/sctp/sctp_cookie.c6
-rw-r--r--usr/src/uts/common/inet/sctp/sctp_impl.h31
-rw-r--r--usr/src/uts/common/inet/sctp/sctp_input.c47
-rw-r--r--usr/src/uts/common/inet/sctp/sctp_notify.c16
-rw-r--r--usr/src/uts/common/inet/sctp/sctp_opt_data.c7
-rw-r--r--usr/src/uts/common/inet/sctp/sctp_output.c25
-rw-r--r--usr/src/uts/common/inet/sctp/sctp_shutdown.c6
-rw-r--r--usr/src/uts/common/inet/sctp_itf.h31
-rw-r--r--usr/src/uts/common/inet/sockmods/sockmod_sctp.c221
-rw-r--r--usr/src/uts/common/inet/sockmods/sockmod_sdp.c154
-rw-r--r--usr/src/uts/common/inet/sockmods/socksctp.c2105
-rw-r--r--usr/src/uts/common/inet/sockmods/socksctp.h (renamed from usr/src/uts/common/fs/sockfs/socksctp.h)44
-rw-r--r--usr/src/uts/common/inet/sockmods/socksctpsubr.c (renamed from usr/src/uts/common/fs/sockfs/socksctpsubr.c)199
-rw-r--r--usr/src/uts/common/inet/sockmods/socksdp.c (renamed from usr/src/uts/common/fs/sockfs/socksdp.c)1024
-rw-r--r--usr/src/uts/common/inet/sockmods/socksdp.h44
-rw-r--r--usr/src/uts/common/inet/sockmods/socksdpsubr.c60
-rw-r--r--usr/src/uts/common/inet/spdsock.h5
-rw-r--r--usr/src/uts/common/inet/squeue.c139
-rw-r--r--usr/src/uts/common/inet/tcp.h17
-rw-r--r--usr/src/uts/common/inet/tcp/tcp.c5242
-rw-r--r--usr/src/uts/common/inet/tcp/tcp_fusion.c225
-rw-r--r--usr/src/uts/common/inet/tcp/tcp_opt_data.c16
-rw-r--r--usr/src/uts/common/inet/tcp/tcpddi.c6
-rw-r--r--usr/src/uts/common/inet/tcp_impl.h9
-rw-r--r--usr/src/uts/common/inet/tcp_stack.h3
-rw-r--r--usr/src/uts/common/inet/udp/udp.c4128
-rw-r--r--usr/src/uts/common/inet/udp/udp_opt_data.c14
-rw-r--r--usr/src/uts/common/inet/udp/udpddi.c5
-rw-r--r--usr/src/uts/common/inet/udp_impl.h41
-rw-r--r--usr/src/uts/common/io/comstar/port/iscsit/iscsit_isns.c107
-rw-r--r--usr/src/uts/common/io/comstar/port/iscsit/iscsit_isns.h2
-rw-r--r--usr/src/uts/common/io/comstar/port/iscsit/iscsit_radiuspacket.c81
-rw-r--r--usr/src/uts/common/io/ib/clients/rds/rds_opt.c8
-rw-r--r--usr/src/uts/common/io/ib/clients/rds/rdsddi.c13
-rw-r--r--usr/src/uts/common/io/ib/clients/sdp/sdpddi.c34
-rw-r--r--usr/src/uts/common/io/idm/idm_so.c271
-rw-r--r--usr/src/uts/common/io/ksocket/ksocket.c733
-rw-r--r--usr/src/uts/common/io/ksocket/ksocket_impl.h74
-rw-r--r--usr/src/uts/common/io/ksocket/ksocket_mod.c57
-rw-r--r--usr/src/uts/common/io/scsi/adapters/iscsi/iscsi.h4
-rw-r--r--usr/src/uts/common/io/scsi/adapters/iscsi/iscsi_ioctl.c23
-rw-r--r--usr/src/uts/common/io/scsi/adapters/iscsi/iscsi_login.c15
-rw-r--r--usr/src/uts/common/io/scsi/adapters/iscsi/iscsi_net.c417
-rw-r--r--usr/src/uts/common/io/scsi/adapters/iscsi/isns_client.c46
-rw-r--r--usr/src/uts/common/io/sock_conf.c251
-rw-r--r--usr/src/uts/common/io/strplumb.c1
-rw-r--r--usr/src/uts/common/netinet/icmp6.h10
-rw-r--r--usr/src/uts/common/os/fio.c9
-rw-r--r--usr/src/uts/common/os/modconf.c67
-rw-r--r--usr/src/uts/common/os/move.c2
-rw-r--r--usr/src/uts/common/os/streamio.c46
-rw-r--r--usr/src/uts/common/os/strsubr.c16
-rw-r--r--usr/src/uts/common/smbsrv/smb_kproto.h15
-rw-r--r--usr/src/uts/common/smbsrv/smb_ktypes.h6
-rw-r--r--usr/src/uts/common/sys/Makefile2
-rw-r--r--usr/src/uts/common/sys/idm/idm_so.h20
-rw-r--r--usr/src/uts/common/sys/iscsit/radius_packet.h6
-rw-r--r--usr/src/uts/common/sys/ksocket.h127
-rw-r--r--usr/src/uts/common/sys/modctl.h10
-rw-r--r--usr/src/uts/common/sys/socket.h39
-rw-r--r--usr/src/uts/common/sys/socket_proto.h182
-rw-r--r--usr/src/uts/common/sys/socketvar.h732
-rw-r--r--usr/src/uts/common/sys/sockio.h8
-rw-r--r--usr/src/uts/common/sys/sodirect.h35
-rw-r--r--usr/src/uts/common/sys/squeue.h3
-rw-r--r--usr/src/uts/common/sys/squeue_impl.h2
-rw-r--r--usr/src/uts/common/sys/stream.h1
-rw-r--r--usr/src/uts/common/sys/strsubr.h3
-rw-r--r--usr/src/uts/common/syscall/sendfile.c237
-rw-r--r--usr/src/uts/intel/Makefile.intel.shared7
-rw-r--r--usr/src/uts/intel/ia32/ml/modstubs.s58
-rw-r--r--usr/src/uts/intel/icmp/Makefile12
-rw-r--r--usr/src/uts/intel/icmp/icmp.global-objs.debug646
-rw-r--r--usr/src/uts/intel/idm/Makefile2
-rw-r--r--usr/src/uts/intel/ip/ip.global-objs.debug6413
-rw-r--r--usr/src/uts/intel/ip/ip.global-objs.obj6413
-rw-r--r--usr/src/uts/intel/iscsi/Makefile2
-rw-r--r--usr/src/uts/intel/iscsit/Makefile2
-rw-r--r--usr/src/uts/intel/ksocket/Makefile84
-rw-r--r--usr/src/uts/intel/rts/Makefile11
-rw-r--r--usr/src/uts/intel/rts/rts.global-objs.debug645
-rw-r--r--usr/src/uts/intel/smbsrv/Makefile7
-rw-r--r--usr/src/uts/intel/socksctp/Makefile95
-rw-r--r--usr/src/uts/intel/socksdp/Makefile87
-rw-r--r--usr/src/uts/intel/tcp/Makefile16
-rw-r--r--usr/src/uts/intel/udp/Makefile17
-rw-r--r--usr/src/uts/sparc/Makefile.sparc.shared7
-rw-r--r--usr/src/uts/sparc/icmp/Makefile12
-rw-r--r--usr/src/uts/sparc/icmp/icmp.global-objs.debug646
-rw-r--r--usr/src/uts/sparc/idm/Makefile2
-rw-r--r--usr/src/uts/sparc/ip/ip.global-objs.debug6413
-rw-r--r--usr/src/uts/sparc/ip/ip.global-objs.obj6413
-rw-r--r--usr/src/uts/sparc/iscsi/Makefile2
-rw-r--r--usr/src/uts/sparc/iscsit/Makefile2
-rw-r--r--usr/src/uts/sparc/ksocket/Makefile84
-rw-r--r--usr/src/uts/sparc/ml/modstubs.s55
-rw-r--r--usr/src/uts/sparc/rts/Makefile11
-rw-r--r--usr/src/uts/sparc/rts/rts.global-objs.debug645
-rw-r--r--usr/src/uts/sparc/smbsrv/Makefile8
-rw-r--r--usr/src/uts/sparc/socksctp/Makefile96
-rw-r--r--usr/src/uts/sparc/socksdp/Makefile88
-rw-r--r--usr/src/uts/sparc/tcp/Makefile17
-rw-r--r--usr/src/uts/sparc/udp/Makefile17
195 files changed, 28628 insertions, 16272 deletions
diff --git a/usr/src/cmd/cmd-inet/etc/sock2path b/usr/src/cmd/cmd-inet/etc/sock2path
index 425d6c8006..aba55bb652 100644
--- a/usr/src/cmd/cmd-inet/etc/sock2path
+++ b/usr/src/cmd/cmd-inet/etc/sock2path
@@ -1,9 +1,8 @@
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License"). You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
@@ -18,39 +17,37 @@
#
# CDDL HEADER END
#
-# Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
-#
# socket configuration information
#
-# Family Type Protocol Path
- 2 2 0 /dev/tcp
- 2 2 6 /dev/tcp
+# Family Type Protocol Dev|Module
+ 2 2 0 tcp
+ 2 2 6 tcp
- 26 2 0 /dev/tcp6
- 26 2 6 /dev/tcp6
+ 26 2 0 tcp
+ 26 2 6 tcp
- 2 1 0 /dev/udp
- 2 1 17 /dev/udp
+ 2 1 0 udp
+ 2 1 17 udp
- 26 1 0 /dev/udp6
- 26 1 17 /dev/udp6
+ 26 1 0 udp
+ 26 1 17 udp
1 2 0 /dev/ticotsord
1 6 0 /dev/ticotsord
1 1 0 /dev/ticlts
- 2 4 0 /dev/rawip
- 26 4 0 /dev/rawip6
+ 2 4 0 icmp
+ 26 4 0 icmp
- 2 2 132 /dev/sctp
- 26 2 132 /dev/sctp6
- 2 6 132 /dev/sctp
- 26 6 132 /dev/sctp6
+ 2 2 132 socksctp
+ 26 2 132 socksctp
+ 2 6 132 socksctp
+ 26 6 132 socksctp
- 24 4 0 /dev/rts
+ 24 4 0 rts
27 4 2 /dev/keysock
28 2 0 /dev/nca
diff --git a/usr/src/cmd/cmd-inet/usr.bin/netstat/unix.c b/usr/src/cmd/cmd-inet/usr.bin/netstat/unix.c
index 5e7afa8e3d..175310a9a6 100644
--- a/usr/src/cmd/cmd-inet/usr.bin/netstat/unix.c
+++ b/usr/src/cmd/cmd-inet/usr.bin/netstat/unix.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2001 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -37,8 +36,6 @@
* contributors.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* code for netstat's -k option
*
@@ -130,8 +127,8 @@ print_kn(kstat_t *ksp)
(void) printf("\nActive UNIX domain sockets\n");
(void) printf("%-8.8s %-10.10s %8.8s %8.8s "
- "Local Addr Remote Addr\n",
- "Address", "Type", "Vnode", "Conn");
+ "Local Addr Remote Addr\n",
+ "Address", "Type", "Vnode", "Conn");
/* for each sockinfo structure, display what we need: */
for (i = 0; i < ksp->ks_ndata; i++) {
@@ -164,13 +161,13 @@ print_kn(kstat_t *ksp)
if ((psi->si_state & SS_ISBOUND) &&
strlen(psi->si_laddr_sun_path) != 0 &&
psi->si_laddr_soa_len != 0) {
- if (psi->si_state & SS_FADDR_NOXLATE) {
+ if (psi->si_faddr_noxlate) {
(void) printf(" (socketpair) ");
} else {
if (psi->si_laddr_soa_len >
- sizeof (psi->si_laddr_family))
+ sizeof (psi->si_laddr_family))
(void) printf("%s ",
- psi->si_laddr_sun_path);
+ psi->si_laddr_sun_path);
else
(void) printf(" ");
}
@@ -182,13 +179,13 @@ print_kn(kstat_t *ksp)
strlen(psi->si_faddr_sun_path) != 0 &&
psi->si_faddr_soa_len != 0) {
- if (psi->si_state & SS_FADDR_NOXLATE) {
+ if (psi->si_faddr_noxlate) {
(void) printf(" (socketpair) ");
} else {
if (psi->si_faddr_soa_len >
- sizeof (psi->si_faddr_family))
+ sizeof (psi->si_faddr_family))
(void) printf("%s ",
- psi->si_faddr_sun_path);
+ psi->si_faddr_sun_path);
else
(void) printf(" ");
}
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/soconfig.c b/usr/src/cmd/cmd-inet/usr.sbin/soconfig.c
index 5d3838623f..b5c45f7b6f 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/soconfig.c
+++ b/usr/src/cmd/cmd-inet/usr.sbin/soconfig.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright (c) 1991-1996,2001 by Sun Microsystems, Inc.
- * All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <stdio.h>
#include <sys/stat.h>
#include <stdlib.h>
@@ -40,12 +37,12 @@
* Usage:
* sonconfig -f <file>
* Reads input from file. The file is structured as
- * <fam> <type> <protocol> <path>
+ * <fam> <type> <protocol> <path|module>
* <fam> <type> <protocol>
* with the first line registering and the second line
* deregistering.
*
- * soconfig <fam> <type> <protocol> <path>
+ * soconfig <fam> <type> <protocol> <path|module>
* registers
*
* soconfig <fam> <type> <protocol>
@@ -99,9 +96,9 @@ static void
usage(void)
{
fprintf(stderr, gettext(
- "Usage: soconfig -f <file>\n"
- "\tsoconfig <fam> <type> <protocol> <path>\n"
- "\tsoconfig <fam> <type> <protocol>\n"));
+ "Usage: soconfig -f <file>\n"
+ "\tsoconfig <fam> <type> <protocol> <path|module>\n"
+ "\tsoconfig <fam> <type> <protocol>\n"));
}
/*
@@ -131,7 +128,7 @@ parse_file(char *filename)
linecount++;
strcpy(pline, line);
argcount = split_line(pline, argvec,
- sizeof (argvec) / sizeof (argvec[0]));
+ sizeof (argvec) / sizeof (argvec[0]));
#ifdef DEBUG
{
int i;
@@ -147,18 +144,18 @@ parse_file(char *filename)
break;
case 3:
numerror += parse_params(argvec[0], argvec[1],
- argvec[2], NULL, linecount);
+ argvec[2], NULL, linecount);
break;
case 4:
numerror += parse_params(argvec[0], argvec[1],
- argvec[2], argvec[3], linecount);
+ argvec[2], argvec[3], linecount);
break;
default:
numerror++;
fprintf(stderr,
- gettext("Malformed line: <%s>\n"), line);
+ gettext("Malformed line: <%s>\n"), line);
fprintf(stderr,
- gettext("\ton line %d\n"), linecount);
+ gettext("\ton line %d\n"), linecount);
break;
}
}
@@ -223,7 +220,7 @@ parse_params(char *famstr, char *typestr, char *protostr, char *path, int line)
fprintf(stderr, gettext("Bad family number: %s\n"), famstr);
if (line != -1)
fprintf(stderr,
- gettext("\ton line %d\n"), line);
+ gettext("\ton line %d\n"), line);
else {
fprintf(stderr, "\n");
usage();
@@ -234,10 +231,10 @@ parse_params(char *famstr, char *typestr, char *protostr, char *path, int line)
type = parse_int(typestr);
if (type == -1) {
fprintf(stderr,
- gettext("Bad socket type number: %s\n"), typestr);
+ gettext("Bad socket type number: %s\n"), typestr);
if (line != -1)
fprintf(stderr,
- gettext("\ton line %d\n"), line);
+ gettext("\ton line %d\n"), line);
else {
fprintf(stderr, "\n");
usage();
@@ -248,10 +245,10 @@ parse_params(char *famstr, char *typestr, char *protostr, char *path, int line)
protocol = parse_int(protostr);
if (protocol == -1) {
fprintf(stderr,
- gettext("Bad protocol number: %s\n"), protostr);
+ gettext("Bad protocol number: %s\n"), protostr);
if (line != -1)
fprintf(stderr,
- gettext("\ton line %d\n"), line);
+ gettext("\ton line %d\n"), line);
else {
fprintf(stderr, "\n");
usage();
@@ -263,11 +260,12 @@ parse_params(char *famstr, char *typestr, char *protostr, char *path, int line)
if (path != NULL) {
struct stat stats;
- if (stat(path, &stats) == -1) {
+ if (strncmp(path, "/dev", strlen("/dev")) == 0 &&
+ stat(path, &stats) == -1) {
perror(path);
if (line != -1)
fprintf(stderr,
- gettext("\ton line %d\n"), line);
+ gettext("\ton line %d\n"), line);
else {
fprintf(stderr, "\n");
usage();
@@ -278,7 +276,7 @@ parse_params(char *famstr, char *typestr, char *protostr, char *path, int line)
#ifdef DEBUG
printf("not calling sockconfig(%d, %d, %d, %s)\n",
- fam, type, protocol, path == NULL ? "(null)" : path);
+ fam, type, protocol, path == NULL ? "(null)" : path);
#else
if (_sockconfig(fam, type, protocol, path) == -1) {
perror("sockconfig");
diff --git a/usr/src/cmd/mdb/Makefile.common b/usr/src/cmd/mdb/Makefile.common
index ed27426b8d..bb341fdc8f 100644
--- a/usr/src/cmd/mdb/Makefile.common
+++ b/usr/src/cmd/mdb/Makefile.common
@@ -87,6 +87,7 @@ COMMON_MODULES_KVM = \
sdbc \
smbfs \
smbsrv \
+ sockfs \
specfs \
sppp \
stmf \
diff --git a/usr/src/cmd/mdb/common/modules/genunix/net.c b/usr/src/cmd/mdb/common/modules/genunix/net.c
index c8785ed796..987e3b52a0 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/net.c
+++ b/usr/src/cmd/mdb/common/modules/genunix/net.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <mdb/mdb_modapi.h>
#include <mdb/mdb_ks.h>
#include <mdb/mdb_ctf.h>
@@ -50,6 +48,7 @@
#include <inet/arp_impl.h>
#include <inet/rawip_impl.h>
#include <inet/mi.h>
+#include <fs/sockfs/socktpi_impl.h>
#define ADDR_V6_WIDTH 23
#define ADDR_V4_WIDTH 15
@@ -248,7 +247,7 @@ sonode_walk_init(mdb_walk_state_t *wsp)
}
}
- wsp->walk_data = mdb_alloc(sizeof (struct sonode), UM_SLEEP);
+ wsp->walk_data = mdb_alloc(sizeof (struct sotpi_sonode), UM_SLEEP);
return (WALK_NEXT);
}
@@ -256,12 +255,12 @@ int
sonode_walk_step(mdb_walk_state_t *wsp)
{
int status;
- struct sonode *sonodep;
+ struct sotpi_sonode *stp;
if (wsp->walk_addr == NULL)
return (WALK_DONE);
- if (mdb_vread(wsp->walk_data, sizeof (struct sonode),
+ if (mdb_vread(wsp->walk_data, sizeof (struct sotpi_sonode),
wsp->walk_addr) == -1) {
mdb_warn("failed to read sonode at %p", wsp->walk_addr);
return (WALK_ERR);
@@ -270,16 +269,16 @@ sonode_walk_step(mdb_walk_state_t *wsp)
status = wsp->walk_callback(wsp->walk_addr, wsp->walk_data,
wsp->walk_cbdata);
- sonodep = wsp->walk_data;
+ stp = wsp->walk_data;
- wsp->walk_addr = (uintptr_t)sonodep->so_next;
+ wsp->walk_addr = (uintptr_t)stp->st_info.sti_next_so;
return (status);
}
void
sonode_walk_fini(mdb_walk_state_t *wsp)
{
- mdb_free(wsp->walk_data, sizeof (struct sonode));
+ mdb_free(wsp->walk_data, sizeof (struct sotpi_sonode));
}
struct mi_walk_data {
@@ -517,9 +516,9 @@ sonode(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
mdb_printf(" %4hi", so.so_type);
}
- mdb_printf(" %5hi %05x %04x %04hx %0?p\n",
+ mdb_printf(" %5hi %05x %04x %04hx\n",
so.so_protocol, so.so_state, so.so_mode,
- so.so_flag, so.so_accessvp);
+ so.so_flag);
return (DCMD_OK);
}
@@ -740,12 +739,13 @@ netstat_udpv6_cb(uintptr_t kaddr, const void *walk_data, void *cb_data)
* returns 0 on success, -1 otherwise
*/
static int
-netstat_unix_name_pr(const struct sonode *so, const struct soaddr *soa)
+netstat_unix_name_pr(const struct sotpi_sonode *st, const struct soaddr *soa)
{
+ const struct sonode *so = &st->st_sonode;
const char none[] = " (none)";
if ((so->so_state & SS_ISBOUND) && (soa->soa_len != 0)) {
- if (so->so_state & SS_FADDR_NOXLATE) {
+ if (st->st_info.sti_faddr_noxlate) {
mdb_printf("%-14s ", " (socketpair)");
} else {
if (soa->soa_len > sizeof (sa_family_t)) {
@@ -775,9 +775,11 @@ netstat_unix_name_pr(const struct sonode *so, const struct soaddr *soa)
static int
netstat_unix_cb(uintptr_t kaddr, const void *walk_data, void *cb_data)
{
- const struct sonode *so = walk_data;
+ const struct sotpi_sonode *st = walk_data;
+ const struct sonode *so = &st->st_sonode;
+ const struct sotpi_info *sti = &st->st_info;
- if (so->so_accessvp == NULL)
+ if (so->so_count == 0)
return (WALK_NEXT);
if (so->so_family != AF_UNIX) {
@@ -787,7 +789,7 @@ netstat_unix_cb(uintptr_t kaddr, const void *walk_data, void *cb_data)
mdb_printf("%-?p ", kaddr);
- switch (so->so_serv_type) {
+ switch (sti->sti_serv_type) {
case T_CLTS:
mdb_printf("%-10s ", "dgram");
break;
@@ -798,27 +800,27 @@ netstat_unix_cb(uintptr_t kaddr, const void *walk_data, void *cb_data)
mdb_printf("%-10s ", "stream-ord");
break;
default:
- mdb_printf("%-10i ", so->so_serv_type);
+ mdb_printf("%-10i ", sti->sti_serv_type);
}
if ((so->so_state & SS_ISBOUND) &&
- (so->so_ux_laddr.soua_magic == SOU_MAGIC_EXPLICIT)) {
- mdb_printf("%0?p ", so->so_ux_laddr.soua_vp);
+ (sti->sti_ux_laddr.soua_magic == SOU_MAGIC_EXPLICIT)) {
+ mdb_printf("%0?p ", sti->sti_ux_laddr.soua_vp);
} else {
mdb_printf("%0?p ", NULL);
}
if ((so->so_state & SS_ISCONNECTED) &&
- (so->so_ux_faddr.soua_magic == SOU_MAGIC_EXPLICIT)) {
- mdb_printf("%0?p ", so->so_ux_faddr.soua_vp);
+ (sti->sti_ux_faddr.soua_magic == SOU_MAGIC_EXPLICIT)) {
+ mdb_printf("%0?p ", sti->sti_ux_faddr.soua_vp);
} else {
mdb_printf("%0?p ", NULL);
}
- if (netstat_unix_name_pr(so, &so->so_laddr) == -1)
+ if (netstat_unix_name_pr(st, &sti->sti_laddr) == -1)
return (WALK_ERR);
- if (netstat_unix_name_pr(so, &so->so_faddr) == -1)
+ if (netstat_unix_name_pr(st, &sti->sti_faddr) == -1)
return (WALK_ERR);
mdb_printf("%4i\n", so->so_zoneid);
diff --git a/usr/src/cmd/mdb/common/modules/genunix/vfs.c b/usr/src/cmd/mdb/common/modules/genunix/vfs.c
index 5c5fc3361e..b12cdca0c9 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/vfs.c
+++ b/usr/src/cmd/mdb/common/modules/genunix/vfs.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <mdb/mdb_modapi.h>
#include <mdb/mdb_ks.h>
@@ -47,6 +45,11 @@
#include <sys/socketvar.h>
#include <sys/strsubr.h>
#include <sys/un.h>
+#include <fs/sockfs/socktpi_impl.h>
+#include <inet/ipclassifier.h>
+#include <inet/ip_if.h>
+#include <inet/sctp/sctp_impl.h>
+#include <inet/sctp/sctp_addr.h>
int
vfs_walk_init(mdb_walk_state_t *wsp)
@@ -173,7 +176,7 @@ read_fsname(uintptr_t vfsp, char *fsname)
#define FSINFO_MNTLEN 56
#endif
-/*ARGSUSED*/
+/* ARGSUSED */
int
fsinfo(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
{
@@ -387,14 +390,14 @@ pfiles_print_addr(struct sockaddr *addr)
switch (addr->sa_family) {
case AF_INET:
- /*LINTED: alignment*/
+ /* LINTED: alignment */
s_in = (struct sockaddr_in *)addr;
mdb_nhconvert(&port, &s_in->sin_port, sizeof (port));
mdb_printf("AF_INET %I %d ", s_in->sin_addr.s_addr, port);
break;
case AF_INET6:
- /*LINTED: alignment*/
+ /* LINTED: alignment */
s_in6 = (struct sockaddr_in6 *)addr;
mdb_nhconvert(&port, &s_in6->sin6_port, sizeof (port));
mdb_printf("AF_INET6 %N %d ", &(s_in6->sin6_addr), port);
@@ -410,31 +413,39 @@ pfiles_print_addr(struct sockaddr *addr)
}
}
-
static int
-pfiles_get_sonode(uintptr_t vp, struct sonode *sonode)
+pfiles_get_sonode(vnode_t *v_sock, struct sonode *sonode)
{
- vnode_t v;
- struct stdata stream;
-
- if (mdb_vread(&v, sizeof (v), vp) == -1) {
- mdb_warn("failed to read socket vnode");
+ if (mdb_vread(sonode, sizeof (struct sonode),
+ (uintptr_t)v_sock->v_data) == -1) {
+ mdb_warn("failed to read sonode");
return (-1);
}
- if (mdb_vread(&stream, sizeof (stream), (uintptr_t)v.v_stream) == -1) {
+ return (0);
+}
+
+static int
+pfiles_get_tpi_sonode(vnode_t *v_sock, sotpi_sonode_t *sotpi_sonode)
+{
+
+ struct stdata stream;
+
+ if (mdb_vread(&stream, sizeof (stream),
+ (uintptr_t)v_sock->v_stream) == -1) {
mdb_warn("failed to read stream data");
return (-1);
}
- if (mdb_vread(&v, sizeof (v), (uintptr_t)stream.sd_vnode) == -1) {
+ if (mdb_vread(v_sock, sizeof (vnode_t),
+ (uintptr_t)stream.sd_vnode) == -1) {
mdb_warn("failed to read stream vnode");
return (-1);
}
- if (mdb_vread(sonode, sizeof (struct sonode),
- (uintptr_t)v.v_data) == -1) {
- mdb_warn("failed to read sonode");
+ if (mdb_vread(sotpi_sonode, sizeof (sotpi_sonode_t),
+ (uintptr_t)v_sock->v_data) == -1) {
+ mdb_warn("failed to read sotpi_sonode");
return (-1);
}
@@ -470,16 +481,20 @@ pfiles_dig_pathname(uintptr_t vp, char *path)
/*
* For sockets, we won't find a path unless we print the path
- * associated with the accessvp.
+ * associated with transport's STREAM device.
*/
if (v.v_type == VSOCK) {
struct sonode sonode;
- if (pfiles_get_sonode(vp, &sonode) == -1) {
+ if (pfiles_get_sonode(&v, &sonode) == -1) {
return (-1);
}
-
- vp = (uintptr_t)sonode.so_accessvp;
+ if (!SOCK_IS_NONSTR(&sonode)) {
+ struct sockparams *sp = sonode.so_sockparams;
+ vp = (uintptr_t)sp->sp_sdev_info.sd_vnode;
+ } else {
+ vp = NULL;
+ }
}
}
@@ -531,6 +546,364 @@ struct pfiles_cbdata {
int fd;
};
+/*
+ * Convert between a list element and the list_node_t embedded in it, using
+ * the list's list_offset.  Only pointer arithmetic is performed on obj/node,
+ * so they may be kernel addresses; "a" itself must be readable locally.
+ * The arguments are parenthesized so that arbitrary expressions are safe.
+ */
+#define	list_d2l(a, obj)	\
+	((list_node_t *)(((char *)(obj)) + (a)->list_offset))
+#define	list_object(a, node)	\
+	((void *)(((char *)(node)) - (a)->list_offset))
+
+/*
+ * SCTP interface for getting the first source address of a sctp_t.
+ *
+ * sctp is a debugger-local copy of the kernel sctp_t.  Every pointer stored
+ * in it, and in the structures reached from it, is a kernel address, so each
+ * structure is copied in with mdb_vread() before its fields are examined.
+ *
+ * The source address lists in sctp_saddrs[] are scanned and the first usable
+ * address is stored in addr, which must be large enough for a sin_t
+ * (AF_INET) or a sin6_t (AF_INET6).  Addresses that are condemned, marked
+ * SCTP_DONT_SRC, or (when loopback addresses are being skipped) flagged
+ * PHYI_LOOPBACK on their sctp_ill_t are passed over.
+ *
+ * Returns 0 once an address has been stored, and -1 if no address was found
+ * or a read from the target failed.
+ */
+int
+sctp_getsockaddr(sctp_t *sctp, struct sockaddr *addr)
+{
+	int err = -1;
+	int i;
+	int l;
+	sctp_saddr_ipif_t *pobj;
+	sctp_saddr_ipif_t obj;
+	sin6_t *sin6;
+	sin_t *sin4;
+	int scanned = 0;
+	boolean_t skip_lback = B_FALSE;
+
+	addr->sa_family = sctp->sctp_family;
+	if (sctp->sctp_nsaddrs == 0)
+		goto done;
+
+	/*
+	 * Skip loopback addresses for non-loopback assoc.
+	 */
+	if (sctp->sctp_state >= SCTPS_ESTABLISHED && !sctp->sctp_loopback) {
+		skip_lback = B_TRUE;
+	}
+
+	for (i = 0; i < SCTP_IPIF_HASH; i++) {
+		if (sctp->sctp_saddrs[i].ipif_count == 0)
+			continue;
+
+		/* Kernel address of the first element on this hash chain. */
+		pobj = list_object(&sctp->sctp_saddrs[i].sctp_ipif_list,
+		    sctp->sctp_saddrs[i].sctp_ipif_list.list_head.list_next);
+		if (mdb_vread(&obj, sizeof (sctp_saddr_ipif_t),
+		    (uintptr_t)pobj) == -1) {
+			mdb_warn("failed to read sctp_saddr_ipif_t");
+			return (err);
+		}
+
+		for (l = 0; l < sctp->sctp_saddrs[i].ipif_count; l++) {
+			sctp_ipif_t ipif;
+			ip6_t ip6h;
+			in6_addr_t laddr;
+			list_node_t *pnode;
+			list_node_t node;
+			boolean_t skip = B_FALSE;
+
+			if (mdb_vread(&ipif, sizeof (sctp_ipif_t),
+			    (uintptr_t)obj.saddr_ipifp) == -1) {
+				mdb_warn("failed to read sctp_ipif_t");
+				return (err);
+			}
+			laddr = ipif.sctp_ipif_saddr;
+
+			scanned++;
+			if ((ipif.sctp_ipif_state == SCTP_IPIFS_CONDEMNED) ||
+			    SCTP_DONT_SRC(&obj)) {
+				skip = B_TRUE;
+			} else if (skip_lback) {
+				sctp_ill_t ill;
+
+				/*
+				 * sctp_ipif_ill is a kernel pointer; read the
+				 * sctp_ill_t before testing its flags.
+				 */
+				if (mdb_vread(&ill, sizeof (sctp_ill_t),
+				    (uintptr_t)ipif.sctp_ipif_ill) == -1) {
+					mdb_warn("failed to read sctp_ill_t");
+					return (err);
+				}
+				if (ill.sctp_ill_flags & PHYI_LOOPBACK)
+					skip = B_TRUE;
+			}
+
+			if (skip) {
+				if (scanned >= sctp->sctp_nsaddrs)
+					goto done;
+
+				/* Advance to the next element on the chain. */
+				/* LINTED: alignment */
+				pnode = list_d2l(&sctp->sctp_saddrs[i].
+				    sctp_ipif_list, pobj);
+				if (mdb_vread(&node, sizeof (list_node_t),
+				    (uintptr_t)pnode) == -1) {
+					mdb_warn("failed to read list_node_t");
+					return (err);
+				}
+				pobj = list_object(&sctp->sctp_saddrs[i].
+				    sctp_ipif_list, node.list_next);
+				if (mdb_vread(&obj, sizeof (sctp_saddr_ipif_t),
+				    (uintptr_t)pobj) == -1) {
+					mdb_warn("failed to read "
+					    "sctp_saddr_ipif_t");
+					return (err);
+				}
+				continue;
+			}
+
+			switch (sctp->sctp_family) {
+			case AF_INET:
+				/* LINTED: alignment */
+				sin4 = (sin_t *)addr;
+				if ((sctp->sctp_state <= SCTPS_LISTEN) &&
+				    sctp->sctp_bound_to_all) {
+					sin4->sin_addr.s_addr = INADDR_ANY;
+					sin4->sin_port = sctp->sctp_lport;
+				} else {
+					sin4->sin_family = AF_INET;
+					sin4->sin_port = sctp->sctp_lport;
+					IN6_V4MAPPED_TO_INADDR(&laddr,
+					    &sin4->sin_addr);
+				}
+				break;
+
+			case AF_INET6:
+				/* LINTED: alignment */
+				sin6 = (sin6_t *)addr;
+				if ((sctp->sctp_state <= SCTPS_LISTEN) &&
+				    sctp->sctp_bound_to_all) {
+					bzero(&sin6->sin6_addr,
+					    sizeof (sin6->sin6_addr));
+					sin6->sin6_port = sctp->sctp_lport;
+				} else {
+					sin6->sin6_family = AF_INET6;
+					sin6->sin6_port = sctp->sctp_lport;
+					sin6->sin6_addr = laddr;
+				}
+				/*
+				 * sctp_ip6h is a kernel pointer; read the
+				 * header template to obtain the flow info.
+				 */
+				if (mdb_vread(&ip6h, sizeof (ip6_t),
+				    (uintptr_t)sctp->sctp_ip6h) == -1) {
+					mdb_warn("failed to read ip6_t");
+					return (err);
+				}
+				sin6->sin6_flowinfo = ip6h.ip6_vcf &
+				    ~IPV6_VERS_AND_FLOW_MASK;
+				sin6->sin6_scope_id = 0;
+				sin6->__sin6_src_id = 0;
+				break;
+			}
+
+			/* Only the first usable address is reported. */
+			err = 0;
+			goto done;
+		}
+	}
+done:
+	return (err);
+}
+
+/*
+ * SCTP interface for getting the primary peer address of a sctp_t.
+ *
+ * Reads the sctp_faddr_t that sctp_primary points at in the target and
+ * stores its address, together with sctp_fport, in addr using the layout
+ * selected by sctp_family.  Returns -1 if the sctp_t has no peer address
+ * list or the read fails, 0 otherwise.
+ */
+static int
+sctp_getpeeraddr(sctp_t *sctp, struct sockaddr *addr)
+{
+	sctp_faddr_t primary;
+
+	if (sctp->sctp_faddrs == NULL)
+		return (-1);
+
+	addr->sa_family = sctp->sctp_family;
+	if (mdb_vread(&primary, sizeof (primary),
+	    (uintptr_t)sctp->sctp_primary) == -1) {
+		mdb_warn("failed to read sctp primary faddr");
+		return (-1);
+	}
+
+	if (sctp->sctp_family == AF_INET) {
+		/* LINTED: alignment */
+		struct sockaddr_in *v4 = (struct sockaddr_in *)addr;
+
+		IN6_V4MAPPED_TO_INADDR(&primary.faddr, &v4->sin_addr);
+		v4->sin_port = sctp->sctp_fport;
+		v4->sin_family = AF_INET;
+	} else if (sctp->sctp_family == AF_INET6) {
+		/* LINTED: alignment */
+		struct sockaddr_in6 *v6 = (struct sockaddr_in6 *)addr;
+
+		v6->sin6_addr = primary.faddr;
+		v6->sin6_port = sctp->sctp_fport;
+		v6->sin6_family = AF_INET6;
+		v6->sin6_flowinfo = 0;
+		v6->sin6_scope_id = 0;
+		v6->__sin6_src_id = 0;
+	}
+
+	return (0);
+}
+
+/*
+ * Print the local and remote addresses cached in a TPI socket's
+ * sotpi_info_t.  Each address is printed only when its sti_*_valid field
+ * is 1.  Returns 0 on success and -1 if an address cannot be read from the
+ * target.
+ *
+ * The scratch buffers are allocated with UM_GC so that the debugger
+ * reclaims them when the dcmd completes, including on the error paths.
+ */
+static int
+tpi_sock_print(sotpi_sonode_t *sotpi_sonode)
+{
+	sotpi_info_t *sti = &sotpi_sonode->st_info;
+
+	if (sti->sti_laddr_valid == 1) {
+		struct sockaddr *laddr =
+		    mdb_alloc(sti->sti_laddr_len, UM_SLEEP | UM_GC);
+		if (mdb_vread(laddr, sti->sti_laddr_len,
+		    (uintptr_t)sti->sti_laddr_sa) == -1) {
+			mdb_warn("failed to read sotpi_sonode socket addr");
+			return (-1);
+		}
+
+		mdb_printf("socket: ");
+		pfiles_print_addr(laddr);
+	}
+
+	if (sti->sti_faddr_valid == 1) {
+		struct sockaddr *faddr =
+		    mdb_alloc(sti->sti_faddr_len, UM_SLEEP | UM_GC);
+		if (mdb_vread(faddr, sti->sti_faddr_len,
+		    (uintptr_t)sti->sti_faddr_sa) == -1) {
+			mdb_warn("failed to read sotpi_sonode remote addr");
+			return (-1);
+		}
+
+		mdb_printf("remote: ");
+		pfiles_print_addr(faddr);
+	}
+
+	return (0);
+}
+
+/*
+ * Print the addresses of a socket whose so_proto_handle refers to a conn_t.
+ *
+ * For AF_INET and AF_INET6 the conn_t is read from the target and its local
+ * address and port are printed.  The remote address and port follow only
+ * for a TCP conn that IPCL_IS_BOUND() reports as not bound, or a UDP conn
+ * that IPCL_IS_CONNECTED() reports as connected.  Any other family is
+ * reported as "AF_??".  Returns -1 if the conn_t cannot be read, else 0.
+ */
+static int
+tcpip_sock_print(struct sonode *socknode)
+{
+	conn_t conn;
+	in_port_t port;
+	boolean_t v6;
+
+	if (socknode->so_family == AF_INET) {
+		v6 = B_FALSE;
+	} else if (socknode->so_family == AF_INET6) {
+		v6 = B_TRUE;
+	} else {
+		mdb_printf("AF_?? (%d)", socknode->so_family);
+		return (0);
+	}
+
+	if (mdb_vread(&conn, sizeof (conn),
+	    (uintptr_t)socknode->so_proto_handle) == -1) {
+		if (v6)
+			mdb_warn("failed to read conn_t V6");
+		else
+			mdb_warn("failed to read conn_t V4");
+		return (-1);
+	}
+
+	mdb_printf("socket: ");
+	mdb_nhconvert(&port, &conn.conn_lport, sizeof (port));
+	if (v6)
+		mdb_printf("AF_INET6 %N %d ", &conn.conn_srcv6, port);
+	else
+		mdb_printf("AF_INET %I %d ", conn.conn_src, port);
+
+	/*
+	 * If this is a listening socket, we don't print
+	 * the remote address.
+	 */
+	if ((IPCL_IS_TCP(&conn) && IPCL_IS_BOUND(&conn) == 0) ||
+	    (IPCL_IS_UDP(&conn) && IPCL_IS_CONNECTED(&conn))) {
+		mdb_printf("remote: ");
+		mdb_nhconvert(&port, &conn.conn_fport, sizeof (port));
+		if (v6)
+			mdb_printf("AF_INET6 %N %d ", &conn.conn_remv6, port);
+		else
+			mdb_printf("AF_INET %I %d ", conn.conn_rem, port);
+	}
+
+	return (0);
+}
+
+/*
+ * Print the first local address and the primary peer address of an SCTP
+ * socket.  Returns -1 if the sctp_t cannot be read from the target, else 0;
+ * an address that cannot be determined is silently omitted.
+ *
+ * The address buffers are struct sockaddr_in6 because sctp_getsockaddr()
+ * and sctp_getpeeraddr() store a sockaddr_in6 for AF_INET6 sockets, which
+ * does not fit in a plain struct sockaddr.  Keeping them on the stack also
+ * means nothing has to be freed on any return path.
+ */
+static int
+sctp_sock_print(struct sonode *socknode)
+{
+	sctp_t sctp;
+	struct sockaddr_in6 laddr;
+	struct sockaddr_in6 faddr;
+
+	if (mdb_vread(&sctp, sizeof (sctp),
+	    (uintptr_t)socknode->so_proto_handle) == -1) {
+		mdb_warn("failed to read sctp_t");
+		return (-1);
+	}
+
+	bzero(&laddr, sizeof (laddr));
+	bzero(&faddr, sizeof (faddr));
+
+	/* Labels carry a trailing space, matching the other printers. */
+	if (sctp_getsockaddr(&sctp, (struct sockaddr *)&laddr) == 0) {
+		mdb_printf("socket: ");
+		pfiles_print_addr((struct sockaddr *)&laddr);
+	}
+	if (sctp_getpeeraddr(&sctp, (struct sockaddr *)&faddr) == 0) {
+		mdb_printf("remote: ");
+		pfiles_print_addr((struct sockaddr *)&faddr);
+	}
+
+	return (0);
+}
+
+/*
+ * Address printer used for the SDP entries of sock_prints[].  It prints
+ * nothing and always reports success.
+ */
+/* ARGSUSED */
+static int
+sdp_sock_print(struct sonode *socknode)
+{
+ return (0);
+}
+
+/*
+ * Table mapping a (family, type, pro) triple to the routine that prints a
+ * socket's addresses.  pfile_callback() compares the three numbers with the
+ * so_family, so_type and so_protocol of a socket that has no stream.
+ * The values are raw numbers; the comment on each row names the transport
+ * device the row stands for.
+ */
+struct sock_print {
+ int family;
+ int type;
+ int pro;
+ int (*print)(struct sonode *socknode);
+} sock_prints[] = {
+ { 2, 2, 0, tcpip_sock_print }, /* /dev/tcp */
+ { 2, 2, 6, tcpip_sock_print }, /* /dev/tcp */
+ { 26, 2, 0, tcpip_sock_print }, /* /dev/tcp6 */
+ { 26, 2, 6, tcpip_sock_print }, /* /dev/tcp6 */
+ { 2, 1, 0, tcpip_sock_print }, /* /dev/udp */
+ { 2, 1, 17, tcpip_sock_print }, /* /dev/udp */
+ { 26, 1, 0, tcpip_sock_print }, /* /dev/udp6 */
+ { 26, 1, 17, tcpip_sock_print }, /* /dev/udp6 */
+ { 2, 4, 0, tcpip_sock_print }, /* /dev/rawip */
+ { 26, 4, 0, tcpip_sock_print }, /* /dev/rawip6 */
+ { 2, 2, 132, sctp_sock_print }, /* /dev/sctp */
+ { 26, 2, 132, sctp_sock_print }, /* /dev/sctp6 */
+ { 2, 6, 132, sctp_sock_print }, /* /dev/sctp */
+ { 26, 6, 132, sctp_sock_print }, /* /dev/sctp6 */
+ { 24, 4, 0, tcpip_sock_print }, /* /dev/rts */
+ { 2, 2, 257, sdp_sock_print }, /* /dev/sdp */
+ { 26, 2, 257, sdp_sock_print }, /* /dev/sdp */
+};
+
+/* Number of entries in sock_prints[]. */
+#define NUM_SOCK_PRINTS \
+ (sizeof (sock_prints) / sizeof (struct sock_print))
+
static int
pfile_callback(uintptr_t addr, const struct file *f, struct pfiles_cbdata *cb)
{
@@ -624,40 +997,62 @@ pfile_callback(uintptr_t addr, const struct file *f, struct pfiles_cbdata *cb)
case VSOCK:
{
- struct sonode sonode;
+ vnode_t v_sock;
+ struct sonode so;
- if (pfiles_get_sonode(realvpp, &sonode) == -1)
+ if (mdb_vread(&v_sock, sizeof (v_sock), realvpp) == -1) {
+ mdb_warn("failed to read socket vnode");
return (DCMD_ERR);
+ }
/*
- * If the address is cached in the sonode, use it; otherwise,
- * we print nothing.
+ * Sockets can be non-stream or stream; they have to be dealt
+ * with differently.
*/
- if (sonode.so_state & SS_LADDR_VALID) {
- struct sockaddr *laddr =
- mdb_alloc(sonode.so_laddr_len, UM_SLEEP);
- if (mdb_vread(laddr, sonode.so_laddr_len,
- (uintptr_t)sonode.so_laddr_sa) == -1) {
- mdb_warn("failed to read sonode socket addr");
+ if (v_sock.v_stream == NULL) {
+ if (pfiles_get_sonode(&v_sock, &so) == -1)
return (DCMD_ERR);
- }
- mdb_printf("socket: ");
- pfiles_print_addr(laddr);
- }
+			/*
+			 * Pick the proper methods.  sock_prints[] has exactly
+			 * NUM_SOCK_PRINTS entries, so the index must stay
+			 * strictly below that count.  A SOCK_RAW socket
+			 * matches on family and type alone.
+			 */
+			for (i = 0; i < NUM_SOCK_PRINTS; i++) {
+				if ((sock_prints[i].family == so.so_family &&
+				    sock_prints[i].type == so.so_type &&
+				    sock_prints[i].pro == so.so_protocol) ||
+				    (sock_prints[i].family == so.so_family &&
+				    sock_prints[i].type == so.so_type &&
+				    so.so_type == SOCK_RAW)) {
+					if ((*sock_prints[i].print)(&so) == -1)
+						return (DCMD_ERR);
+				}
+			}
+ } else {
+ sotpi_sonode_t sotpi_sonode;
- if (sonode.so_state & SS_FADDR_VALID) {
- struct sockaddr *faddr =
- mdb_alloc(sonode.so_faddr_len, UM_SLEEP);
- if (mdb_vread(faddr, sonode.so_faddr_len,
- (uintptr_t)sonode.so_faddr_sa) == -1) {
- mdb_warn("failed to read sonode remote addr");
+ if (pfiles_get_sonode(&v_sock, &so) == -1)
return (DCMD_ERR);
+
+ /*
+ * If the socket is a fallback socket, read its related
+ * information separately; otherwise, read it as a whole
+ * tpi socket.
+ */
+ if (so.so_state & SS_FALLBACK_COMP) {
+ sotpi_sonode.st_sonode = so;
+
+ if (mdb_vread(&(sotpi_sonode.st_info),
+ sizeof (sotpi_info_t),
+ (uintptr_t)so.so_priv) == -1)
+ return (DCMD_ERR);
+ } else {
+ if (pfiles_get_tpi_sonode(&v_sock,
+ &sotpi_sonode) == -1)
+ return (DCMD_ERR);
}
- mdb_printf("remote: ");
- pfiles_print_addr(faddr);
+ if (tpi_sock_print(&sotpi_sonode) == -1)
+ return (DCMD_ERR);
}
+
break;
}
@@ -691,7 +1086,6 @@ pfile_callback(uintptr_t addr, const struct file *f, struct pfiles_cbdata *cb)
break;
}
-
mdb_printf("\n");
return (WALK_NEXT);
diff --git a/usr/src/cmd/mdb/common/modules/sockfs/sockfs.c b/usr/src/cmd/mdb/common/modules/sockfs/sockfs.c
new file mode 100644
index 0000000000..33b8d20f8a
--- /dev/null
+++ b/usr/src/cmd/mdb/common/modules/sockfs/sockfs.c
@@ -0,0 +1,154 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <sys/stropts.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+
+#include <mdb/mdb_modapi.h>
+#include <mdb/mdb_ks.h>
+
+/*
+ * Resolve symname, the name of a sockparams list, and apply the
+ * sockfs`sockparams dcmd to every element via the "list" walker.
+ * Returns B_TRUE on success; on failure a warning is printed and
+ * B_FALSE is returned.
+ */
+static boolean_t
+sockparams_walk_list(const char *symname, int argc, const mdb_arg_t *argv)
+{
+	GElf_Sym list_sym;
+
+	if (mdb_lookup_by_name(symname, &list_sym) != 0) {
+		mdb_warn("can't find symbol %s", symname);
+		return (B_FALSE);
+	}
+
+	if (mdb_pwalk_dcmd("list", "sockfs`sockparams", argc, argv,
+	    list_sym.st_value) == 0)
+		return (B_TRUE);
+
+	mdb_warn("can't walk %s", symname);
+	return (B_FALSE);
+}
+
+/*
+ * dcmd to print sockparams info.
+ *
+ * If no address is given then the default is to print all sockparams on the
+ * global list (i.e., installed with soconfig(1M)). To also print the
+ * ephemeral entries the '-e' flag should be used. Only ephemeral entries can
+ * be printed by specifying the '-E' flag.
+ *
+ * With an address, one line summarizing that sockparams is printed (or just
+ * the address when the output is being piped).  Returns DCMD_OK, DCMD_USAGE
+ * for unrecognized arguments, or DCMD_ERR if the target cannot be read.
+ */
+static int
+sockparams_prt(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+	struct sockparams sp;
+	char devbuf[1024];
+	char modbuf[256];
+	const char *strdev = "-";
+	const char *sockmod = "";
+
+	if ((flags & DCMD_ADDRSPEC) == 0) {
+		uint_t opt_e = 0;
+		uint_t opt_E = 0;
+
+		/*
+		 * Determine what lists should be printed.  The option list
+		 * handed to mdb_getopts() must be terminated with NULL.
+		 */
+		if (mdb_getopts(argc, argv,
+		    'e', MDB_OPT_SETBITS, 1, &opt_e,
+		    'E', MDB_OPT_SETBITS, 1, &opt_E, NULL) != argc)
+			return (DCMD_USAGE);
+
+		if (!opt_E) {
+			if (!sockparams_walk_list("sphead", argc, argv))
+				return (DCMD_ERR);
+		}
+
+		if (opt_e || opt_E) {
+			if (!sockparams_walk_list("sp_ephem_list", argc, argv))
+				return (DCMD_ERR);
+		}
+
+		return (DCMD_OK);
+	}
+
+	/*
+	 * If we are piping the output, then just print out the address,
+	 * otherwise summarize the sockparams info.
+	 */
+	if ((flags & DCMD_PIPE_OUT) != 0) {
+		mdb_printf("%#lr\n", addr);
+		return (DCMD_OK);
+	}
+
+	if (DCMD_HDRSPEC(flags)) {
+		mdb_printf("%-?s %3s %3s %3s %15s %15s %6s %6s\n",
+		    "ADDR", "FAM", "TYP", "PRO", "STRDEV", "SOCKMOD", "REFS",
+		    "FLGS");
+	}
+
+	if (mdb_vread(&sp, sizeof (sp), addr) == -1) {
+		mdb_warn("failed to read sockparams at %0?p", addr);
+		return (DCMD_ERR);
+	}
+
+	/*
+	 * The name strings live in the target, so they have to be copied in
+	 * with mdb_readstr() rather than handed to mdb_printf() as local
+	 * pointers.  A string that is absent or unreadable is shown using
+	 * the placeholder set up above.
+	 * NOTE(review): assumes sp_smod_name is a char * like sd_devpath --
+	 * confirm against struct sockparams.
+	 */
+	if (sp.sp_sdev_info.sd_devpath != NULL &&
+	    mdb_readstr(devbuf, sizeof (devbuf),
+	    (uintptr_t)sp.sp_sdev_info.sd_devpath) != -1)
+		strdev = devbuf;
+
+	if (sp.sp_smod_name != NULL &&
+	    mdb_readstr(modbuf, sizeof (modbuf),
+	    (uintptr_t)sp.sp_smod_name) != -1)
+		sockmod = modbuf;
+
+	mdb_printf("%0?p %3u %3u %3u %15s %15s %6u %#6x\n",
+	    addr,
+	    sp.sp_family, sp.sp_type, sp.sp_protocol,
+	    strdev, sockmod, sp.sp_refcnt,
+	    sp.sp_flags);
+
+	return (DCMD_OK);
+}
+
+/*
+ * Help function: prints the description and option summary for the
+ * sockparams dcmd.
+ */
+void
+sockparams_help(void)
+{
+	mdb_printf("Print sockparams information for a given sockparams ptr.\n"
+	    "Without the address, list available sockparams. Default "
+	    "behavior is to list only entries that were installed by the "
+	    "admin (via soconfig(1M)).\n\n"
+	    "Options:\n"
+	    "	-e:\t\tlist ephemeral sockparams\n"
+	    "	-E:\t\tonly list ephemeral sockparams\n");
+}
+
+/*
+ * dcmds provided by this module: "sockparams", implemented by
+ * sockparams_prt() with sockparams_help() as its help routine.  The table
+ * ends with an entry whose name is NULL.
+ */
+static const mdb_dcmd_t dcmds[] = {
+ { "sockparams", "[-eE]", "print sockparams", sockparams_prt,
+ sockparams_help },
+ { NULL }
+};
+
+/*
+ * Module description returned by _mdb_init().  The last member is NULL
+ * (presumably the walker table -- confirm against mdb_modapi.h).
+ */
+static const mdb_modinfo_t modinfo = { MDB_API_VERSION, dcmds, NULL };
+
+/*
+ * Return the address of this module's mdb_modinfo_t.
+ * NOTE(review): presumably invoked by the debugger when the module is
+ * loaded -- confirm against the mdb module API.
+ */
+const mdb_modinfo_t *
+_mdb_init(void)
+{
+ return (&modinfo);
+}
diff --git a/usr/src/cmd/mdb/intel/amd64/sockfs/Makefile b/usr/src/cmd/mdb/intel/amd64/sockfs/Makefile
new file mode 100644
index 0000000000..9808e469f6
--- /dev/null
+++ b/usr/src/cmd/mdb/intel/amd64/sockfs/Makefile
@@ -0,0 +1,33 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+
+# Settings for the amd64 build of the sockfs debugger module; the build
+# rules themselves come from the makefiles included below.
+MODULE = sockfs.so
+MDBTGT = kvm
+
+MODSRCS = sockfs.c
+
+# NOTE(review): the include order presumably matters (later files may rely
+# on settings from earlier ones) -- confirm before reordering.
+include ../../../../Makefile.cmd
+include ../../../../Makefile.cmd.64
+include ../../Makefile.amd64
+include ../../../Makefile.module
diff --git a/usr/src/cmd/mdb/intel/ia32/sockfs/Makefile b/usr/src/cmd/mdb/intel/ia32/sockfs/Makefile
new file mode 100644
index 0000000000..9b14d2fd04
--- /dev/null
+++ b/usr/src/cmd/mdb/intel/ia32/sockfs/Makefile
@@ -0,0 +1,32 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+
+# Settings for the ia32 build of the sockfs debugger module; the build
+# rules themselves come from the makefiles included below.
+MODULE = sockfs.so
+MDBTGT = kvm
+
+MODSRCS = sockfs.c
+
+# NOTE(review): the include order presumably matters (later files may rely
+# on settings from earlier ones) -- confirm before reordering.
+include ../../../../Makefile.cmd
+include ../../Makefile.ia32
+include ../../../Makefile.module
diff --git a/usr/src/cmd/mdb/sparc/v9/sockfs/Makefile b/usr/src/cmd/mdb/sparc/v9/sockfs/Makefile
new file mode 100644
index 0000000000..9e65a6282b
--- /dev/null
+++ b/usr/src/cmd/mdb/sparc/v9/sockfs/Makefile
@@ -0,0 +1,33 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+
+# Settings for the sparcv9 build of the sockfs debugger module; the build
+# rules themselves come from the makefiles included below.
+MODULE = sockfs.so
+MDBTGT = kvm
+
+MODSRCS = sockfs.c
+
+# NOTE(review): the include order presumably matters (later files may rely
+# on settings from earlier ones) -- confirm before reordering.
+include ../../../../Makefile.cmd
+include ../../../../Makefile.cmd.64
+include ../../Makefile.sparcv9
+include ../../../Makefile.module
diff --git a/usr/src/pkgdefs/SUNWckr/prototype_com b/usr/src/pkgdefs/SUNWckr/prototype_com
index 1988298dfe..ead3a7e5e8 100644
--- a/usr/src/pkgdefs/SUNWckr/prototype_com
+++ b/usr/src/pkgdefs/SUNWckr/prototype_com
@@ -134,6 +134,7 @@ d none kernel/misc 755 root sys
d none kernel/sched 755 root sys
d none kernel/strmod 755 root sys
d none kernel/sys 755 root sys
+d none kernel/socketmod 755 root sys
d none lib 755 root bin
d none lib/svc 0755 root bin
d none lib/svc/method 0755 root bin
diff --git a/usr/src/pkgdefs/SUNWckr/prototype_i386 b/usr/src/pkgdefs/SUNWckr/prototype_i386
index 57be328034..adc41583bb 100644
--- a/usr/src/pkgdefs/SUNWckr/prototype_i386
+++ b/usr/src/pkgdefs/SUNWckr/prototype_i386
@@ -212,6 +212,7 @@ f none kernel/misc/ipc 755 root sys
f none kernel/misc/kbtrans 755 root sys
f none kernel/misc/kcf 755 root sys
f none kernel/misc/kmdbmod 755 root sys
+f none kernel/misc/ksocket 755 root sys
f none kernel/misc/mac 755 root sys
l none kernel/misc/md5=../../kernel/crypto/md5
f none kernel/misc/net80211 755 root sys
@@ -427,6 +428,7 @@ f none kernel/misc/amd64/ipc 755 root sys
f none kernel/misc/amd64/kbtrans 755 root sys
f none kernel/misc/amd64/kcf 755 root sys
f none kernel/misc/amd64/kmdbmod 755 root sys
+f none kernel/misc/amd64/ksocket 755 root sys
f none kernel/misc/amd64/mac 755 root sys
l none kernel/misc/amd64/md5=../../../kernel/crypto/amd64/md5
f none kernel/misc/amd64/net80211 755 root sys
@@ -497,3 +499,14 @@ f none kernel/kiconv/amd64/kiconv_ja 755 root sys
f none kernel/kiconv/amd64/kiconv_ko 755 root sys
f none kernel/kiconv/amd64/kiconv_sc 755 root sys
f none kernel/kiconv/amd64/kiconv_tc 755 root sys
+l none kernel/socketmod/icmp=../../kernel/drv/icmp
+l none kernel/socketmod/rts=../../kernel/drv/rts
+l none kernel/socketmod/tcp=../../kernel/drv/tcp
+l none kernel/socketmod/udp=../../kernel/drv/udp
+f none kernel/socketmod/socksctp 755 root sys
+d none kernel/socketmod/amd64 755 root sys
+l none kernel/socketmod/amd64/icmp=../../../kernel/drv/amd64/icmp
+l none kernel/socketmod/amd64/rts=../../../kernel/drv/amd64/rts
+l none kernel/socketmod/amd64/tcp=../../../kernel/drv/amd64/tcp
+l none kernel/socketmod/amd64/udp=../../../kernel/drv/amd64/udp
+f none kernel/socketmod/amd64/socksctp 755 root sys
diff --git a/usr/src/pkgdefs/SUNWckr/prototype_sparc b/usr/src/pkgdefs/SUNWckr/prototype_sparc
index daccee4e10..e81a86168e 100644
--- a/usr/src/pkgdefs/SUNWckr/prototype_sparc
+++ b/usr/src/pkgdefs/SUNWckr/prototype_sparc
@@ -199,6 +199,7 @@ f none kernel/misc/sparcv9/idmap 755 root sys
f none kernel/misc/sparcv9/ipc 755 root sys
f none kernel/misc/sparcv9/kbtrans 755 root sys
f none kernel/misc/sparcv9/kcf 755 root sys
+f none kernel/misc/sparcv9/ksocket 755 root sys
f none kernel/misc/sparcv9/mac 755 root sys
l none kernel/misc/sparcv9/md5=../../../kernel/crypto/sparcv9/md5
f none kernel/misc/sparcv9/neti 755 root sys
@@ -267,3 +268,9 @@ f none kernel/kiconv/sparcv9/kiconv_ja 755 root sys
f none kernel/kiconv/sparcv9/kiconv_ko 755 root sys
f none kernel/kiconv/sparcv9/kiconv_sc 755 root sys
f none kernel/kiconv/sparcv9/kiconv_tc 755 root sys
+d none kernel/socketmod/sparcv9 755 root sys
+l none kernel/socketmod/sparcv9/icmp=../../../kernel/drv/sparcv9/icmp
+l none kernel/socketmod/sparcv9/rts=../../../kernel/drv/sparcv9/rts
+l none kernel/socketmod/sparcv9/tcp=../../../kernel/drv/sparcv9/tcp
+l none kernel/socketmod/sparcv9/udp=../../../kernel/drv/sparcv9/udp
+f none kernel/socketmod/sparcv9/socksctp 755 root sys
diff --git a/usr/src/pkgdefs/SUNWhea/prototype_com b/usr/src/pkgdefs/SUNWhea/prototype_com
index c5d0e03053..df95ddfabe 100644
--- a/usr/src/pkgdefs/SUNWhea/prototype_com
+++ b/usr/src/pkgdefs/SUNWhea/prototype_com
@@ -971,6 +971,7 @@ f none usr/include/sys/kmem.h 644 root bin
f none usr/include/sys/kmem_impl.h 644 root bin
f none usr/include/sys/kobj.h 644 root bin
f none usr/include/sys/kobj_impl.h 644 root bin
+f none usr/include/sys/ksocket.h 644 root bin
f none usr/include/sys/kstat.h 644 root bin
f none usr/include/sys/kstr.h 644 root bin
f none usr/include/sys/ksyms.h 644 root bin
@@ -1225,6 +1226,7 @@ f none usr/include/sys/socket.h 644 root bin
f none usr/include/sys/socket_impl.h 644 root bin
f none usr/include/sys/socketvar.h 644 root bin
f none usr/include/sys/sockio.h 644 root bin
+f none usr/include/sys/socket_proto.h 644 root bin
f none usr/include/sys/sodirect.h 644 root bin
f none usr/include/sys/sservice.h 644 root bin
f none usr/include/sys/squeue.h 644 root bin
diff --git a/usr/src/pkgdefs/SUNWibsdp/postinstall b/usr/src/pkgdefs/SUNWibsdp/postinstall
index e320b55507..01b5720227 100644
--- a/usr/src/pkgdefs/SUNWibsdp/postinstall
+++ b/usr/src/pkgdefs/SUNWibsdp/postinstall
@@ -19,18 +19,15 @@
#
# CDDL HEADER END
#
-#
-# ident "%Z%%M% %I% %E% SMI"
-#
-# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
PATH="/usr/bin:/usr/sbin:${PATH}"
export PATH
-SDP4_SOCK_ENTRY=" 2 2 257 /dev/sdp"
-SDP6_SOCK_ENTRY=" 26 2 257 /dev/sdp"
+SDP4_SOCK_ENTRY=" 2 2 257 socksdp"
+SDP6_SOCK_ENTRY=" 26 2 257 socksdp"
if [ "${BASEDIR:=/}" != "/" ]
then
diff --git a/usr/src/pkgdefs/SUNWibsdp/preremove b/usr/src/pkgdefs/SUNWibsdp/preremove
index d0f143d2cf..bf6b2d72ad 100644
--- a/usr/src/pkgdefs/SUNWibsdp/preremove
+++ b/usr/src/pkgdefs/SUNWibsdp/preremove
@@ -19,18 +19,15 @@
#
# CDDL HEADER END
#
-#
-# ident "%Z%%M% %I% %E% SMI"
-#
-# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
PATH="/usr/bin:/usr/sbin:${PATH}"
export PATH
-SDP4_SOCK_ENTRY=" 2 2 257 /dev/sdp"
-SDP6_SOCK_ENTRY=" 26 2 257 /dev/sdp"
+SDP4_SOCK_ENTRY=" 2 2 257 socksdp"
+SDP6_SOCK_ENTRY=" 26 2 257 socksdp"
EXIT=0
diff --git a/usr/src/pkgdefs/SUNWibsdp/prototype_i386 b/usr/src/pkgdefs/SUNWibsdp/prototype_i386
index 2c01d15098..f1a1db9a48 100644
--- a/usr/src/pkgdefs/SUNWibsdp/prototype_i386
+++ b/usr/src/pkgdefs/SUNWibsdp/prototype_i386
@@ -19,11 +19,9 @@
# CDDL HEADER END
#
#
-# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
-#
# This required package information file contains a list of package contents.
# The 'pkgmk' command uses this file to identify the contents of a package
# and their location on the development machine when building the package.
@@ -47,3 +45,7 @@
f none kernel/drv/sdp 0755 root sys
d none kernel/drv/amd64 0755 root sys
f none kernel/drv/amd64/sdp 0755 root sys
+d none kernel/socketmod 755 root sys
+f none kernel/socketmod/socksdp 755 root sys
+d none kernel/socketmod/amd64 755 root sys
+f none kernel/socketmod/amd64/socksdp 755 root sys
diff --git a/usr/src/pkgdefs/SUNWibsdp/prototype_sparc b/usr/src/pkgdefs/SUNWibsdp/prototype_sparc
index 891011aba8..37fa95f27d 100644
--- a/usr/src/pkgdefs/SUNWibsdp/prototype_sparc
+++ b/usr/src/pkgdefs/SUNWibsdp/prototype_sparc
@@ -19,11 +19,9 @@
# CDDL HEADER END
#
#
-# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
-#
# This required package information file contains a list of package contents.
# The 'pkgmk' command uses this file to identify the contents of a package
# and their location on the development machine when building the package.
@@ -49,3 +47,6 @@
#
d none kernel/drv/sparcv9 0755 root sys
f none kernel/drv/sparcv9/sdp 0755 root sys
+d none kernel/socketmod 755 root sys
+d none kernel/socketmod/sparcv9 755 root sys
+f none kernel/socketmod/sparcv9/socksdp 755 root sys
diff --git a/usr/src/pkgdefs/SUNWmdb/prototype_i386 b/usr/src/pkgdefs/SUNWmdb/prototype_i386
index 05c255e659..fb1a898f13 100644
--- a/usr/src/pkgdefs/SUNWmdb/prototype_i386
+++ b/usr/src/pkgdefs/SUNWmdb/prototype_i386
@@ -89,6 +89,7 @@ f none usr/lib/mdb/kvm/amd64/sppp.so 555 root sys
f none usr/lib/mdb/kvm/amd64/ufs.so 555 root sys
f none usr/lib/mdb/kvm/amd64/uhci.so 555 root sys
f none usr/lib/mdb/kvm/amd64/usba.so 555 root sys
+f none usr/lib/mdb/kvm/amd64/sockfs.so 555 root sys
f none usr/lib/mdb/kvm/arp.so 555 root sys
f none usr/lib/mdb/kvm/audiosup.so 555 root sys
f none usr/lib/mdb/kvm/cpc.so 555 root sys
@@ -117,6 +118,7 @@ f none usr/lib/mdb/kvm/s1394.so 555 root sys
f none usr/lib/mdb/kvm/scsi_vhci.so 555 root sys
f none usr/lib/mdb/kvm/sctp.so 555 root sys
f none usr/lib/mdb/kvm/sd.so 555 root sys
+f none usr/lib/mdb/kvm/sockfs.so 555 root sys
f none usr/lib/mdb/kvm/specfs.so 555 root sys
f none usr/lib/mdb/kvm/sppp.so 555 root sys
f none usr/lib/mdb/kvm/ufs.so 555 root sys
diff --git a/usr/src/pkgdefs/SUNWmdb/prototype_sparc b/usr/src/pkgdefs/SUNWmdb/prototype_sparc
index 51f5c49182..eae343b703 100644
--- a/usr/src/pkgdefs/SUNWmdb/prototype_sparc
+++ b/usr/src/pkgdefs/SUNWmdb/prototype_sparc
@@ -64,6 +64,7 @@ f none usr/lib/mdb/kvm/sparcv9/ptm.so 555 root sys
s none usr/lib/mdb/kvm/sparcv9/px.so=intr.so
f none usr/lib/mdb/kvm/sparcv9/random.so 555 root sys
f none usr/lib/mdb/kvm/sparcv9/sctp.so 555 root sys
+f none usr/lib/mdb/kvm/sparcv9/sockfs.so 555 root sys
f none usr/lib/mdb/kvm/sparcv9/s1394.so 555 root sys
f none usr/lib/mdb/kvm/sparcv9/scsi_vhci.so 555 root sys
f none usr/lib/mdb/kvm/sparcv9/specfs.so 555 root sys
diff --git a/usr/src/pkgdefs/SUNWmdbr/prototype_i386 b/usr/src/pkgdefs/SUNWmdbr/prototype_i386
index 237c1da83b..662f4cb1e3 100644
--- a/usr/src/pkgdefs/SUNWmdbr/prototype_i386
+++ b/usr/src/pkgdefs/SUNWmdbr/prototype_i386
@@ -33,9 +33,8 @@ f none kernel/kmdb/amd64/cpu.generic 555 root sys
f none kernel/kmdb/amd64/cpu_ms.AuthenticAMD.15 555 root sys
f none kernel/kmdb/amd64/crypto 555 root sys
f none kernel/kmdb/amd64/genunix 555 root sys
-f none kernel/kmdb/amd64/ip 555 root sys
f none kernel/kmdb/amd64/hook 555 root sys
-f none kernel/kmdb/amd64/neti 555 root sys
+f none kernel/kmdb/amd64/ip 555 root sys
f none kernel/kmdb/amd64/ipc 555 root sys
f none kernel/kmdb/amd64/ipp 555 root sys
f none kernel/kmdb/amd64/krtld 555 root sys
@@ -46,6 +45,7 @@ f none kernel/kmdb/amd64/md 555 root sys
f none kernel/kmdb/amd64/mdb_ds 555 root sys
f none kernel/kmdb/amd64/mpt 555 root sys
f none kernel/kmdb/amd64/nca 555 root sys
+f none kernel/kmdb/amd64/neti 555 root sys
f none kernel/kmdb/amd64/nfs 555 root sys
f none kernel/kmdb/amd64/ptm 555 root sys
f none kernel/kmdb/amd64/random 555 root sys
@@ -53,6 +53,7 @@ f none kernel/kmdb/amd64/s1394 555 root sys
f none kernel/kmdb/amd64/scsi_vhci 555 root sys
f none kernel/kmdb/amd64/sctp 555 root sys
f none kernel/kmdb/amd64/sd 555 root sys
+f none kernel/kmdb/amd64/sockfs 555 root sys
f none kernel/kmdb/amd64/specfs 555 root sys
f none kernel/kmdb/amd64/sppp 555 root sys
f none kernel/kmdb/amd64/ufs 555 root sys
@@ -65,9 +66,8 @@ f none kernel/kmdb/cpu.generic 555 root sys
f none kernel/kmdb/cpu_ms.AuthenticAMD.15 555 root sys
f none kernel/kmdb/crypto 555 root sys
f none kernel/kmdb/genunix 555 root sys
-f none kernel/kmdb/ip 555 root sys
f none kernel/kmdb/hook 555 root sys
-f none kernel/kmdb/neti 555 root sys
+f none kernel/kmdb/ip 555 root sys
f none kernel/kmdb/ipc 555 root sys
f none kernel/kmdb/ipp 555 root sys
f none kernel/kmdb/krtld 555 root sys
@@ -78,6 +78,7 @@ f none kernel/kmdb/md 555 root sys
f none kernel/kmdb/mdb_ds 555 root sys
f none kernel/kmdb/mpt 555 root sys
f none kernel/kmdb/nca 555 root sys
+f none kernel/kmdb/neti 555 root sys
f none kernel/kmdb/nfs 555 root sys
f none kernel/kmdb/ptm 555 root sys
f none kernel/kmdb/random 555 root sys
@@ -85,6 +86,7 @@ f none kernel/kmdb/s1394 555 root sys
f none kernel/kmdb/scsi_vhci 555 root sys
f none kernel/kmdb/sctp 555 root sys
f none kernel/kmdb/sd 555 root sys
+f none kernel/kmdb/sockfs 555 root sys
f none kernel/kmdb/specfs 555 root sys
f none kernel/kmdb/sppp 555 root sys
f none kernel/kmdb/ufs 555 root sys
diff --git a/usr/src/pkgdefs/SUNWmdbr/prototype_sparc b/usr/src/pkgdefs/SUNWmdbr/prototype_sparc
index b4057c2328..0e3e805552 100644
--- a/usr/src/pkgdefs/SUNWmdbr/prototype_sparc
+++ b/usr/src/pkgdefs/SUNWmdbr/prototype_sparc
@@ -22,7 +22,6 @@
# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-#
!include prototype_com
#
@@ -32,10 +31,9 @@ f none kernel/kmdb/sparcv9/audiosup 555 root sys
f none kernel/kmdb/sparcv9/cpc 555 root sys
f none kernel/kmdb/sparcv9/crypto 555 root sys
f none kernel/kmdb/sparcv9/genunix 555 root sys
+f none kernel/kmdb/sparcv9/hook 555 root sys
f none kernel/kmdb/sparcv9/intr 555 root sys
f none kernel/kmdb/sparcv9/ip 555 root sys
-f none kernel/kmdb/sparcv9/hook 555 root sys
-f none kernel/kmdb/sparcv9/neti 555 root sys
f none kernel/kmdb/sparcv9/ipc 555 root sys
f none kernel/kmdb/sparcv9/ipp 555 root sys
f none kernel/kmdb/sparcv9/isp 555 root sys
@@ -47,16 +45,18 @@ f none kernel/kmdb/sparcv9/md 555 root sys
f none kernel/kmdb/sparcv9/mdb_ds 555 root sys
f none kernel/kmdb/sparcv9/mpt 555 root sys
f none kernel/kmdb/sparcv9/nca 555 root sys
+f none kernel/kmdb/sparcv9/neti 555 root sys
f none kernel/kmdb/sparcv9/nfs 555 root sys
-s none kernel/kmdb/sparcv9/pcisch=intr
s none kernel/kmdb/sparcv9/pcipsy=intr
+s none kernel/kmdb/sparcv9/pcisch=intr
f none kernel/kmdb/sparcv9/ptm 555 root sys
s none kernel/kmdb/sparcv9/px=intr
f none kernel/kmdb/sparcv9/random 555 root sys
-f none kernel/kmdb/sparcv9/sctp 555 root sys
f none kernel/kmdb/sparcv9/s1394 555 root sys
f none kernel/kmdb/sparcv9/scsi_vhci 555 root sys
+f none kernel/kmdb/sparcv9/sctp 555 root sys
f none kernel/kmdb/sparcv9/sd 555 root sys
+f none kernel/kmdb/sparcv9/sockfs 555 root sys
f none kernel/kmdb/sparcv9/specfs 555 root sys
f none kernel/kmdb/sparcv9/sppp 555 root sys
f none kernel/kmdb/sparcv9/ssd 555 root sys
@@ -68,10 +68,10 @@ d none platform/sun4u 755 root sys
d none platform/sun4u/kernel 755 root sys
d none platform/sun4u/kernel/kmdb 755 root sys
d none platform/sun4u/kernel/kmdb/sparcv9 755 root sys
+f none platform/sun4u/kernel/kmdb/sparcv9/oplhwd 555 root sys
f none platform/sun4u/kernel/kmdb/sparcv9/sgenv 555 root sys
f none platform/sun4u/kernel/kmdb/sparcv9/sgsbbc 555 root sys
f none platform/sun4u/kernel/kmdb/sparcv9/unix 555 root sys
-f none platform/sun4u/kernel/kmdb/sparcv9/oplhwd 555 root sys
#
d none platform/sun4v 755 root sys
d none platform/sun4v/kernel 755 root sys
diff --git a/usr/src/pkgdefs/common_files/i.sock2path b/usr/src/pkgdefs/common_files/i.sock2path
index 9b1bdedc36..31fcde8e06 100644
--- a/usr/src/pkgdefs/common_files/i.sock2path
+++ b/usr/src/pkgdefs/common_files/i.sock2path
@@ -3,9 +3,8 @@
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License"). You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
@@ -20,10 +19,7 @@
#
# CDDL HEADER END
#
-#
-#ident "%Z%%M% %I% %E% SMI"
-#
-# Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
@@ -80,6 +76,25 @@ do
echo >> $dest
grep '/dev/spdsock' $src >> $dest
fi
+ grep "^#" $dest | awk '{
+ if ($5=="Path") {print $0 "|Module"}
+ else {print $0}}' > $dest.$$ # stage beside $dest, not in world-writable /tmp
+ grep -v "^#" $dest | awk '{
+ if ($4=="/dev/tcp" || $4=="/dev/tcp6") {
+ print "\t" $1 "\t" $2 "\t" $3 "\ttcp"
+ } else if ($4=="/dev/udp" || $4=="/dev/udp6") {
+ print "\t" $1 "\t" $2 "\t" $3 "\tudp"
+ } else if ($4=="/dev/rawip" || $4=="/dev/rawip6") {
+ print "\t" $1 "\t" $2 "\t" $3 "\ticmp"
+ } else if ($4=="/dev/sctp" || $4=="/dev/sctp6") {
+ print "\t" $1 "\t" $2 "\t" $3 "\tsocksctp"
+ } else if ($4=="/dev/rts") {
+ print "\t" $1 "\t" $2 "\t" $3 "\trts"
+ } else if ($4=="/dev/sdp" || $4=="/dev/sdp6") {
+ print "\t" $1 "\t" $2 "\t" $3 "\tsocksdp"
+ } else {print $0}}' >> $dest.$$
+ cp $dest.$$ $dest
+ rm -f $dest.$$
fi
done
diff --git a/usr/src/uts/Makefile.targ b/usr/src/uts/Makefile.targ
index 86adc21eb2..d9fc918b94 100644
--- a/usr/src/uts/Makefile.targ
+++ b/usr/src/uts/Makefile.targ
@@ -22,7 +22,6 @@
# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-#
# This Makefiles contains the common targets and definitions for
# all kernels. It is to be included in the Makefiles for specific
# implementation architectures and processor architecture dependent
@@ -163,6 +162,9 @@ $(ROOT_FS_DIR)/%: $(OBJS_DIR)/% $(ROOT_FS_DIR) FRC
$(ROOT_SCHED_DIR)/%: $(OBJS_DIR)/% $(ROOT_SCHED_DIR) FRC
$(INS.file)
+$(ROOT_SOCK_DIR)/%: $(OBJS_DIR)/% $(ROOT_SOCK_DIR) FRC
+ $(INS.file)
+
$(ROOT_STRMOD_DIR)/%: $(OBJS_DIR)/% $(ROOT_STRMOD_DIR) FRC
$(INS.file)
@@ -388,12 +390,10 @@ $(MODLIST_DEPS): FRC
@case $@ in \
*32) \
class=32; \
- relmodule=`dirname $(RELMODULE)`; \
- rellink=`dirname $(RELLINK)`;; \
+ relmodule=`dirname $(RELMODULE)`;; \
*64) \
class=64; \
- relmodule=`dirname $(RELMODULE)`/$(SUBDIR64); \
- rellink=`dirname $(RELLINK)`/$(SUBDIR64);; \
+ relmodule=`dirname $(RELMODULE)`/$(SUBDIR64);; \
esac; \
if [ -z "$(THISIMPL)" ]; then \
impl=all; \
@@ -426,8 +426,16 @@ $(MODLIST_DEPS): FRC
done \
fi; \
if [ -n "$(ROOTLINK)" ]; then \
+ rellinks="$(RELLINK)"; \
+ for r in $$rellinks; do \
+ if [ $$class = 32 ]; then \
+ linkdir=`dirname $$r`; \
+ else \
+ linkdir=`dirname $$r`/$(SUBDIR64); \
+ fi; \
echo LINK $$relmodule $$module \
- $$rellink `basename $(RELLINK)` $$impl; \
+ $$linkdir `basename $$r` $$impl; \
+ done \
fi; \
if [ -n "$(UNIX32_LINK)" ]; then \
echo SYMLINK $(SUBDIR64)/$(UNIX) \
diff --git a/usr/src/uts/Makefile.uts b/usr/src/uts/Makefile.uts
index 86b39fc084..0f4718e3da 100644
--- a/usr/src/uts/Makefile.uts
+++ b/usr/src/uts/Makefile.uts
@@ -419,6 +419,7 @@ ROOT_DTRACE_DIR_32 = $(ROOT_MOD_DIR)/dtrace
ROOT_EXEC_DIR_32 = $(ROOT_MOD_DIR)/exec
ROOT_FS_DIR_32 = $(ROOT_MOD_DIR)/fs
ROOT_SCHED_DIR_32 = $(ROOT_MOD_DIR)/sched
+ROOT_SOCK_DIR_32 = $(ROOT_MOD_DIR)/socketmod
ROOT_STRMOD_DIR_32 = $(ROOT_MOD_DIR)/strmod
ROOT_IPP_DIR_32 = $(ROOT_MOD_DIR)/ipp
ROOT_SYS_DIR_32 = $(ROOT_MOD_DIR)/sys
@@ -444,6 +445,7 @@ ROOT_DTRACE_DIR_64 = $(ROOT_MOD_DIR)/dtrace/$(SUBDIR64)
ROOT_EXEC_DIR_64 = $(ROOT_MOD_DIR)/exec/$(SUBDIR64)
ROOT_FS_DIR_64 = $(ROOT_MOD_DIR)/fs/$(SUBDIR64)
ROOT_SCHED_DIR_64 = $(ROOT_MOD_DIR)/sched/$(SUBDIR64)
+ROOT_SOCK_DIR_64 = $(ROOT_MOD_DIR)/socketmod/$(SUBDIR64)
ROOT_STRMOD_DIR_64 = $(ROOT_MOD_DIR)/strmod/$(SUBDIR64)
ROOT_IPP_DIR_64 = $(ROOT_MOD_DIR)/ipp/$(SUBDIR64)
ROOT_SYS_DIR_64 = $(ROOT_MOD_DIR)/sys/$(SUBDIR64)
@@ -469,6 +471,7 @@ ROOT_DTRACE_DIR = $(ROOT_DTRACE_DIR_$(CLASS))
ROOT_EXEC_DIR = $(ROOT_EXEC_DIR_$(CLASS))
ROOT_FS_DIR = $(ROOT_FS_DIR_$(CLASS))
ROOT_SCHED_DIR = $(ROOT_SCHED_DIR_$(CLASS))
+ROOT_SOCK_DIR = $(ROOT_SOCK_DIR_$(CLASS))
ROOT_STRMOD_DIR = $(ROOT_STRMOD_DIR_$(CLASS))
ROOT_IPP_DIR = $(ROOT_IPP_DIR_$(CLASS))
ROOT_SYS_DIR = $(ROOT_SYS_DIR_$(CLASS))
@@ -492,7 +495,7 @@ ROOT_MOD_DIRS_32 = $(ROOT_BRAND_DIR_32) $(ROOT_DRV_DIR_32)
ROOT_MOD_DIRS_32 += $(ROOT_EXEC_DIR_32) $(ROOT_DTRACE_DIR_32)
ROOT_MOD_DIRS_32 += $(ROOT_FS_DIR_32) $(ROOT_SCHED_DIR_32)
ROOT_MOD_DIRS_32 += $(ROOT_STRMOD_DIR_32) $(ROOT_SYS_DIR_32)
-ROOT_MOD_DIRS_32 += $(ROOT_IPP_DIR_32)
+ROOT_MOD_DIRS_32 += $(ROOT_IPP_DIR_32) $(ROOT_SOCK_DIR_32)
ROOT_MOD_DIRS_32 += $(ROOT_MISC_DIR_32) $(ROOT_MACH_DIR_32)
ROOT_MOD_DIRS_32 += $(ROOT_KGSS_DIR_32)
ROOT_MOD_DIRS_32 += $(ROOT_SCSI_VHCI_DIR_32)
@@ -508,6 +511,7 @@ USR_DRV_DIR_32 = $(USR_MOD_DIR)/drv
USR_EXEC_DIR_32 = $(USR_MOD_DIR)/exec
USR_FS_DIR_32 = $(USR_MOD_DIR)/fs
USR_SCHED_DIR_32 = $(USR_MOD_DIR)/sched
+USR_SOCK_DIR_32 = $(USR_MOD_DIR)/socketmod
USR_STRMOD_DIR_32 = $(USR_MOD_DIR)/strmod
USR_SYS_DIR_32 = $(USR_MOD_DIR)/sys
USR_MISC_DIR_32 = $(USR_MOD_DIR)/misc
@@ -521,6 +525,7 @@ USR_DRV_DIR_64 = $(USR_MOD_DIR)/drv/$(SUBDIR64)
USR_EXEC_DIR_64 = $(USR_MOD_DIR)/exec/$(SUBDIR64)
USR_FS_DIR_64 = $(USR_MOD_DIR)/fs/$(SUBDIR64)
USR_SCHED_DIR_64 = $(USR_MOD_DIR)/sched/$(SUBDIR64)
+USR_SOCK_DIR_64 = $(USR_MOD_DIR)/socketmod/$(SUBDIR64)
USR_STRMOD_DIR_64 = $(USR_MOD_DIR)/strmod/$(SUBDIR64)
USR_SYS_DIR_64 = $(USR_MOD_DIR)/sys/$(SUBDIR64)
USR_MISC_DIR_64 = $(USR_MOD_DIR)/misc/$(SUBDIR64)
@@ -534,6 +539,7 @@ USR_DRV_DIR = $(USR_DRV_DIR_$(CLASS))
USR_EXEC_DIR = $(USR_EXEC_DIR_$(CLASS))
USR_FS_DIR = $(USR_FS_DIR_$(CLASS))
USR_SCHED_DIR = $(USR_SCHED_DIR_$(CLASS))
+USR_SOCK_DIR = $(USR_SOCK_DIR_$(CLASS))
USR_STRMOD_DIR = $(USR_STRMOD_DIR_$(CLASS))
USR_SYS_DIR = $(USR_SYS_DIR_$(CLASS))
USR_MISC_DIR = $(USR_MISC_DIR_$(CLASS))
@@ -599,7 +605,8 @@ PARALLEL_KMODS = $(DRV_KMODS) $(EXEC_KMODS) $(FS_KMODS) $(SCHED_KMODS) \
$(MMU_KMODS) $(DACF_KMODS) $(EXPORT_KMODS) $(IPP_KMODS) \
$(CRYPTO_KMODS) $(PCBE_KMODS) \
$(DRV_KMODS_$(CLASS)) $(MISC_KMODS_$(CLASS)) $(MAC_KMODS) \
- $(DEVNAME_KMODS) $(BRAND_KMODS) $(KICONV_KMODS)
+ $(DEVNAME_KMODS) $(BRAND_KMODS) $(KICONV_KMODS) \
+ $(SOCKET_KMODS)
KMODS = $(GENUNIX_KMODS) $(PARALLEL_KMODS)
@@ -614,7 +621,7 @@ LINT_KMODS = $(DRV_KMODS) $(EXEC_KMODS) $(FS_KMODS) $(SCHED_KMODS) \
$(MACH_KMODS) $(GSS_KMODS) $(DACF_KMODS) $(IPP_KMODS) \
$(CRYPTO_KMODS) $(PCBE_KMODS) $(DEVNAME_KMODS) \
$(DRV_KMODS_$(CLASS)) $(MISC_KMODS_$(CLASS)) $(MAC_KMODS) \
- $(BRAND_KMODS) $(KICONV_KMODS)
+ $(BRAND_KMODS) $(KICONV_KMODS) $(SOCKET_KMODS)
$(CLOSED_BUILD)CLOSED_LINT_KMODS = $(CLOSED_DRV_KMODS) $(CLOSED_TOD_KMODS) \
$(CLOSED_MISC_KMODS) $(CLOSED_DRV_KMODS_$(CLASS))
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index 564b2cf72e..f0951c280b 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -289,6 +289,7 @@ GENUNIX_OBJS += \
sigsuspend.o \
sigtimedwait.o \
sleepq.o \
+ sock_conf.o \
space.o \
sscanf.o \
ssig.o \
@@ -489,7 +490,8 @@ IP_OBJS += igmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o ip6_rts.o \
ip_multi.o ip_ndp.o ip_opt_data.o ip_rts.o ip_srcid.o \
ipddi.o ipdrop.o mi.o nd.o optcom.o snmpcom.o ipsec_loader.o \
spd.o ipclassifier.o inet_common.o ip_squeue.o squeue.o \
- ip_sadb.o ip_ftable.o radix.o ip_dummy.o \
+ ip_sadb.o ip_ftable.o proto_set.o radix.o ip_dummy.o \
+ ip_helper_stream.o \
$(IP_ICMP_OBJS) \
$(IP_RTS_OBJS) \
$(IP_TCP_OBJS) \
@@ -531,6 +533,10 @@ SCTP6_OBJS += sctp6ddi.o
NCA_OBJS += ncaddi.o
+SDP_SOCK_MOD_OBJS += sockmod_sdp.o socksdp.o socksdpsubr.o
+
+SCTP_SOCK_MOD_OBJS += sockmod_sctp.o socksctp.o socksctpsubr.o
+
TUN_OBJS += tun.o
ATUN_OBJS += atun.o
@@ -1138,10 +1144,10 @@ SHAREFS_OBJS += sharetab.o sharefs_vfsops.o sharefs_vnops.o
SPEC_OBJS += specsubr.o specvfsops.o specvnops.o
-SOCK_OBJS += socksubr.o sockvfsops.o sockvnops.o \
- socksyscalls.o socktpi.o sockstr.o \
- socksctp.o socksctpsubr.o socksctpvnops.o sockssl.o \
- socksdp.o socksdpsubr.o socksdpvnops.o \
+SOCK_OBJS += socksubr.o sockvfsops.o sockparams.o \
+ socksyscalls.o socktpi.o sockstr.o sockssl.o \
+ sockcommon_vnops.o sockcommon_subr.o \
+ sockcommon_sops.o sockcommon.o socknotify.o \
nl7c.o nl7curi.o nl7chttp.o nl7clogd.o \
nl7cnca.o
@@ -1456,6 +1462,8 @@ KGSSD_DERIVED_OBJS = gssd_xdr.o
KGSS_DUMMY_OBJS += dmech.o
+KSOCKET_OBJS += ksocket.o ksocket_mod.o
+
CRYPTO= cksumtypes.o decrypt.o encrypt.o encrypt_length.o etypes.o \
nfold.o verify_checksum.o prng.o block_size.o make_checksum.o\
checksum_length.o hmac.o default_state.o mandatory_sumtype.o
diff --git a/usr/src/uts/common/Makefile.rules b/usr/src/uts/common/Makefile.rules
index 0035b502b9..35fe0895f1 100644
--- a/usr/src/uts/common/Makefile.rules
+++ b/usr/src/uts/common/Makefile.rules
@@ -481,6 +481,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/inet/nca/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
+$(OBJS_DIR)/%.o: $(UTSBASE)/common/inet/sockmods/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
$(OBJS_DIR)/%.o: $(UTSBASE)/common/inet/vni/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
@@ -681,6 +685,10 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/common/io/kbtrans/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
+$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/ksocket/%.c
+ $(COMPILE.c) -o $@ $<
+ $(CTFCONVERT_O)
+
$(OBJS_DIR)/%.o: $(UTSBASE)/common/io/aggr/%.c
$(COMPILE.c) -o $@ $<
$(CTFCONVERT_O)
@@ -1548,6 +1556,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/idmap/%.c
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
+$(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/sockmods/%.c
+ @($(LHEAD) $(LINT.c) $< $(LTAIL))
+
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/inet/arp/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
@@ -1732,6 +1743,9 @@ $(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/kb8042/%.c
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/kbtrans/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
+$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/ksocket/%.c
+ @($(LHEAD) $(LINT.c) $< $(LTAIL))
+
$(LINTS_DIR)/%.ln: $(UTSBASE)/common/io/aggr/%.c
@($(LHEAD) $(LINT.c) $< $(LTAIL))
diff --git a/usr/src/uts/common/c2/audit_event.c b/usr/src/uts/common/c2/audit_event.c
index 723212aa52..92559a3575 100644
--- a/usr/src/uts/common/c2/audit_event.c
+++ b/usr/src/uts/common/c2/audit_event.c
@@ -72,6 +72,8 @@
#include <sys/tihdr.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
+#include <sys/vfs_opreg.h>
+#include <fs/sockfs/sockcommon.h>
#include <netinet/in.h>
#include <sys/ddi.h>
#include <sys/port_impl.h>
@@ -3328,7 +3330,6 @@ auf_accept(
char so_laddr[sizeof (struct sockaddr_in6)];
char so_faddr[sizeof (struct sockaddr_in6)];
int err;
- int len;
short so_family, so_type;
int add_sock_token = 0;
@@ -3374,28 +3375,17 @@ auf_accept(
* XXX - what about other socket types for AF_INET (e.g. DGRAM)
*/
if (so->so_type == SOCK_STREAM) {
+ socklen_t len;
bzero((void *)so_laddr, sizeof (so_laddr));
bzero((void *)so_faddr, sizeof (so_faddr));
- /*
- * no local address then need to get it from lower
- * levels. only put out record on first read ala
- * AUE_WRITE.
- */
- if (so->so_state & SS_ISBOUND) {
- /* only done once on a connection */
- (void) SOP_GETSOCKNAME(so);
- (void) SOP_GETPEERNAME(so);
-
- /* get local and foreign addresses */
- mutex_enter(&so->so_lock);
- len = min(so->so_laddr_len, sizeof (so_laddr));
- bcopy(so->so_laddr_sa, so_laddr, len);
- len = min(so->so_faddr_len, sizeof (so_faddr));
- bcopy(so->so_faddr_sa, so_faddr, len);
- mutex_exit(&so->so_lock);
- }
+ len = sizeof (so_laddr);
+ (void) socket_getsockname(so,
+ (struct sockaddr *)so_laddr, &len, CRED());
+ len = sizeof (so_faddr);
+ (void) socket_getpeername(so,
+ (struct sockaddr *)so_faddr, &len, B_FALSE, CRED());
add_sock_token = 1;
}
@@ -3434,7 +3424,7 @@ auf_bind(struct t_audit_data *tad, int error, rval_t *rvp)
char so_laddr[sizeof (struct sockaddr_in6)];
char so_faddr[sizeof (struct sockaddr_in6)];
int err, fd;
- int len;
+ socklen_t len;
short so_family, so_type;
int add_sock_token = 0;
@@ -3466,17 +3456,10 @@ auf_bind(struct t_audit_data *tad, int error, rval_t *rvp)
case AF_INET6:
bzero(so_faddr, sizeof (so_faddr));
+ len = sizeof (so_laddr); /* bind() audits the local address */
- if (so->so_state & SS_ISBOUND) {
- /* only done once on a connection */
- (void) SOP_GETSOCKNAME(so);
- }
-
- mutex_enter(&so->so_lock);
- len = min(so->so_laddr_len, sizeof (so_laddr));
- bcopy(so->so_laddr_sa, so_laddr, len);
- mutex_exit(&so->so_lock);
-
+ (void) socket_getsockname(so,
+ (struct sockaddr *)so_laddr, &len, CRED());
add_sock_token = 1;
break;
@@ -3517,7 +3500,7 @@ auf_connect(struct t_audit_data *tad, int error, rval_t *rval)
char so_laddr[sizeof (struct sockaddr_in6)];
char so_faddr[sizeof (struct sockaddr_in6)];
int err, fd;
- int len;
+ socklen_t len;
short so_family, so_type;
int add_sock_token = 0;
@@ -3539,24 +3522,14 @@ auf_connect(struct t_audit_data *tad, int error, rval_t *rval)
switch (so_family) {
case AF_INET:
case AF_INET6:
- /*
- * no local address then need to get it from lower
- * levels.
- */
- if (so->so_state & SS_ISBOUND) {
- /* only done once on a connection */
- (void) SOP_GETSOCKNAME(so);
- (void) SOP_GETPEERNAME(so);
- }
bzero(so_laddr, sizeof (so_laddr));
bzero(so_faddr, sizeof (so_faddr));
- mutex_enter(&so->so_lock);
- len = min(so->so_laddr_len, sizeof (so_laddr));
- bcopy(so->so_laddr_sa, so_laddr, len);
+ len = sizeof (so_laddr);
+ (void) socket_getsockname(so, (struct sockaddr *)so_laddr,
+ &len, CRED());
if (error) {
- mutex_exit(&so->so_lock);
if (uap->addr == NULL)
break;
if (uap->len <= 0)
@@ -3569,9 +3542,9 @@ auf_connect(struct t_audit_data *tad, int error, rval_t *rval)
#endif
} else {
/* sanity check on length */
- len = min(so->so_faddr_len, sizeof (so_faddr));
- bcopy(so->so_faddr_sa, so_faddr, len);
- mutex_exit(&so->so_lock);
+ len = sizeof (so_faddr);
+ (void) socket_getpeername(so,
+ (struct sockaddr *)so_faddr, &len, B_FALSE, CRED());
}
add_sock_token = 1;
@@ -3614,7 +3587,7 @@ aus_shutdown(struct t_audit_data *tad)
char so_laddr[sizeof (struct sockaddr_in6)];
char so_faddr[sizeof (struct sockaddr_in6)];
int err, fd;
- int len;
+ socklen_t len;
short so_family, so_type;
int add_sock_token = 0;
file_t *fp; /* unix domain sockets */
@@ -3641,23 +3614,12 @@ aus_shutdown(struct t_audit_data *tad)
bzero(so_laddr, sizeof (so_laddr));
bzero(so_faddr, sizeof (so_faddr));
- if (so->so_state & SS_ISBOUND) {
- /*
- * no local address then need to get it from lower
- * levels.
- */
- if (so->so_laddr_len == 0)
- (void) SOP_GETSOCKNAME(so);
- if (so->so_faddr_len == 0)
- (void) SOP_GETPEERNAME(so);
- }
-
- mutex_enter(&so->so_lock);
- len = min(so->so_laddr_len, sizeof (so_laddr));
- bcopy(so->so_laddr_sa, so_laddr, len);
- len = min(so->so_faddr_len, sizeof (so_faddr));
- bcopy(so->so_faddr_sa, so_faddr, len);
- mutex_exit(&so->so_lock);
+ len = sizeof (so_laddr);
+ (void) socket_getsockname(so,
+ (struct sockaddr *)so_laddr, &len, CRED());
+ len = sizeof (so_faddr);
+ (void) socket_getpeername(so,
+ (struct sockaddr *)so_faddr, &len, B_FALSE, CRED());
add_sock_token = 1;
@@ -3721,7 +3683,7 @@ auf_setsockopt(struct t_audit_data *tad, int error, rval_t *rval)
char so_faddr[sizeof (struct sockaddr_in6)];
char val[AU_BUFSIZE];
int err, fd;
- int len;
+ socklen_t len;
short so_family, so_type;
int add_sock_token = 0;
file_t *fp; /* unix domain sockets */
@@ -3751,24 +3713,16 @@ auf_setsockopt(struct t_audit_data *tad, int error, rval_t *rval)
switch (so_family) {
case AF_INET:
case AF_INET6:
-
bzero((void *)so_laddr, sizeof (so_laddr));
bzero((void *)so_faddr, sizeof (so_faddr));
- if (so->so_state & SS_ISBOUND) {
- if (so->so_laddr_len == 0)
- (void) SOP_GETSOCKNAME(so);
- if (so->so_faddr_len == 0)
- (void) SOP_GETPEERNAME(so);
- }
-
/* get local and foreign addresses */
- mutex_enter(&so->so_lock);
- len = min(so->so_laddr_len, sizeof (so_laddr));
- bcopy(so->so_laddr_sa, so_laddr, len);
- len = min(so->so_faddr_len, sizeof (so_faddr));
- bcopy(so->so_faddr_sa, so_faddr, len);
- mutex_exit(&so->so_lock);
+ len = sizeof (so_laddr);
+ (void) socket_getsockname(so, (struct sockaddr *)so_laddr,
+ &len, CRED());
+ len = sizeof (so_faddr);
+ (void) socket_getpeername(so, (struct sockaddr *)so_faddr,
+ &len, B_FALSE, CRED());
add_sock_token = 1;
@@ -3892,7 +3846,7 @@ auf_recvmsg(
int err;
char so_laddr[sizeof (struct sockaddr_in6)];
char so_faddr[sizeof (struct sockaddr_in6)];
- int len;
+ socklen_t len;
file_t *fp; /* unix domain sockets */
struct f_audit_data *fad; /* unix domain sockets */
short so_family, so_type;
@@ -3942,10 +3896,9 @@ auf_recvmsg(
bzero((void *)so_faddr, sizeof (so_faddr));
/* get local address */
- mutex_enter(&so->so_lock);
- len = min(so->so_laddr_len, sizeof (so_laddr));
- bcopy(so->so_laddr_sa, so_laddr, len);
- mutex_exit(&so->so_lock);
+ len = sizeof (so_laddr);
+ (void) socket_getsockname(so,
+ (struct sockaddr *)so_laddr, &len, CRED());
/* get peer address */
STRUCT_INIT(msg, get_udatamodel());
@@ -3995,21 +3948,13 @@ auf_recvmsg(
bzero((void *)so_laddr, sizeof (so_laddr));
bzero((void *)so_faddr, sizeof (so_faddr));
- if (so->so_state & SS_ISBOUND) {
-
- if (so->so_laddr_len == 0)
- (void) SOP_GETSOCKNAME(so);
- if (so->so_faddr_len == 0)
- (void) SOP_GETPEERNAME(so);
-
- /* get local and foreign addresses */
- mutex_enter(&so->so_lock);
- len = min(so->so_laddr_len, sizeof (so_laddr));
- bcopy(so->so_laddr_sa, so_laddr, len);
- len = min(so->so_faddr_len, sizeof (so_faddr));
- bcopy(so->so_faddr_sa, so_faddr, len);
- mutex_exit(&so->so_lock);
- }
+ /* get local and foreign addresses */
+ len = sizeof (so_laddr);
+ (void) socket_getsockname(so,
+ (struct sockaddr *)so_laddr, &len, CRED());
+ len = sizeof (so_faddr);
+ (void) socket_getpeername(so,
+ (struct sockaddr *)so_faddr, &len, B_FALSE, CRED());
add_sock_token = 1;
}
@@ -4103,7 +4048,7 @@ auf_recvfrom(
int fd;
short so_family, so_type;
int add_sock_token = 0;
- int len;
+ socklen_t len;
int err;
struct file *fp;
struct f_audit_data *fad; /* unix domain sockets */
@@ -4149,10 +4094,9 @@ auf_recvfrom(
add_sock_token = 1;
/* get local address */
- mutex_enter(&so->so_lock);
- len = min(so->so_laddr_len, sizeof (so_laddr));
- bcopy(so->so_laddr_sa, so_laddr, len);
- mutex_exit(&so->so_lock);
+ len = sizeof (so_laddr);
+ (void) socket_getsockname(so,
+ (struct sockaddr *)so_laddr, &len, CRED());
/* get peer address */
bzero((void *)so_faddr, sizeof (so_faddr));
@@ -4206,21 +4150,13 @@ auf_recvfrom(
bzero((void *)so_laddr, sizeof (so_laddr));
bzero((void *)so_faddr, sizeof (so_faddr));
- if (so->so_state & SS_ISBOUND) {
-
- if (so->so_laddr_len == 0)
- (void) SOP_GETSOCKNAME(so);
- if (so->so_faddr_len == 0)
- (void) SOP_GETPEERNAME(so);
-
- /* get local and foreign addresses */
- mutex_enter(&so->so_lock);
- len = min(so->so_laddr_len, sizeof (so_laddr));
- bcopy(so->so_laddr_sa, so_laddr, len);
- len = min(so->so_faddr_len, sizeof (so_faddr));
- bcopy(so->so_faddr_sa, so_faddr, len);
- mutex_exit(&so->so_lock);
- }
+ /* get local and foreign addresses */
+ len = sizeof (so_laddr);
+ (void) socket_getsockname(so,
+ (struct sockaddr *)so_laddr, &len, CRED());
+ len = sizeof (so_faddr);
+ (void) socket_getpeername(so,
+ (struct sockaddr *)so_faddr, &len, B_FALSE, CRED());
add_sock_token = 1;
}
@@ -4306,7 +4242,7 @@ auf_sendmsg(struct t_audit_data *tad, int error, rval_t *rval)
int fd;
short so_family, so_type;
int add_sock_token = 0;
- int len;
+ socklen_t len;
struct file *fp;
struct f_audit_data *fad;
caddr_t msg_name;
@@ -4351,10 +4287,9 @@ auf_sendmsg(struct t_audit_data *tad, int error, rval_t *rval)
bzero((void *)so_faddr, sizeof (so_faddr));
/* get local address */
- mutex_enter(&so->so_lock);
- len = min(so->so_laddr_len, sizeof (so_laddr));
- bcopy(so->so_laddr_sa, so_laddr, len);
- mutex_exit(&so->so_lock);
+ len = sizeof (so_laddr);
+ (void) socket_getsockname(so,
+ (struct sockaddr *)so_laddr, &len, CRED());
/* get peer address */
STRUCT_INIT(msg, get_udatamodel());
@@ -4405,21 +4340,13 @@ auf_sendmsg(struct t_audit_data *tad, int error, rval_t *rval)
bzero((void *)so_laddr, sizeof (so_laddr));
bzero((void *)so_faddr, sizeof (so_faddr));
- if (so->so_state & SS_ISBOUND) {
-
- if (so->so_laddr_len == 0)
- (void) SOP_GETSOCKNAME(so);
- if (so->so_faddr_len == 0)
- (void) SOP_GETPEERNAME(so);
-
- /* get local and foreign addresses */
- mutex_enter(&so->so_lock);
- len = min(so->so_laddr_len, sizeof (so_laddr));
- bcopy(so->so_laddr_sa, so_laddr, len);
- len = min(so->so_faddr_len, sizeof (so_faddr));
- bcopy(so->so_faddr_sa, so_faddr, len);
- mutex_exit(&so->so_lock);
- }
+ /* get local and foreign addresses */
+ len = sizeof (so_laddr);
+ (void) socket_getsockname(so,
+ (struct sockaddr *)so_laddr, &len, CRED());
+ len = sizeof (so_faddr);
+ (void) socket_getpeername(so,
+ (struct sockaddr *)so_faddr, &len, B_FALSE, CRED());
add_sock_token = 1;
}
@@ -4506,7 +4433,7 @@ auf_sendto(struct t_audit_data *tad, int error, rval_t *rval)
socklen_t tolen;
int err;
int fd;
- int len;
+ socklen_t len;
short so_family, so_type;
int add_sock_token = 0;
struct file *fp;
@@ -4556,10 +4483,9 @@ auf_sendto(struct t_audit_data *tad, int error, rval_t *rval)
bzero((void *)so_faddr, sizeof (so_faddr));
/* get local address */
- mutex_enter(&so->so_lock);
- len = min(so->so_laddr_len, sizeof (so_laddr));
- bcopy(so->so_laddr_sa, so_laddr, len);
- mutex_exit(&so->so_lock);
+ len = sizeof (so_laddr);
+ (void) socket_getsockname(so,
+ (struct sockaddr *)so_laddr, &len, CRED());
/* get peer address */
@@ -4610,21 +4536,13 @@ auf_sendto(struct t_audit_data *tad, int error, rval_t *rval)
bzero((void *)so_laddr, sizeof (so_laddr));
bzero((void *)so_faddr, sizeof (so_faddr));
- if (so->so_state & SS_ISBOUND) {
-
- if (so->so_laddr_len == 0)
- (void) SOP_GETSOCKNAME(so);
- if (so->so_faddr_len == 0)
- (void) SOP_GETPEERNAME(so);
-
- /* get local and foreign addresses */
- mutex_enter(&so->so_lock);
- len = min(so->so_laddr_len, sizeof (so_laddr));
- bcopy(so->so_laddr_sa, so_laddr, len);
- len = min(so->so_faddr_len, sizeof (so_faddr));
- bcopy(so->so_faddr_sa, so_faddr, len);
- mutex_exit(&so->so_lock);
- }
+ /* get local and foreign addresses */
+ len = sizeof (so_laddr);
+ (void) socket_getsockname(so,
+ (struct sockaddr *)so_laddr, &len, CRED());
+ len = sizeof (so_faddr);
+ (void) socket_getpeername(so,
+ (struct sockaddr *)so_faddr, &len, B_FALSE, CRED());
add_sock_token = 1;
}
@@ -5394,7 +5312,7 @@ auf_recv(tad, error, rval)
struct f_audit_data *fad;
int fd;
int err;
- int len;
+ socklen_t len;
short so_family, so_type;
register struct a {
long fd;
@@ -5457,17 +5375,13 @@ auf_recv(tad, error, rval)
bzero((void *)so_laddr, sizeof (so_laddr));
bzero((void *)so_faddr, sizeof (so_faddr));
- /* only done once on a connection */
- (void) SOP_GETSOCKNAME(so);
- (void) SOP_GETPEERNAME(so);
-
/* get local and foreign addresses */
- mutex_enter(&so->so_lock);
- len = min(so->so_laddr_len, sizeof (so_laddr));
- bcopy(so->so_laddr_sa, so_laddr, len);
- len = min(so->so_faddr_len, sizeof (so_faddr));
- bcopy(so->so_faddr_sa, so_faddr, len);
- mutex_exit(&so->so_lock);
+ len = sizeof (so_laddr);
+ (void) socket_getsockname(so,
+ (struct sockaddr *)so_laddr, &len, CRED());
+ len = sizeof (so_faddr);
+ (void) socket_getpeername(so,
+ (struct sockaddr *)so_faddr, &len, B_FALSE, CRED());
/*
* only way to drop out of switch. Note that we
@@ -5532,7 +5446,7 @@ auf_send(tad, error, rval)
struct f_audit_data *fad;
int fd;
int err;
- int len;
+ socklen_t len;
short so_family, so_type;
register struct a {
long fd;
@@ -5597,17 +5511,13 @@ auf_send(tad, error, rval)
bzero((void *)so_laddr, sizeof (so_laddr));
bzero((void *)so_faddr, sizeof (so_faddr));
- /* only done once on a connection */
- (void) SOP_GETSOCKNAME(so);
- (void) SOP_GETPEERNAME(so);
-
/* get local and foreign addresses */
- mutex_enter(&so->so_lock);
- len = min(so->so_laddr_len, sizeof (so_laddr));
- bcopy(so->so_laddr_sa, so_laddr, len);
- len = min(so->so_faddr_len, sizeof (so_faddr));
- bcopy(so->so_faddr_sa, so_faddr, len);
- mutex_exit(&so->so_lock);
+ len = sizeof (so_laddr);
+ (void) socket_getsockname(so,
+ (struct sockaddr *)so_laddr, &len, CRED());
+ len = sizeof (so_faddr);
+ (void) socket_getpeername(so,
+ (struct sockaddr *)so_faddr, &len, B_FALSE, CRED());
/*
* only way to drop out of switch. Note that we
diff --git a/usr/src/uts/common/fs/smbsrv/smb_negotiate.c b/usr/src/uts/common/fs/smbsrv/smb_negotiate.c
index fb3498f545..48f6e53458 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_negotiate.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_negotiate.c
@@ -293,9 +293,9 @@ smb_com_negotiate(smb_request_t *sr)
switch (dialect) {
case PC_NETWORK_PROGRAM_1_0: /* core */
- (void) sosetsockopt(sr->session->sock, SOL_SOCKET, SO_RCVBUF,
- (const void *)&smb_dos_tcp_rcvbuf,
- sizeof (smb_dos_tcp_rcvbuf));
+ (void) ksocket_setsockopt(sr->session->sock, SOL_SOCKET,
+ SO_RCVBUF, (const void *)&smb_dos_tcp_rcvbuf,
+ sizeof (smb_dos_tcp_rcvbuf), CRED());
rc = smbsr_encode_result(sr, 1, 0, "bww", 1, sel_pos, 0);
break;
@@ -306,9 +306,9 @@ smb_com_negotiate(smb_request_t *sr)
case LANMAN1_0:
case LM1_2X002:
case DOS_LM1_2X002:
- (void) sosetsockopt(sr->session->sock, SOL_SOCKET, SO_RCVBUF,
- (const void *)&smb_dos_tcp_rcvbuf,
- sizeof (smb_dos_tcp_rcvbuf));
+ (void) ksocket_setsockopt(sr->session->sock, SOL_SOCKET,
+ SO_RCVBUF, (const void *)&smb_dos_tcp_rcvbuf,
+ sizeof (smb_dos_tcp_rcvbuf), CRED());
sr->smb_flg |= SMB_FLAGS_LOCK_AND_READ_OK;
rc = smbsr_encode_result(sr, 13, VAR_BCC,
"bwwwwwwlYww2.w#c",
@@ -331,9 +331,9 @@ smb_com_negotiate(smb_request_t *sr)
case DOS_LANMAN2_1:
case LANMAN2_1:
- (void) sosetsockopt(sr->session->sock, SOL_SOCKET, SO_RCVBUF,
- (const void *)&smb_dos_tcp_rcvbuf,
- sizeof (smb_dos_tcp_rcvbuf));
+ (void) ksocket_setsockopt(sr->session->sock, SOL_SOCKET,
+ SO_RCVBUF, (const void *)&smb_dos_tcp_rcvbuf,
+ sizeof (smb_dos_tcp_rcvbuf), CRED());
sr->smb_flg |= SMB_FLAGS_LOCK_AND_READ_OK;
rc = smbsr_encode_result(sr, 13, VAR_BCC,
"bwwwwwwlYww2.w#cs",
@@ -356,9 +356,9 @@ smb_com_negotiate(smb_request_t *sr)
break;
case NT_LM_0_12:
- (void) sosetsockopt(sr->session->sock, SOL_SOCKET, SO_RCVBUF,
- (const void *)&smb_nt_tcp_rcvbuf,
- sizeof (smb_nt_tcp_rcvbuf));
+ (void) ksocket_setsockopt(sr->session->sock, SOL_SOCKET,
+ SO_RCVBUF, (const void *)&smb_nt_tcp_rcvbuf,
+ sizeof (smb_nt_tcp_rcvbuf), CRED());
capabilities = CAP_LARGE_FILES
| CAP_NT_SMBS
| CAP_STATUS32
diff --git a/usr/src/uts/common/fs/smbsrv/smb_net.c b/usr/src/uts/common/fs/smbsrv/smb_net.c
index 4593cfec6b..ef41d911db 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_net.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_net.c
@@ -35,6 +35,7 @@
#include <sys/fs/snode.h>
#include <sys/fs/dv_node.h>
#include <sys/vnode.h>
+#include <sys/ksocket.h>
#undef mem_free /* XXX Remove this after we convert everything to kmem_alloc */
#include <smbsrv/smb_vops.h>
@@ -103,58 +104,19 @@ smb_net_fini(void)
* smb_iov_sorecv: Receive data into an iovec from a socket
*/
-struct sonode *
+ksocket_t
smb_socreate(int domain, int type, int protocol)
{
- vnode_t *dvp = NULL;
- vnode_t *vp = NULL;
- struct snode *csp = NULL;
- int err = 0;
- major_t maj;
-
- if ((vp = solookup(domain, type, protocol, NULL, &err)) == NULL) {
-
- /*
- * solookup calls sogetvp if the vp is not found in the cache.
- * Since the call to sogetvp is hardwired to use USERSPACE
- * and declared static we'll do the work here instead.
- */
- err = lookupname(type == SOCK_STREAM ? "/dev/tcp" : "/dev/udp",
- UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
- if (err)
- return (NULL);
-
- /* Check that it is the correct vnode */
- if (vp->v_type != VCHR) {
- VN_RELE(vp);
- return (NULL);
- }
+ ksocket_t sock;
+ int err = 0;
- csp = VTOS(VTOS(vp)->s_commonvp);
- if (!(csp->s_flag & SDIPSET)) {
- char *pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
- err = ddi_dev_pathname(vp->v_rdev, S_IFCHR,
- pathname);
- if (err == 0) {
- err = devfs_lookupname(pathname, NULLVPP,
- &dvp);
- }
- VN_RELE(vp);
- kmem_free(pathname, MAXPATHLEN);
- if (err != 0) {
- return (NULL);
- }
- vp = dvp;
- }
-
- maj = getmajor(vp->v_rdev);
- if (!STREAMSTAB(maj)) {
- VN_RELE(vp);
- return (NULL);
- }
- }
+ err = ksocket_socket(&sock, domain, type, protocol, KSOCKET_SLEEP,
+ CRED());
- return (socreate(vp, domain, type, protocol, SOV_DEFAULT, NULL, &err));
+ if (err != 0)
+ return (NULL);
+ else
+ return (sock);
}
/*
@@ -165,9 +127,9 @@ smb_socreate(int domain, int type, int protocol)
* regain control of a thread stuck in smb_sorecv.
*/
void
-smb_soshutdown(struct sonode *so)
+smb_soshutdown(ksocket_t so)
{
- (void) soshutdown(so, SHUT_RDWR);
+ (void) ksocket_shutdown(so, SHUT_RDWR, CRED());
}
/*
@@ -177,82 +139,27 @@ smb_soshutdown(struct sonode *so)
* behavior will result.
*/
void
-smb_sodestroy(struct sonode *so)
+smb_sodestroy(ksocket_t so)
{
- vnode_t *vp = SOTOV(so);
-
- (void) VOP_CLOSE(vp, 0, 1, 0, kcred, NULL);
- VN_RELE(vp);
+ (void) ksocket_close(so, CRED());
}
int
-smb_sorecv(struct sonode *so, void *msg, size_t len)
+smb_sorecv(ksocket_t so, void *msg, size_t len)
{
- iovec_t iov;
+ size_t recvd;
int err;
ASSERT(so != NULL);
ASSERT(len != 0);
- /*
- * Fill in iovec and receive data
- */
- iov.iov_base = msg;
- iov.iov_len = len;
-
- if ((err = smb_iov_sorecv(so, &iov, 1, len)) != 0) {
+ if ((err = ksocket_recv(so, msg, len, MSG_WAITALL, &recvd,
+ CRED())) != 0) {
return (err);
}
/* Successful receive */
- return (0);
-}
-
-/*
- * smb_iov_sorecv - Receives an iovec from a connection
- *
- * This function gets the data asked for from the socket. It will return
- * only when all the requested data has been retrieved or if an error
- * occurs.
- *
- * Returns 0 for success, the socket errno value if sorecvmsg fails, and
- * -1 if sorecvmsg returns success but uio_resid != 0
- */
-int
-smb_iov_sorecv(struct sonode *so, iovec_t *iop, int iovlen, size_t total_len)
-{
- struct msghdr msg;
- struct uio uio;
- int error;
-
- ASSERT(iop != NULL);
-
- /* Initialization of the message header. */
- bzero(&msg, sizeof (msg));
- msg.msg_iov = iop;
- msg.msg_flags = MSG_WAITALL;
- msg.msg_iovlen = iovlen;
-
- /* Initialization of the uio structure. */
- bzero(&uio, sizeof (uio));
- uio.uio_iov = iop;
- uio.uio_iovcnt = iovlen;
- uio.uio_segflg = UIO_SYSSPACE;
- uio.uio_resid = total_len;
-
- if ((error = sorecvmsg(so, &msg, &uio)) == 0) {
- /* Received data */
- if (uio.uio_resid == 0) {
- /* All requested data received. Success */
- return (0);
- } else {
- /* Not all data was sent. Failure */
- return (-1);
- }
- }
-
- /* Receive failed */
- return (error);
+ return ((recvd == len) ? 0 : -1);
}
/*
@@ -327,13 +234,12 @@ smb_net_txr_free(smb_txreq_t *txr)
* queued and the routine returns immediately.
*/
int
-smb_net_txr_send(struct sonode *so, smb_txlst_t *txl, smb_txreq_t *txr)
+smb_net_txr_send(ksocket_t so, smb_txlst_t *txl, smb_txreq_t *txr)
{
list_t local;
int rc = 0;
- iovec_t iov;
- struct msghdr msg;
- struct uio uio;
+ size_t sent = 0;
+ size_t len;
ASSERT(txl->tl_magic == SMB_TXLST_MAGIC);
@@ -355,25 +261,11 @@ smb_net_txr_send(struct sonode *so, smb_txlst_t *txl, smb_txreq_t *txr)
ASSERT(txr->tr_magic == SMB_TXREQ_MAGIC);
list_remove(&local, txr);
- iov.iov_base = (void *)txr->tr_buf;
- iov.iov_len = txr->tr_len;
-
- bzero(&msg, sizeof (msg));
- msg.msg_iov = &iov;
- msg.msg_flags = MSG_WAITALL;
- msg.msg_iovlen = 1;
-
- bzero(&uio, sizeof (uio));
- uio.uio_iov = &iov;
- uio.uio_iovcnt = 1;
- uio.uio_segflg = UIO_SYSSPACE;
- uio.uio_resid = txr->tr_len;
-
- rc = sosendmsg(so, &msg, &uio);
-
+ len = txr->tr_len;
+ rc = ksocket_send(so, txr->tr_buf, txr->tr_len,
+ MSG_WAITALL, &sent, CRED());
smb_net_txr_free(txr);
-
- if ((rc == 0) && (uio.uio_resid == 0))
+ if ((rc == 0) && (sent == len))
continue;
if (rc == 0)
diff --git a/usr/src/uts/common/fs/smbsrv/smb_server.c b/usr/src/uts/common/fs/smbsrv/smb_server.c
index eb3f1d82a3..9296f123be 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_server.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_server.c
@@ -1242,7 +1242,7 @@ smb_server_listen(
int pthread_create_error)
{
int rc;
- struct sonode *s_so;
+ ksocket_t s_so;
uint32_t on = 1;
smb_session_t *session;
@@ -1263,14 +1263,16 @@ smb_server_listen(
if (ld->ld_so) {
- (void) sosetsockopt(ld->ld_so, SOL_SOCKET,
- SO_REUSEADDR, (const void *)&on, sizeof (on));
+ (void) ksocket_setsockopt(ld->ld_so, SOL_SOCKET,
+ SO_REUSEADDR, (const void *)&on, sizeof (on),
+ CRED());
- rc = sobind(ld->ld_so, (struct sockaddr *)&ld->ld_sin,
- sizeof (ld->ld_sin), 0, 0);
+ rc = ksocket_bind(ld->ld_so,
+ (struct sockaddr *)&ld->ld_sin,
+ sizeof (ld->ld_sin), CRED());
if (rc == 0) {
- rc = solisten(ld->ld_so, 20);
+ rc = ksocket_listen(ld->ld_so, 20, CRED());
if (rc < 0) {
cmn_err(CE_WARN,
"Port %d: listen failed", port);
@@ -1297,19 +1299,22 @@ smb_server_listen(
DTRACE_PROBE1(so__wait__accept, struct sonode *, ld->ld_so);
for (;;) {
- rc = soaccept(ld->ld_so, 0, &s_so);
+ rc = ksocket_accept(ld->ld_so, NULL, NULL, &s_so, CRED());
if (rc == 0) {
uint32_t txbuf_size = 128*1024;
uint32_t on = 1;
DTRACE_PROBE1(so__accept, struct sonode *, s_so);
- (void) sosetsockopt(s_so, IPPROTO_TCP, TCP_NODELAY,
- (const void *)&on, sizeof (on));
- (void) sosetsockopt(s_so, SOL_SOCKET, SO_KEEPALIVE,
- (const void *)&on, sizeof (on));
- (void) sosetsockopt(s_so, SOL_SOCKET, SO_SNDBUF,
- (const void *)&txbuf_size, sizeof (txbuf_size));
+ (void) ksocket_setsockopt(s_so, IPPROTO_TCP,
+ TCP_NODELAY, (const void *)&on, sizeof (on),
+ CRED());
+ (void) ksocket_setsockopt(s_so, SOL_SOCKET,
+ SO_KEEPALIVE, (const void *)&on, sizeof (on),
+ CRED());
+ (void) ksocket_setsockopt(s_so, SOL_SOCKET, SO_SNDBUF,
+ (const void *)&txbuf_size, sizeof (txbuf_size),
+ CRED());
/*
* Create a session for this connection.
*/
diff --git a/usr/src/uts/common/fs/smbsrv/smb_session.c b/usr/src/uts/common/fs/smbsrv/smb_session.c
index f76c6d77d1..571dee63c3 100644
--- a/usr/src/uts/common/fs/smbsrv/smb_session.c
+++ b/usr/src/uts/common/fs/smbsrv/smb_session.c
@@ -634,11 +634,10 @@ smb_session_message(smb_session_t *session)
* Port will be SSN_SRVC_TCP_PORT or SMB_SRVC_TCP_PORT.
*/
smb_session_t *
-smb_session_create(struct sonode *new_so, uint16_t port, smb_server_t *sv)
+smb_session_create(ksocket_t new_so, uint16_t port, smb_server_t *sv)
{
- uint32_t ipaddr;
- uint32_t local_ipaddr;
struct sockaddr_in sin;
+ socklen_t slen;
smb_session_t *session;
session = kmem_cache_alloc(sv->si_cache_session, KM_SLEEP);
@@ -670,13 +669,18 @@ smb_session_create(struct sonode *new_so, uint16_t port, smb_server_t *sv)
smb_rwx_init(&session->s_lock);
if (new_so) {
- bcopy(new_so->so_faddr_sa, &sin, new_so->so_faddr_len);
- ipaddr = sin.sin_addr.s_addr;
- bcopy(new_so->so_laddr_sa, &sin, new_so->so_faddr_len);
- local_ipaddr = sin.sin_addr.s_addr;
+ slen = sizeof (sin);
+
+ (void) ksocket_getsockname(new_so, (struct sockaddr *)&sin,
+ &slen, CRED());
+ session->local_ipaddr = sin.sin_addr.s_addr;
+
+ slen = sizeof (sin);
+ (void) ksocket_getpeername(new_so, (struct sockaddr *)&sin,
+ &slen, CRED());
+ session->ipaddr = sin.sin_addr.s_addr;
+
session->s_local_port = port;
- session->ipaddr = ipaddr;
- session->local_ipaddr = local_ipaddr;
session->sock = new_so;
}
diff --git a/usr/src/uts/common/fs/sockfs/nl7c.c b/usr/src/uts/common/fs/sockfs/nl7c.c
index 002d111c3a..fe3619ab6c 100644
--- a/usr/src/uts/common/fs/sockfs/nl7c.c
+++ b/usr/src/uts/common/fs/sockfs/nl7c.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* NL7C (Network Layer 7 Cache) as part of SOCKFS provides an in-kernel
* gateway cache for the request/response message based L7 protocol HTTP
@@ -57,6 +55,7 @@
#include <netinet/in.h>
#include <fs/sockfs/nl7c.h>
#include <fs/sockfs/nl7curi.h>
+#include <fs/sockfs/socktpi.h>
#include <inet/nca/ncadoorhdr.h>
#include <inet/nca/ncalogd.h>
@@ -90,7 +89,7 @@ extern void nl7c_nca_init(void);
*
* This list is searched at bind(3SOCKET) time when an application doesn't
* explicitly set AF_NCA but instead uses AF_INET, if a match is found then
- * the underlying socket is marked so_nl7c_flags NL7C_ENABLED.
+ * the underlying socket is marked sti_nl7c_flags NL7C_ENABLED.
*/
typedef struct nl7c_addr_s {
@@ -121,7 +120,7 @@ nl7c_listener_addr(void *arg, struct sonode *so)
if (p->listener == NULL)
p->listener = so;
- so->so_nl7c_addr = arg;
+ SOTOTPI(so)->sti_nl7c_addr = arg;
}
struct sonode *
@@ -256,7 +255,7 @@ nl7c_mi_report_addr(mblk_t *mp)
int a4 = ip & 0xFF;
(void) mi_sprintf(addr, "%d.%d.%d.%d",
- a1, a2, a3, a4);
+ a1, a2, a3, a4);
}
so = p->listener;
(void) mi_mpprintf(mp, "%p %s:%d %d",
@@ -398,7 +397,7 @@ ncaportconf_read(void)
if (ret != 0) {
/* Error of some sort, tell'm about it */
cmn_err(CE_WARN, "%s: read error %d",
- portconf, ret);
+ portconf, ret);
break;
}
if (resid == sizeof (buf)) {
@@ -564,7 +563,7 @@ ncakmodconf_read(void)
if (ret != 0) {
/* Error of some sort, tell'm about it */
cmn_err(CE_WARN, "%s: read error %d",
- status, ret);
+ status, ret);
break;
}
if (resid == sizeof (buf)) {
@@ -687,7 +686,7 @@ ncalogdconf_read(void)
if (ret != 0) {
/* Error of some sort, tell'm about it */
cmn_err(CE_WARN, "%s: read error %d",
- ncalogd, ret);
+ ncalogd, ret);
break;
}
if (resid == sizeof (buf)) {
@@ -933,7 +932,8 @@ boolean_t
nl7c_process(struct sonode *so, boolean_t nonblocking)
{
vnode_t *vp = SOTOV(so);
- mblk_t *rmp = so->so_nl7c_rcv_mp;
+ sotpi_info_t *sti = SOTOTPI(so);
+ mblk_t *rmp = sti->sti_nl7c_rcv_mp;
clock_t timout;
rval_t rval;
uchar_t pri;
@@ -942,7 +942,7 @@ nl7c_process(struct sonode *so, boolean_t nonblocking)
boolean_t more;
boolean_t ret = B_FALSE;
boolean_t first = B_TRUE;
- boolean_t pollin = (so->so_nl7c_flags & NL7C_POLLIN);
+ boolean_t pollin = (sti->sti_nl7c_flags & NL7C_POLLIN);
nl7c_proc_cnt++;
@@ -950,7 +950,7 @@ nl7c_process(struct sonode *so, boolean_t nonblocking)
error = so_lock_read_intr(so, nonblocking ? FNDELAY|FNONBLOCK : 0);
if (error) {
/* Couldn't read lock, pass on this socket */
- so->so_nl7c_flags = 0;
+ sti->sti_nl7c_flags = 0;
nl7c_proc_noLRI++;
return (B_FALSE);
}
@@ -958,7 +958,7 @@ nl7c_process(struct sonode *so, boolean_t nonblocking)
mutex_exit(&so->so_lock);
if (pollin)
- so->so_nl7c_flags &= ~NL7C_POLLIN;
+ sti->sti_nl7c_flags &= ~NL7C_POLLIN;
/* Initialize some kstrgetmsg() constants */
pflag = MSG_ANY | MSG_DELAYERROR;
@@ -966,7 +966,7 @@ nl7c_process(struct sonode *so, boolean_t nonblocking)
if (nonblocking) {
/* Non blocking so don't block */
timout = 0;
- } else if (so->so_nl7c_flags & NL7C_SOPERSIST) {
+ } else if (sti->sti_nl7c_flags & NL7C_SOPERSIST) {
/* 2nd or more time(s) here so use keep-alive value */
timout = nca_http_keep_alive_timeout;
} else {
@@ -996,18 +996,18 @@ nl7c_process(struct sonode *so, boolean_t nonblocking)
/* Error of some sort */
nl7c_proc_error++;
rval.r_v.r_v2 = error;
- so->so_nl7c_flags = 0;
+ sti->sti_nl7c_flags = 0;
break;
}
error = 0;
}
if (rmp != NULL) {
- mblk_t *mp = so->so_nl7c_rcv_mp;
+ mblk_t *mp = sti->sti_nl7c_rcv_mp;
if (mp == NULL) {
/* Just new data, common case */
- so->so_nl7c_rcv_mp = rmp;
+ sti->sti_nl7c_rcv_mp = rmp;
} else {
/* Add new data to tail */
while (mp->b_cont != NULL)
@@ -1015,13 +1015,14 @@ nl7c_process(struct sonode *so, boolean_t nonblocking)
mp->b_cont = rmp;
}
}
- if (so->so_nl7c_rcv_mp == NULL) {
+ if (sti->sti_nl7c_rcv_mp == NULL) {
/* No data */
nl7c_proc_nodata++;
if (timout > 0 || (first && pollin)) {
/* Expected data so EOF */
ret = B_TRUE;
- } else if (so->so_nl7c_flags & NL7C_SOPERSIST) {
+ } else if (sti->sti_nl7c_flags &
+ NL7C_SOPERSIST) {
/* Persistent so just checking */
ret = B_FALSE;
}
@@ -1035,7 +1036,7 @@ nl7c_process(struct sonode *so, boolean_t nonblocking)
more = nl7c_parse(so, nonblocking, &ret);
- if (ret == B_TRUE && (so->so_nl7c_flags & NL7C_SOPERSIST)) {
+ if (ret == B_TRUE && (sti->sti_nl7c_flags & NL7C_SOPERSIST)) {
/*
* Parse complete, cache hit, response on its way,
* socket is persistent so try to process the next
@@ -1045,7 +1046,7 @@ nl7c_process(struct sonode *so, boolean_t nonblocking)
ret = B_FALSE;
break;
}
- if (so->so_nl7c_rcv_mp) {
+ if (sti->sti_nl7c_rcv_mp) {
/* More recv-side data, pipelined */
nl7c_proc_again++;
goto again;
@@ -1061,10 +1062,10 @@ nl7c_process(struct sonode *so, boolean_t nonblocking)
} while (more);
- if (so->so_nl7c_rcv_mp) {
+ if (sti->sti_nl7c_rcv_mp) {
nl7c_proc_rcv++;
}
- so->so_nl7c_rcv_rval = rval.r_vals;
+ sti->sti_nl7c_rcv_rval = rval.r_vals;
/* Renter so_lock, caller called with it enter()ed */
mutex_enter(&so->so_lock);
so_unlock_read(so);
diff --git a/usr/src/uts/common/fs/sockfs/nl7c.h b/usr/src/uts/common/fs/sockfs/nl7c.h
index 68914a3a58..6cd27c5efd 100644
--- a/usr/src/uts/common/fs/sockfs/nl7c.h
+++ b/usr/src/uts/common/fs/sockfs/nl7c.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_SOCKFS_NL7C_H
#define _SYS_SOCKFS_NL7C_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -39,8 +37,17 @@ extern "C" {
#include <sys/socket.h>
#include <sys/socketvar.h>
+
/*
- * NL7C (uint64_t)(struct sonode).so_nl7c_flags:
+ * NCA_DEV NCA device
+ *
+ * NCA_INET_DEV TPI device for the INET based transport that NCA will use.
+ */
+#define NCA_DEV "/dev/nca"
+#define NCA_INET_DEV "/dev/tcp"
+
+/*
+ * NL7C (uint64_t)(sotpi_info_t).sti_nl7c_flags:
*/
#define NL7C_ENABLED 0x00000001 /* NL7C enabled socket */
@@ -71,6 +78,10 @@ void nl7c_urifree(struct sonode *);
void nl7c_close(struct sonode *);
boolean_t nl7c_parse(struct sonode *, boolean_t, boolean_t *);
+extern void *nl7c_lookup_addr(void *, t_uscalar_t);
+extern void *nl7c_add_addr(void *, t_uscalar_t);
+extern void nl7c_listener_addr(void *, struct sonode *);
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/common/fs/sockfs/nl7chttp.c b/usr/src/uts/common/fs/sockfs/nl7chttp.c
index 20f726a4c2..81dd8a99a5 100644
--- a/usr/src/uts/common/fs/sockfs/nl7chttp.c
+++ b/usr/src/uts/common/fs/sockfs/nl7chttp.c
@@ -19,16 +19,15 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/sysmacros.h>
#include <sys/strsubr.h>
#include <fs/sockfs/nl7c.h>
#include <fs/sockfs/nl7curi.h>
+#include <fs/sockfs/socktpi.h>
#include <inet/nca/ncadoorhdr.h>
#include <inet/nca/ncalogd.h>
@@ -578,7 +577,7 @@ http_date2time_t(char *cp, char *ep)
leap--;
leap = leap / 4 - leap / 100 + leap / 400 - zeroleap;
secs = ((((year - 1970) * 365 + dom[month] + day - 1 + leap) * 24
- + hour) * 60 + min) * 60 + sec;
+ + hour) * 60 + min) * 60 + sec;
return (secs);
}
@@ -1167,7 +1166,7 @@ nl7c_http_cond(uri_desc_t *req, uri_desc_t *res)
mblk_t *
nl7c_http_persist(struct sonode *so)
{
- uint64_t flags = so->so_nl7c_flags & NL7C_SCHEMEPRIV;
+ uint64_t flags = SOTOTPI(so)->sti_nl7c_flags & NL7C_SCHEMEPRIV;
mblk_t *mp;
if (flags & HTTP_CONN_CL)
@@ -1187,6 +1186,7 @@ nl7c_http_persist(struct sonode *so)
boolean_t
nl7c_http_request(char **cpp, char *ep, uri_desc_t *uri, struct sonode *so)
{
+ sotpi_info_t *sti = SOTOTPI(so);
http_t *http = kmem_cache_alloc(http_kmc, KM_SLEEP);
char *cp = *cpp;
char *hp;
@@ -1429,20 +1429,20 @@ done:
*
*/
if (persist)
- so->so_nl7c_flags |= NL7C_SOPERSIST;
+ sti->sti_nl7c_flags |= NL7C_SOPERSIST;
else
- so->so_nl7c_flags &= ~NL7C_SOPERSIST;
+ sti->sti_nl7c_flags &= ~NL7C_SOPERSIST;
if (http->major == 1) {
- so->so_nl7c_flags &= ~NL7C_SCHEMEPRIV;
+ sti->sti_nl7c_flags &= ~NL7C_SCHEMEPRIV;
if (http->minor >= 1) {
if (! persist)
- so->so_nl7c_flags |= HTTP_CONN_CL;
+ sti->sti_nl7c_flags |= HTTP_CONN_CL;
} else {
if (persist)
- so->so_nl7c_flags |= HTTP_CONN_KA;
+ sti->sti_nl7c_flags |= HTTP_CONN_KA;
else
- so->so_nl7c_flags |= HTTP_CONN_CL;
+ sti->sti_nl7c_flags |= HTTP_CONN_CL;
}
}
/*
@@ -1464,6 +1464,7 @@ more:
boolean_t
nl7c_http_response(char **cpp, char *ep, uri_desc_t *uri, struct sonode *so)
{
+ sotpi_info_t *sti = SOTOTPI(so);
http_t *http = uri->scheme;
char *cp = *cpp;
char *hp;
@@ -1753,20 +1754,20 @@ done:
/* Set socket persist state */
if (persist)
- so->so_nl7c_flags |= NL7C_SOPERSIST;
+ sti->sti_nl7c_flags |= NL7C_SOPERSIST;
else
- so->so_nl7c_flags &= ~NL7C_SOPERSIST;
+ sti->sti_nl7c_flags &= ~NL7C_SOPERSIST;
if (http->major == 1) {
- so->so_nl7c_flags &= ~NL7C_SCHEMEPRIV;
+ sti->sti_nl7c_flags &= ~NL7C_SCHEMEPRIV;
if (http->minor >= 1) {
if (! persist)
- so->so_nl7c_flags |= HTTP_CONN_CL;
+ sti->sti_nl7c_flags |= HTTP_CONN_CL;
} else {
if (persist)
- so->so_nl7c_flags |= HTTP_CONN_KA;
+ sti->sti_nl7c_flags |= HTTP_CONN_KA;
else
- so->so_nl7c_flags |= HTTP_CONN_CL;
+ sti->sti_nl7c_flags |= HTTP_CONN_CL;
}
}
diff --git a/usr/src/uts/common/fs/sockfs/nl7curi.c b/usr/src/uts/common/fs/sockfs/nl7curi.c
index fb1bf2f000..61f72258fc 100644
--- a/usr/src/uts/common/fs/sockfs/nl7curi.c
+++ b/usr/src/uts/common/fs/sockfs/nl7curi.c
@@ -33,6 +33,7 @@
#include <sys/sendfile.h>
#include <fs/sockfs/nl7c.h>
#include <fs/sockfs/nl7curi.h>
+#include <fs/sockfs/socktpi_impl.h>
#include <inet/common.h>
#include <inet/ip.h>
@@ -1017,9 +1018,10 @@ next:
void
nl7c_urifree(struct sonode *so)
{
- uri_desc_t *uri = (uri_desc_t *)so->so_nl7c_uri;
+ sotpi_info_t *sti = SOTOTPI(so);
+ uri_desc_t *uri = (uri_desc_t *)sti->sti_nl7c_uri;
- so->so_nl7c_uri = NULL;
+ sti->sti_nl7c_uri = NULL;
if (uri->hash != URI_TEMP) {
uri_delete(uri);
mutex_enter(&uri->proclock);
@@ -1109,7 +1111,8 @@ pass:
int
nl7c_data(struct sonode *so, uio_t *uio)
{
- uri_desc_t *uri = (uri_desc_t *)so->so_nl7c_uri;
+ sotpi_info_t *sti = SOTOTPI(so);
+ uri_desc_t *uri = (uri_desc_t *)sti->sti_nl7c_uri;
iovec_t *iov;
int cnt;
int sz = uio->uio_resid;
@@ -1123,13 +1126,13 @@ nl7c_data(struct sonode *so, uio_t *uio)
if (uri == NULL) {
/* Socket & NL7C out of sync, disable NL7C */
- so->so_nl7c_flags = 0;
+ sti->sti_nl7c_flags = 0;
nl7c_uri_NULL1++;
return (-1);
}
- if (so->so_nl7c_flags & NL7C_WAITWRITE) {
- so->so_nl7c_flags &= ~NL7C_WAITWRITE;
+ if (sti->sti_nl7c_flags & NL7C_WAITWRITE) {
+ sti->sti_nl7c_flags &= ~NL7C_WAITWRITE;
first = B_TRUE;
} else {
first = B_FALSE;
@@ -1191,9 +1194,9 @@ nl7c_data(struct sonode *so, uio_t *uio)
* so close the URI processing for this so.
*/
nl7c_close(so);
- if (! (so->so_nl7c_flags & NL7C_SOPERSIST)) {
+ if (! (sti->sti_nl7c_flags & NL7C_SOPERSIST)) {
/* Not a persistent connection */
- so->so_nl7c_flags = 0;
+ sti->sti_nl7c_flags = 0;
}
}
@@ -1203,7 +1206,7 @@ fail:
if (alloc != NULL) {
kmem_free(alloc, sz);
}
- so->so_nl7c_flags = 0;
+ sti->sti_nl7c_flags = 0;
nl7c_urifree(so);
return (error);
@@ -1275,7 +1278,8 @@ int
nl7c_sendfilev(struct sonode *so, u_offset_t *fileoff, sendfilevec_t *sfvp,
int sfvc, ssize_t *xfer)
{
- uri_desc_t *uri = (uri_desc_t *)so->so_nl7c_uri;
+ sotpi_info_t *sti = SOTOTPI(so);
+ uri_desc_t *uri = (uri_desc_t *)sti->sti_nl7c_uri;
file_t *fp = NULL;
vnode_t *vp = NULL;
char *data = NULL;
@@ -1294,13 +1298,13 @@ nl7c_sendfilev(struct sonode *so, u_offset_t *fileoff, sendfilevec_t *sfvp,
if (uri == NULL) {
/* Socket & NL7C out of sync, disable NL7C */
- so->so_nl7c_flags = 0;
+ sti->sti_nl7c_flags = 0;
nl7c_uri_NULL2++;
return (0);
}
- if (so->so_nl7c_flags & NL7C_WAITWRITE)
- so->so_nl7c_flags &= ~NL7C_WAITWRITE;
+ if (sti->sti_nl7c_flags & NL7C_WAITWRITE)
+ sti->sti_nl7c_flags &= ~NL7C_WAITWRITE;
while (sfvc-- > 0) {
/*
@@ -1435,15 +1439,18 @@ nl7c_sendfilev(struct sonode *so, u_offset_t *fileoff, sendfilevec_t *sfvp,
* so close the URI processing for this so.
*/
nl7c_close(so);
- if (! (so->so_nl7c_flags & NL7C_SOPERSIST)) {
+ if (! (sti->sti_nl7c_flags & NL7C_SOPERSIST)) {
/* Not a persistent connection */
- so->so_nl7c_flags = 0;
+ sti->sti_nl7c_flags = 0;
}
}
return (0);
fail:
+ if (error == EPIPE)
+ tsignal(curthread, SIGPIPE);
+
if (alloc != NULL)
kmem_free(data, len);
@@ -1457,7 +1464,7 @@ fail:
atomic_add_64(&nl7c_uri_bytes, total_count);
}
- so->so_nl7c_flags = 0;
+ sti->sti_nl7c_flags = 0;
nl7c_urifree(so);
return (error);
@@ -1472,7 +1479,8 @@ fail:
void
nl7c_close(struct sonode *so)
{
- uri_desc_t *uri = (uri_desc_t *)so->so_nl7c_uri;
+ sotpi_info_t *sti = SOTOTPI(so);
+ uri_desc_t *uri = (uri_desc_t *)sti->sti_nl7c_uri;
if (uri == NULL) {
/*
@@ -1484,7 +1492,7 @@ nl7c_close(struct sonode *so)
}
return;
}
- so->so_nl7c_uri = NULL;
+ sti->sti_nl7c_uri = NULL;
if (uri->hash != URI_TEMP) {
mutex_enter(&uri->proclock);
uri->proc = NULL;
@@ -1679,7 +1687,6 @@ kstrwritempnoqwait(struct vnode *vp, mblk_t *mp)
if (error != 0) {
if (!(stp->sd_flag & STPLEX) &&
(stp->sd_wput_opt & SW_SIGPIPE)) {
- tsignal(curthread, SIGPIPE);
error = EPIPE;
}
return (error);
@@ -1700,7 +1707,7 @@ uri_rd_response(struct sonode *so,
boolean_t first)
{
vnode_t *vp = SOTOV(so);
- int max_mblk = (int)((tcp_t *)so->so_priv)->tcp_mss;
+ int max_mblk = (int)vp->v_stream->sd_maxblk;
int wsz;
mblk_t *mp, *wmp, *persist;
int write_bytes;
@@ -1934,8 +1941,9 @@ static char pchars[] = {
boolean_t
nl7c_parse(struct sonode *so, boolean_t nonblocking, boolean_t *ret)
{
- char *cp = (char *)so->so_nl7c_rcv_mp->b_rptr;
- char *ep = (char *)so->so_nl7c_rcv_mp->b_wptr;
+ sotpi_info_t *sti = SOTOTPI(so);
+ char *cp = (char *)sti->sti_nl7c_rcv_mp->b_rptr;
+ char *ep = (char *)sti->sti_nl7c_rcv_mp->b_wptr;
char *get = "GET ";
char *post = "POST ";
char c;
@@ -1945,7 +1953,7 @@ nl7c_parse(struct sonode *so, boolean_t nonblocking, boolean_t *ret)
mblk_t *reqmp;
uint32_t hv = 0;
- if ((reqmp = dupb(so->so_nl7c_rcv_mp)) == NULL) {
+ if ((reqmp = dupb(sti->sti_nl7c_rcv_mp)) == NULL) {
nl7c_uri_pass_dupbfail++;
goto pass;
}
@@ -1965,7 +1973,7 @@ nl7c_parse(struct sonode *so, boolean_t nonblocking, boolean_t *ret)
/*
* Set request time to current time.
*/
- so->so_nl7c_rtime = gethrestime_sec();
+ sti->sti_nl7c_rtime = gethrestime_sec();
/*
* Parse the Request-Line for the URI.
@@ -2043,7 +2051,7 @@ nl7c_parse(struct sonode *so, boolean_t nonblocking, boolean_t *ret)
}
if (uri->hash == URI_TEMP) {
- if (so->so_nl7c_flags & NL7C_SOPERSIST) {
+ if (sti->sti_nl7c_flags & NL7C_SOPERSIST) {
/* Temporary URI so skip hash processing */
nl7c_uri_request++;
nl7c_uri_temp++;
@@ -2073,10 +2081,10 @@ nl7c_parse(struct sonode *so, boolean_t nonblocking, boolean_t *ret)
* We have the response cached, update recv mblk rptr
* to reflect the data consumed in parse.
*/
- mblk_t *mp = so->so_nl7c_rcv_mp;
+ mblk_t *mp = sti->sti_nl7c_rcv_mp;
if (cp == (char *)mp->b_wptr) {
- so->so_nl7c_rcv_mp = mp->b_cont;
+ sti->sti_nl7c_rcv_mp = mp->b_cont;
mp->b_cont = NULL;
freeb(mp);
} else {
@@ -2094,12 +2102,12 @@ nl7c_parse(struct sonode *so, boolean_t nonblocking, boolean_t *ret)
if (so->so_family == AF_INET) {
/* Only support IPv4 addrs */
faddr = ((struct sockaddr_in *)
- so->so_faddr_sa) ->sin_addr.s_addr;
+ sti->sti_faddr_sa) ->sin_addr.s_addr;
} else {
faddr = 0;
}
/* XXX need to pass response type, e.g. 200, 304 */
- nl7c_logd_log(ruri, uri, so->so_nl7c_rtime, faddr);
+ nl7c_logd_log(ruri, uri, sti->sti_nl7c_rtime, faddr);
}
/*
* Release reference on request URI, send the response out
@@ -2125,11 +2133,11 @@ temp:
* read-side processing is suspended (so the next read() gets
* the request data) until a write() is processed by NL7C.
*
- * Note, so->so_nl7c_uri now owns the REF_INIT() ref.
+ * Note, sti->sti_nl7c_uri now owns the REF_INIT() ref.
*/
uri->proc = so;
- so->so_nl7c_uri = uri;
- so->so_nl7c_flags |= NL7C_WAITWRITE;
+ sti->sti_nl7c_uri = uri;
+ sti->sti_nl7c_flags |= NL7C_WAITWRITE;
*ret = B_FALSE;
return (B_FALSE);
@@ -2147,7 +2155,7 @@ pass:
if (uri) {
REF_RELE(uri);
}
- so->so_nl7c_flags = 0;
+ sti->sti_nl7c_flags = 0;
*ret = B_FALSE;
return (B_FALSE);
}
diff --git a/usr/src/uts/common/fs/sockfs/sockcommon.c b/usr/src/uts/common/fs/sockfs/sockcommon.c
new file mode 100644
index 0000000000..02c3c16df5
--- /dev/null
+++ b/usr/src/uts/common/fs/sockfs/sockcommon.c
@@ -0,0 +1,1092 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/debug.h>
+#include <sys/cmn_err.h>
+#include <sys/vfs.h>
+#include <sys/policy.h>
+#include <sys/modctl.h>
+
+#include <sys/sunddi.h>
+
+#include <sys/strsun.h>
+#include <sys/stropts.h>
+#include <sys/strsubr.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sodirect.h>
+#include <sys/uio.h>
+
+#include <inet/ipclassifier.h>
+#include <fs/sockfs/sockcommon.h>
+#include <fs/sockfs/nl7c.h>
+#include <inet/ip.h>
+
+extern int xnet_skip_checks, xnet_check_print, xnet_truncate_print;
+
+static struct kmem_cache *sock_sod_cache;
+
+/*
+ * Common socket access functions.
+ *
+ * Instead of accessing the sonode switch directly (i.e., SOP_xxx()),
+ * the socket_xxx() functions should be used.
+ */
+
+/*
+ * Create a new sonode for the requested <family, type, protocol>.
+ *
+ * A sockparams entry is looked up first.  When none matches, an ephemeral
+ * entry is created from devpath if one was given, otherwise from mod; with
+ * neither, NULL is returned.  The socket module's create function is then
+ * called, followed by SOP_INIT() and VOP_OPEN().
+ *
+ * Returns the new sonode, or NULL on failure.  *errorp is written by
+ * solookup(), is handed to the ephemeral-entry and create functions, and
+ * is finally set from the result of SOP_INIT().
+ */
+/* ARGSUSED */
+struct sonode *
+socket_create(int family, int type, int protocol, char *devpath, char *mod,
+    int flags, int version, struct cred *cr, int *errorp)
+{
+	struct sockparams *params = NULL;
+	struct sonode *newso;
+
+	/*
+	 * Look for a sockparams entry that matches the given criteria.
+	 * solookup() returns with the entry held.
+	 */
+	*errorp = solookup(family, type, protocol, &params);
+	if (params == NULL) {
+		int kmflags;
+
+		if (flags == SOCKET_SLEEP)
+			kmflags = KM_SLEEP;
+		else
+			kmflags = KM_NOSLEEP;
+
+		/*
+		 * There is no matching sockparams entry.  An ephemeral
+		 * entry can only be created if the caller named a device
+		 * or a socket module.
+		 */
+		if (devpath == NULL && mod == NULL)
+			return (NULL);
+
+		if (devpath != NULL) {
+			params = sockparams_hold_ephemeral_bydev(family, type,
+			    protocol, devpath, kmflags, errorp);
+		} else {
+			params = sockparams_hold_ephemeral_bymod(family, type,
+			    protocol, mod, kmflags, errorp);
+		}
+
+		if (params == NULL)
+			return (NULL);
+	}
+
+	ASSERT(params->sp_smod_info != NULL);
+	ASSERT(flags == SOCKET_SLEEP || flags == SOCKET_NOSLEEP);
+
+	newso = params->sp_smod_info->smod_sock_create_func(params, family,
+	    type, protocol, version, flags, errorp, cr);
+	if (newso == NULL) {
+		/* No sonode was created, so release the sockparams entry. */
+		SOCKPARAMS_DEC_REF(params);
+		return (NULL);
+	}
+
+	*errorp = SOP_INIT(newso, NULL, cr, flags);
+	if (*errorp != 0) {
+		socket_destroy(newso);
+		return (NULL);
+	}
+
+	/* Cannot fail, only bumps so_count */
+	(void) VOP_OPEN(&SOTOV(newso), FREAD|FWRITE, cr, NULL);
+	return (newso);
+}
+
+/*
+ * Create a sonode for a new connection arriving on `parent'.
+ *
+ * The new sonode is created by the parent's socket module using the
+ * parent's family, type, protocol and version.  It is given the lower
+ * handle lh and downcalls dc before SOP_INIT() is called with the parent.
+ *
+ * Returns the new sonode, or NULL on failure.  *errorp is handed to the
+ * create function and then set from the result of SOP_INIT().
+ */
+struct sonode *
+socket_newconn(struct sonode *parent, sock_lower_handle_t lh,
+    sock_downcalls_t *dc, int flags, int *errorp)
+{
+	struct sockparams *params = parent->so_sockparams;
+	struct sonode *newso;
+	struct cred *cred;
+
+	/*
+	 * This function may be called in interrupt context, and CRED()
+	 * will be NULL.  In this case, use kcred.
+	 */
+	cred = CRED();
+	if (cred == NULL)
+		cred = kcred;
+
+	ASSERT(params != NULL);
+
+	newso = params->sp_smod_info->smod_sock_create_func(params,
+	    parent->so_family, parent->so_type, parent->so_protocol,
+	    parent->so_version, flags, errorp, cred);
+	if (newso == NULL)
+		return (NULL);
+
+	/* Take a sockparams reference now that a sonode exists. */
+	SOCKPARAMS_INC_REF(params);
+
+	newso->so_proto_handle = lh;
+	newso->so_downcalls = dc;
+
+	*errorp = SOP_INIT(newso, parent, cred, flags);
+	if (*errorp != 0) {
+		socket_destroy(newso);
+		return (NULL);
+	}
+
+	/* Cannot fail, only bumps so_count */
+	(void) VOP_OPEN(&SOTOV(newso), FREAD|FWRITE, cred, NULL);
+	return (newso);
+}
+
+/*
+ * Bind the socket to a local address.  The request is passed unchanged
+ * to the sonode's bind operation and its result is returned.
+ */
+int
+socket_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
+    int flags, cred_t *cr)
+{
+	int error;
+
+	error = SOP_BIND(so, name, namelen, flags, cr);
+	return (error);
+}
+
+/*
+ * Put the socket into the listening state.
+ *
+ * A negative backlog is treated as 0.  The value handed to the sonode's
+ * listen operation is 1.5 times the requested backlog plus one, clamped
+ * to INT_MAX when the scaling would overflow.
+ */
+int
+socket_listen(struct sonode *so, int backlog, cred_t *cr)
+{
+	int qlimit;
+
+	qlimit = (backlog < 0) ? 0 : backlog;
+
+	/*
+	 * Use the same qlimit as in BSD. BSD checks the qlimit
+	 * before queuing the next connection implying that a
+	 * listen(sock, 0) allows one connection to be queued.
+	 * BSD also uses 1.5 times the requested backlog.
+	 *
+	 * XNS Issue 4 required a strict interpretation of the backlog.
+	 * This has been waived subsequently for Issue 4 and the change
+	 * incorporated in XNS Issue 5. So we aren't required to do
+	 * anything special for XPG apps.
+	 */
+	if (qlimit < (INT_MAX - 1) / 3)
+		qlimit = qlimit * 3 / 2 + 1;
+	else
+		qlimit = INT_MAX;
+
+	return (SOP_LISTEN(so, qlimit, cr));
+}
+
+/*
+ * Accept an incoming connection on the listening socket lso.  The
+ * request goes straight to the sonode's accept operation, which is
+ * given nsop for the accepted sonode.
+ */
+int
+socket_accept(struct sonode *lso, int fflag, cred_t *cr, struct sonode **nsop)
+{
+	int error;
+
+	error = SOP_ACCEPT(lso, fflag, cr, nsop);
+	return (error);
+}
+
+/*
+ * Active open.
+ *
+ * An AF_UNSPEC name is converted to a null address before the sonode's
+ * connect operation is called.  For XPG4.2 callers an EHOSTUNREACH
+ * result is reported as ENETUNREACH.
+ */
+int
+socket_connect(struct sonode *so, const struct sockaddr *name,
+    socklen_t namelen, int fflag, int flags, cred_t *cr)
+{
+	int error;
+
+	/*
+	 * Handle a connect to a name parameter of type AF_UNSPEC like a
+	 * connect to a null address. This is the portable method to
+	 * unconnect a socket.  The family is only examined when namelen
+	 * is large enough to cover it.
+	 */
+	if (namelen >= sizeof (sa_family_t)) {
+		if (name->sa_family == AF_UNSPEC) {
+			name = NULL;
+			namelen = 0;
+		}
+	}
+
+	error = SOP_CONNECT(so, name, namelen, fflag, flags, cr);
+
+	/*
+	 * X/Open specification contains a requirement that
+	 * ENETUNREACH be returned but does not require
+	 * EHOSTUNREACH. In order to keep the test suite
+	 * happy we mess with the errno here.
+	 */
+	if (error == EHOSTUNREACH && (flags & _SOCONNECT_XPG4_2) != 0)
+		error = ENETUNREACH;
+
+	return (error);
+}
+
+/*
+ * Get the address of the remote node.  The caller must supply a
+ * non-zero *addrlen; the request is then passed to the sonode's
+ * getpeername operation.
+ */
+int
+socket_getpeername(struct sonode *so, struct sockaddr *addr,
+    socklen_t *addrlen, boolean_t accept, cred_t *cr)
+{
+	int error;
+
+	ASSERT(*addrlen > 0);
+
+	error = SOP_GETPEERNAME(so, addr, addrlen, accept, cr);
+	return (error);
+}
+
+/*
+ * Get the local address of the socket through the sonode's
+ * getsockname operation.
+ */
+int
+socket_getsockname(struct sonode *so, struct sockaddr *addr,
+    socklen_t *addrlen, cred_t *cr)
+{
+	int error;
+
+	error = SOP_GETSOCKNAME(so, addr, addrlen, cr);
+	return (error);
+}
+
+/*
+ * Called from shutdown().  Passes `how' to the sonode's shutdown
+ * operation and returns its result.
+ */
+int
+socket_shutdown(struct sonode *so, int how, cred_t *cr)
+{
+	int error;
+
+	error = SOP_SHUTDOWN(so, how, cr);
+	return (error);
+}
+
+/*
+ * Get a socket option.  All arguments are handed unchanged to the
+ * sonode's getsockopt operation.
+ */
+/*ARGSUSED*/
+int
+socket_getsockopt(struct sonode *so, int level, int option_name,
+    void *optval, socklen_t *optlenp, int flags, cred_t *cr)
+{
+	int error;
+
+	error = SOP_GETSOCKOPT(so, level, option_name, optval, optlenp,
+	    flags, cr);
+	return (error);
+}
+
+/*
+ * Set a socket option.
+ *
+ * Zero-length options are rejected with EINVAL; anything else goes to
+ * the sonode's setsockopt operation.
+ */
+int
+socket_setsockopt(struct sonode *so, int level, int option_name,
+    const void *optval, t_uscalar_t optlen, cred_t *cr)
+{
+	int error = EINVAL;
+
+	/* Caller allocates aligned optval, or passes null */
+	ASSERT(((uintptr_t)optval & (sizeof (t_scalar_t) - 1)) == 0);
+	/* If optval is null optlen is 0, and vice-versa */
+	ASSERT(optval != NULL || optlen == 0);
+	ASSERT(optlen != 0 || optval == NULL);
+
+	/* No options should be zero-length */
+	if (optlen != 0) {
+		error = SOP_SETSOCKOPT(so, level, option_name, optval,
+		    optlen, cr);
+	}
+
+	return (error);
+}
+
+/*
+ * Send a message through the sonode's sendmsg operation.
+ *
+ * An EINTR, ETIME or EWOULDBLOCK failure is turned into success when
+ * uio_resid shows that some data was consumed.  On EPIPE, SIGPIPE is
+ * posted to the current thread unless the socket is marked SM_KERNEL.
+ */
+int
+socket_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
+    cred_t *cr)
+{
+	ssize_t resid_before = uiop->uio_resid;
+	int error;
+
+	/*
+	 * Do not bypass the cache if we are doing a local (AF_UNIX) write.
+	 */
+	if (so->so_family != AF_UNIX)
+		uiop->uio_extflg &= ~UIO_COPY_CACHED;
+	else
+		uiop->uio_extflg |= UIO_COPY_CACHED;
+
+	error = SOP_SENDMSG(so, msg, uiop, cr);
+
+	if (error == EINTR || error == ETIME || error == EWOULDBLOCK) {
+		/* We did a partial send */
+		if (uiop->uio_resid != resid_before)
+			error = 0;
+	} else if (error == EPIPE) {
+		if ((so->so_mode & SM_KERNEL) == 0)
+			tsignal(curthread, SIGPIPE);
+	}
+
+	return (error);
+}
+
+/*
+ * Send the message block(s) passed via mpp through the sonode's
+ * sendmblk operation.
+ *
+ * When the operation fails with EPIPE, SIGPIPE is posted to the current
+ * thread, but only for sockets that are not marked SM_KERNEL.  This is
+ * the same rule socket_sendmsg() applies, so the signal is never posted
+ * on behalf of a kernel socket, whichever send path is used.
+ *
+ * Returns the result of SOP_SENDMBLK() unchanged.
+ */
+int
+socket_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
+    struct cred *cr, mblk_t **mpp)
+{
+	int error;
+
+	error = SOP_SENDMBLK(so, msg, fflag, cr, mpp);
+	if (error == EPIPE && (so->so_mode & SM_KERNEL) == 0)
+		tsignal(curthread, SIGPIPE);
+
+	return (error);
+}
+
+/*
+ * Receive a message through the sonode's recvmsg operation.
+ *
+ * An EINTR, ETIME or EWOULDBLOCK failure is turned into success when
+ * uio_resid shows that some data was transferred.
+ */
+int
+socket_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
+    cred_t *cr)
+{
+	ssize_t resid_before = uiop->uio_resid;
+	int error;
+
+	/*
+	 * Do not bypass the cache when reading data, as the application
+	 * is likely to access the data shortly.
+	 */
+	uiop->uio_extflg |= UIO_COPY_CACHED;
+
+	error = SOP_RECVMSG(so, msg, uiop, cr);
+
+	if (error == EINTR || error == ETIME || error == EWOULDBLOCK) {
+		/* We did a partial read */
+		if (uiop->uio_resid != resid_before)
+			error = 0;
+	}
+
+	return (error);
+}
+
+/*
+ * Forward an ioctl request to the socket's ioctl operation (SOP_IOCTL)
+ * and return its result unchanged.
+ */
+int
+socket_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
+ struct cred *cr, int32_t *rvalp)
+{
+ return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
+}
+
+/*
+ * Forward a poll request to the socket's poll operation (SOP_POLL) and
+ * return its result unchanged.
+ */
+int
+socket_poll(struct sonode *so, short events, int anyyet, short *reventsp,
+ struct pollhead **phpp)
+{
+ return (SOP_POLL(so, events, anyyet, reventsp, phpp));
+}
+
+/*
+ * Close a socket through its vnode: VOP_CLOSE() is invoked on SOTOV(so)
+ * with the caller's flag and credentials, the literal arguments 1 and 0,
+ * and a NULL final argument.
+ * NOTE(review): 1/0/NULL are presumably the count, offset and caller
+ * context arguments of VOP_CLOSE() -- confirm against its definition.
+ */
+int
+socket_close(struct sonode *so, int flag, struct cred *cr)
+{
+ return (VOP_CLOSE(SOTOV(so), flag, 1, 0, cr, NULL));
+}
+
+/*
+ * Invoke the socket's close operation (SOP_CLOSE) directly and return
+ * its result.  The sonode's so_count is asserted to be zero on entry.
+ */
+int
+socket_close_internal(struct sonode *so, int flag, cred_t *cr)
+{
+ ASSERT(so->so_count == 0);
+
+ return (SOP_CLOSE(so, flag, cr));
+}
+
+/*
+ * Destroy a socket from the vnode side: call vn_invalid() on the
+ * socket's vnode and then VN_RELE() it.
+ */
+void
+socket_destroy(struct sonode *so)
+{
+ vn_invalid(SOTOV(so));
+ VN_RELE(SOTOV(so));
+}
+
+/*
+ * Hand the sonode to the destroy function registered by its socket
+ * module (sp_smod_info->smod_sock_destroy_func) and then drop one
+ * reference on the sockparams entry.  The sonode must have a zero
+ * so_count and a non-NULL so_sockparams (asserted).  `cr' is unused.
+ */
+/* ARGSUSED */
+void
+socket_destroy_internal(struct sonode *so, cred_t *cr)
+{
+ struct sockparams *sp = so->so_sockparams;
+ ASSERT(so->so_count == 0 && sp != NULL);
+
+ sp->sp_smod_info->smod_sock_destroy_func(so);
+
+ /* sp was captured before the sonode was handed to the destroy func */
+ SOCKPARAMS_DEC_REF(sp);
+}
+
+/*
+ * Constructor for sonode buffers (buf, cdrarg, kmflags signature).
+ *
+ * Allocates the vnode that backs the sonode, links the two together,
+ * installs socket_vnodeops, puts the receive/accept queue fields into
+ * their empty state and initializes the sonode's locks and condition
+ * variables.  Returns 0 on success, or -1 if vn_alloc() fails.
+ *
+ * TODO Once the common vnode ops is available, then the vnops argument
+ * should be removed.
+ * NOTE(review): this function has no "vnops" argument in its current
+ * signature; the TODO above may be stale -- confirm.
+ */
+/*ARGSUSED*/
+int
+sonode_constructor(void *buf, void *cdrarg, int kmflags)
+{
+ struct sonode *so = buf;
+ struct vnode *vp;
+
+ vp = so->so_vnode = vn_alloc(kmflags);
+ if (vp == NULL) {
+ return (-1);
+ }
+ /* Back-pointer used to get from the vnode to its sonode */
+ vp->v_data = so;
+ vn_setops(vp, socket_vnodeops);
+
+ so->so_priv = NULL;
+ so->so_oobmsg = NULL;
+
+ so->so_proto_handle = NULL;
+
+ so->so_peercred = NULL;
+
+ /* Receive queues start out empty, with no receive timer configured */
+ so->so_rcv_queued = 0;
+ so->so_rcv_q_head = NULL;
+ so->so_rcv_q_last_head = NULL;
+ so->so_rcv_head = NULL;
+ so->so_rcv_last_head = NULL;
+ so->so_rcv_wanted = 0;
+ so->so_rcv_timer_interval = SOCKET_NO_RCVTIMER;
+ so->so_rcv_timer_tid = 0;
+ so->so_rcv_thresh = 0;
+
+ /* Empty accept queue: the tail pointer refers back to the head slot */
+ so->so_acceptq_head = NULL;
+ so->so_acceptq_tail = &so->so_acceptq_head;
+ so->so_acceptq_next = NULL;
+ so->so_acceptq_len = 0;
+ so->so_backlog = 0;
+
+ so->so_snd_qfull = B_FALSE;
+
+ /* Synchronization primitives; torn down in sonode_destructor() */
+ mutex_init(&so->so_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&so->so_acceptq_lock, NULL, MUTEX_DEFAULT, NULL);
+ rw_init(&so->so_fallback_rwlock, NULL, RW_DEFAULT, NULL);
+ cv_init(&so->so_state_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&so->so_want_cv, NULL, CV_DEFAULT, NULL);
+
+ cv_init(&so->so_acceptq_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&so->so_snd_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&so->so_rcv_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&so->so_copy_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&so->so_closing_cv, NULL, CV_DEFAULT, NULL);
+
+ return (0);
+}
+
+/*
+ * Destructor for sonode buffers; the inverse of sonode_constructor().
+ *
+ * Asserts that the sonode has been returned to its constructed state
+ * (no private data, peer credential, OOB message, queued receive data
+ * or accept-queue entries, and the vnode still linked to this sonode
+ * with socket_vnodeops), then frees the vnode and destroys every lock
+ * and condition variable that sonode_constructor() initialized,
+ * including so_copy_cv.
+ */
+/*ARGSUSED*/
+void
+sonode_destructor(void *buf, void *cdrarg)
+{
+	struct sonode *so = buf;
+	struct vnode *vp = SOTOV(so);
+
+	ASSERT(so->so_priv == NULL);
+	ASSERT(so->so_peercred == NULL);
+
+	ASSERT(so->so_oobmsg == NULL);
+
+	ASSERT(so->so_rcv_q_head == NULL);
+
+	ASSERT(so->so_acceptq_head == NULL);
+	ASSERT(so->so_acceptq_tail == &so->so_acceptq_head);
+	ASSERT(so->so_acceptq_next == NULL);
+
+	ASSERT(vp->v_data == so);
+	ASSERT(vn_matchops(vp, socket_vnodeops));
+
+	vn_free(vp);
+
+	mutex_destroy(&so->so_lock);
+	mutex_destroy(&so->so_acceptq_lock);
+	rw_destroy(&so->so_fallback_rwlock);
+
+	/* One cv_destroy() for each cv_init() in sonode_constructor() */
+	cv_destroy(&so->so_state_cv);
+	cv_destroy(&so->so_want_cv);
+	cv_destroy(&so->so_acceptq_cv);
+	cv_destroy(&so->so_snd_cv);
+	cv_destroy(&so->so_rcv_cv);
+	cv_destroy(&so->so_copy_cv);
+	cv_destroy(&so->so_closing_cv);
+}
+
+/*
+ * Initialize a constructed sonode for use as a new socket.
+ *
+ * so       - sonode to initialize
+ * sp       - sockparams entry recorded in so_sockparams
+ * family, type, protocol - recorded in so_family/so_type/so_protocol
+ * sops     - sonode operations vector recorded in so_ops
+ *
+ * Resets the per-socket state, options, queue and notification fields,
+ * re-initializes the vnode as a VSOCK vnode on rootvfs with rdev
+ * sockdev, and finally calls vn_exists() on it.  Fields that the
+ * constructor (and a prior sonode_fini()) are expected to have left
+ * clean are only asserted, not reset.
+ */
+void
+sonode_init(struct sonode *so, struct sockparams *sp, int family,
+ int type, int protocol, sonodeops_t *sops)
+{
+ vnode_t *vp;
+
+ vp = SOTOV(so);
+
+ so->so_flag = 0;
+
+ so->so_state = 0;
+ so->so_mode = 0;
+
+ so->so_count = 0;
+
+ so->so_family = family;
+ so->so_type = type;
+ so->so_protocol = protocol;
+
+ SOCK_CONNID_INIT(so->so_proto_connid);
+
+ /* Socket-level options and error/timeout state start out cleared */
+ so->so_options = 0;
+ so->so_linger.l_onoff = 0;
+ so->so_linger.l_linger = 0;
+ so->so_sndbuf = 0;
+ so->so_error = 0;
+ so->so_rcvtimeo = 0;
+ so->so_sndtimeo = 0;
+
+ ASSERT(so->so_oobmsg == NULL);
+ so->so_oobmark = 0;
+ so->so_pgrp = 0;
+
+ ASSERT(so->so_peercred == NULL);
+
+ /* Record the zone ID returned by getzoneid() for this socket */
+ so->so_zoneid = getzoneid();
+
+ so->so_sockparams = sp;
+
+ so->so_ops = sops;
+
+ so->so_proto_handle = NULL;
+
+ so->so_downcalls = NULL;
+
+ so->so_copyflag = 0;
+
+ /* The accept queue must still be in its constructed (empty) state */
+ ASSERT(so->so_acceptq_head == NULL);
+ ASSERT(so->so_acceptq_tail == &so->so_acceptq_head);
+ ASSERT(so->so_acceptq_next == NULL);
+
+ /* Re-initialize the vnode and mark it as a socket vnode */
+ vn_reinit(vp);
+ vp->v_vfsp = rootvfs;
+ vp->v_type = VSOCK;
+ vp->v_rdev = sockdev;
+
+ so->so_rcv_queued = 0;
+ so->so_rcv_q_head = NULL;
+ so->so_rcv_q_last_head = NULL;
+ so->so_rcv_head = NULL;
+ so->so_rcv_last_head = NULL;
+
+ so->so_snd_qfull = B_FALSE;
+ so->so_minpsz = 0;
+
+ so->so_rcv_wakeup = B_FALSE;
+ so->so_snd_wakeup = B_FALSE;
+ so->so_flowctrld = B_FALSE;
+
+ /* Poll state, protocol properties and ksocket callbacks are zeroed */
+ so->so_pollev = 0;
+ bzero(&so->so_poll_list, sizeof (so->so_poll_list));
+ bzero(&so->so_proto_props, sizeof (struct sock_proto_props));
+
+ bzero(&(so->so_ksock_callbacks), sizeof (ksocket_callbacks_t));
+ so->so_ksock_cb_arg = NULL;
+
+ so->so_max_addr_len = sizeof (struct sockaddr_storage);
+
+ /* No sodirect state until sod_sock_init() attaches one */
+ so->so_direct = NULL;
+
+ vn_exists(vp);
+}
+
+/*
+ * Tear down the per-socket state of a sonode whose so_count has dropped
+ * to zero (asserted).
+ *
+ * In order: cancels a pending receive timer, flushes the accept queue,
+ * frees any out-of-band message and clears the OOB state bits, wakes
+ * and cleans the poll list, frees the sodirect structure, invalidates
+ * the vnode and releases the peer credential.
+ */
+void
+sonode_fini(struct sonode *so)
+{
+ mblk_t *mp;
+ vnode_t *vp;
+
+ ASSERT(so->so_count == 0);
+
+ /* Cancel an outstanding receive timer; so_lock must not be held */
+ if (so->so_rcv_timer_tid) {
+ ASSERT(MUTEX_NOT_HELD(&so->so_lock));
+ (void) untimeout(so->so_rcv_timer_tid);
+ so->so_rcv_timer_tid = 0;
+ }
+
+ so_acceptq_flush(so);
+
+#ifdef DEBUG
+ mutex_enter(&so->so_lock);
+ ASSERT(so_verify_oobstate(so));
+ mutex_exit(&so->so_lock);
+#endif /* DEBUG */
+ /* Drop any pending out-of-band message and the state bits for it */
+ if ((mp = so->so_oobmsg) != NULL) {
+ freemsg(mp);
+ so->so_oobmsg = NULL;
+ so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA|
+ SS_RCVATMARK);
+ }
+
+ /* Entries still on the poll list get POLLERR before it is cleaned */
+ if (so->so_poll_list.ph_list != NULL) {
+ pollwakeup(&so->so_poll_list, POLLERR);
+ pollhead_clean(&so->so_poll_list);
+ }
+
+ /* Return the sodirect structure, if any, to sock_sod_cache */
+ if (so->so_direct != NULL) {
+ sodirect_t *sodp = so->so_direct;
+
+ ASSERT(sodp->sod_uioafh == NULL);
+
+ so->so_direct = NULL;
+ kmem_cache_free(sock_sod_cache, sodp);
+ }
+
+ vp = SOTOV(so);
+ vn_invalid(vp);
+
+ /* Release the peer credential so the destructor's ASSERT holds */
+ if (so->so_peercred != NULL) {
+ crfree(so->so_peercred);
+ so->so_peercred = NULL;
+ }
+}
+
+/*
+ * Called at the beginning of recvmsg().
+ *
+ * For a socket with sodirect state, record in sod_want how many bytes
+ * the caller is asking for and, when the request qualifies for I/OAT,
+ * initialize the sodirect uioa_t from the caller's uio.
+ *
+ * Returns the caller's original uio when uioainit() succeeded, in which
+ * case *uiopp has been redirected to the uio_t embedded in the sodirect
+ * uioa_t; returns NULL otherwise, leaving *uiopp untouched.
+ */
+uio_t *
+sod_rcv_init(struct sonode *so, int flags, struct uio **uiopp)
+{
+	sodirect_t *sodp = so->so_direct;
+	struct uio *caller_uio;
+	struct uio *saved_uio = NULL;
+
+	/* Not an sodirect socket, nothing to set up */
+	if (sodp == NULL)
+		return (NULL);
+
+	caller_uio = *uiopp;
+
+	mutex_enter(sodp->sod_lockp);
+	if (sodp->sod_state & SOD_ENABLED) {
+		/*
+		 * Try uioa only if the I/O is big enough for the uioa
+		 * minimum setup, uioa is enabled, data will actually be
+		 * consumed (no MSG_PEEK) and the socket is not at EOF.
+		 *
+		 * On a successful uioainit() the uio_t part of the uioa_t
+		 * is used for all uio work that follows.  The original uio
+		 * is handed back so that it can be updated by uioafini()
+		 * before returning to the caller.  The uioa state is not
+		 * set to UIOA_ENABLED here; any M_DATA mblk_t(s) already
+		 * queued have to be uioamove()d first.
+		 */
+		if (caller_uio->uio_resid >= uioasync.mincnt &&
+		    uioasync.enabled && !(flags & MSG_PEEK) &&
+		    !(so->so_state & SS_CANTRCVMORE) &&
+		    !uioainit(caller_uio, &sodp->sod_uioa)) {
+			saved_uio = caller_uio;
+			*uiopp = (uio_t *)&sodp->sod_uioa;
+		}
+
+		/*
+		 * With or without uioa, note the number of uio bytes the
+		 * caller wants for the sodirect framework and/or the
+		 * transport (e.g. TCP) strategy.
+		 */
+		sodp->sod_want = caller_uio->uio_resid;
+	}
+	mutex_exit(sodp->sod_lockp);
+
+	return (saved_uio);
+}
+
+/*
+ * Called at the end of recvmsg() to finalize any I/OAT work.
+ *
+ * When `suiop' is non-NULL (the original uio handed back by
+ * sod_rcv_init()), `uiop' must be the sodirect uioa_t; uioafini() is
+ * run on the pair and the pending-free mblk list is released.  The
+ * wakeup state is then put back to SOD_WAKE_NOT and sod_want cleared.
+ *
+ * The sodirect lock must be held by the caller.  Returns the result of
+ * uioafini(), or 0 when there was nothing to finish.
+ */
+int
+sod_rcv_done(struct sonode *so, struct uio *suiop, struct uio *uiop)
+{
+	sodirect_t *sodp = so->so_direct;
+	int rc = 0;
+
+	if (sodp == NULL)
+		return (0);
+
+	ASSERT(MUTEX_HELD(sodp->sod_lockp));
+
+	if (suiop != NULL) {
+		mblk_t *pending;
+
+		/* Finish the uioa_t and propagate results to the caller uio */
+		ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
+		rc = uioafini(suiop, (uioa_t *)uiop);
+
+		/* Release every mblk parked on the uioa free list */
+		pending = sodp->sod_uioafh;
+		if (pending != NULL) {
+			sodp->sod_uioafh = NULL;
+			sodp->sod_uioaft = NULL;
+			freemsg(pending);
+		}
+	}
+	ASSERT(sodp->sod_uioafh == NULL);
+
+	if ((sodp->sod_state & SOD_WAKE_NOT) == 0) {
+		/* We were woken up; go back to the "no wakeup" state */
+		sodp->sod_state &= SOD_WAKE_CLR;
+		sodp->sod_state |= SOD_WAKE_NOT;
+	}
+
+	/* Finally, forget how much the caller wanted */
+	sodp->sod_want = 0;
+
+	return (rc);
+}
+
+/*
+ * Schedule a uioamove() on an mblk chain. This is usually called from
+ * protocols (e.g. TCP) on an I/OAT enabled sonode.
+ *
+ * `mp' must be an M_DATA chain whose total data size is `msg_size', and
+ * the caller must hold the sodirect lock (all asserted).
+ *
+ * Return value:
+ *  - `mp' unchanged when uioa is not enabled, when the chain does not
+ *    fit in the remaining uio space, or when the very first mblk could
+ *    not be scheduled;
+ *  - NULL when every mblk in the chain was scheduled;
+ *  - otherwise the first mblk that was not scheduled, which has been
+ *    cut off from the scheduled part of the chain.
+ */
+mblk_t *
+sod_uioa_mblk_init(struct sodirect_s *sodp, mblk_t *mp, size_t msg_size)
+{
+ uioa_t *uioap = &sodp->sod_uioa;
+ mblk_t *mp1 = mp;
+ mblk_t *lmp = NULL;
+
+ ASSERT(DB_TYPE(mp) == M_DATA);
+ ASSERT(msg_size == msgdsize(mp));
+
+ /* Caller must have lock held */
+ ASSERT(MUTEX_HELD(sodp->sod_lockp));
+
+ if (uioap->uioa_state & UIOA_ENABLED) {
+ /* Uioa is enabled */
+
+ if (msg_size > uioap->uio_resid) {
+ /*
+ * There isn't enough uio space for the mblk_t chain
+ * so disable uioa such that this and any additional
+ * mblk_t data is handled by the socket and schedule
+ * the socket for wakeup to finish this uioa.
+ */
+ uioap->uioa_state &= UIOA_CLR;
+ uioap->uioa_state |= UIOA_FINI;
+ if (sodp->sod_state & SOD_WAKE_NOT) {
+ sodp->sod_state &= SOD_WAKE_CLR;
+ sodp->sod_state |= SOD_WAKE_NEED;
+ }
+ return (mp);
+ }
+ /* Schedule each mblk in turn; lmp tracks the last one scheduled */
+ do {
+ uint32_t len = MBLKL(mp1);
+
+ if (!uioamove(mp1->b_rptr, len, UIO_READ, uioap)) {
+ /* Scheduled, mark dblk_t as such */
+ DB_FLAGS(mp1) |= DBLK_UIOA;
+ } else {
+ /* Error, turn off async processing */
+ uioap->uioa_state &= UIOA_CLR;
+ uioap->uioa_state |= UIOA_FINI;
+ break;
+ }
+ lmp = mp1;
+ } while ((mp1 = mp1->b_cont) != NULL);
+
+ if (mp1 != NULL || uioap->uio_resid == 0) {
+ /*
+ * Not all mblk_t(s) uioamoved (error) or all uio
+ * space has been consumed so schedule the socket
+ * for wakeup to finish this uio.
+ */
+ sodp->sod_state &= SOD_WAKE_CLR;
+ sodp->sod_state |= SOD_WAKE_NEED;
+
+ /* Break the mblk chain if necessary. */
+ if (mp1 != NULL && lmp != NULL) {
+ /* Unscheduled remainder hangs off mp->b_next */
+ mp->b_next = mp1;
+ lmp->b_cont = NULL;
+ }
+ }
+ }
+ return (mp1);
+}
+
+/*
+ * Called on an mblk chain that has been successfully uioamove()d.
+ *
+ * If `bp' is non-NULL and flagged DBLK_UIOA, the chain is appended to
+ * the sodirect uioa pending-free list (sod_uioafh/sod_uioaft), each
+ * mblk's read pointer is advanced to its write pointer to reflect that
+ * all of its data has been consumed, and an enabled uioa is moved to
+ * the UIOA_FINI state.  Any other `bp' is left untouched.
+ *
+ * Note, a b_cont chain headed by a DBLK_UIOA enabled mblk_t must have
+ * all mblk_t(s) DBLK_UIOA enabled.
+ */
+void
+sod_uioa_mblk_done(sodirect_t *sodp, mblk_t *bp)
+{
+	mblk_t *bpt;
+
+	if (bp == NULL || !(bp->b_datap->db_flags & DBLK_UIOA))
+		return;
+
+	/* Check sodp before it is dereferenced below, not after */
+	ASSERT(sodp != NULL);
+
+	/*
+	 * Add first mblk_t of "bp" chain to current sodirect uioa
+	 * free list tail mblk_t, if any, else empty list so new head.
+	 */
+	bpt = sodp->sod_uioaft;
+	if (bpt == NULL)
+		sodp->sod_uioafh = bp;
+	else
+		bpt->b_cont = bp;
+
+	/*
+	 * Walk mblk_t "bp" chain to find tail and adjust rptr of
+	 * each to reflect that uioamove() has consumed all data.
+	 */
+	bpt = bp;
+	for (;;) {
+		ASSERT(bpt->b_datap->db_flags & DBLK_UIOA);
+
+		bpt->b_rptr = bpt->b_wptr;
+		if (bpt->b_cont == NULL)
+			break;
+		bpt = bpt->b_cont;
+	}
+	/* New sodirect uioa free list tail */
+	sodp->sod_uioaft = bpt;
+
+	/* Only dequeue once with data returned per uioa_t */
+	if (sodp->sod_uioa.uioa_state & UIOA_ENABLED) {
+		sodp->sod_uioa.uioa_state &= UIOA_CLR;
+		sodp->sod_uioa.uioa_state |= UIOA_FINI;
+	}
+}
+
+/*
+ * When transitioning from the UIOA_INIT state to the UIOA_ENABLED state
+ * in recvmsg(), this function is called on a non-STREAMS socket to
+ * schedule uioamove() on the data that is already queued on the socket.
+ *
+ * `uiop' must be the uio_t of sodp's embedded uioa_t and the sodirect
+ * lock must be held (both asserted).  The first b_cont chain of
+ * so_rcv_q_head is walked, and then, if nothing stopped the walk and
+ * that message has no b_next successor, the first chain of so_rcv_head.
+ * The walk stops and uioa is switched to UIOA_FINI as soon as an mblk
+ * is not M_DATA, does not fit in the remaining uio space, would reach
+ * the OOB mark, or fails uioamove().
+ */
+void
+sod_uioa_so_init(struct sonode *so, struct sodirect_s *sodp, struct uio *uiop)
+{
+ uioa_t *uioap = (uioa_t *)uiop;
+ mblk_t *lbp;
+ mblk_t *wbp;
+ mblk_t *bp;
+ int len;
+ int error;
+ boolean_t in_rcv_q = B_TRUE;
+
+ ASSERT(MUTEX_HELD(sodp->sod_lockp));
+ ASSERT(&sodp->sod_uioa == uioap);
+
+ /*
+ * Walk first b_cont chain in sod_q
+ * and schedule any M_DATA mblk_t's for uio asynchronous move.
+ */
+ bp = so->so_rcv_q_head;
+
+again:
+ /* Walk the chain */
+ /* bp: head of the chain, wbp: walking cursor, lbp: last one done */
+ lbp = NULL;
+ wbp = bp;
+
+ do {
+ if (bp == NULL)
+ break;
+
+ if (wbp->b_datap->db_type != M_DATA) {
+ /* Not M_DATA, no more uioa */
+ goto nouioa;
+ }
+ if ((len = wbp->b_wptr - wbp->b_rptr) > 0) {
+ /* Have a M_DATA mblk_t with data */
+ if (len > uioap->uio_resid || (so->so_oobmark > 0 &&
+ len + uioap->uioa_mbytes >= so->so_oobmark)) {
+ /* Not enough uio space, or beyond oobmark */
+ goto nouioa;
+ }
+ ASSERT(!(wbp->b_datap->db_flags & DBLK_UIOA));
+ error = uioamove(wbp->b_rptr, len,
+ UIO_READ, uioap);
+ if (!error) {
+ /* Scheduled, mark dblk_t as such */
+ wbp->b_datap->db_flags |= DBLK_UIOA;
+ } else {
+ /* Break the mblk chain */
+ goto nouioa;
+ }
+ }
+ /* Save last wbp processed */
+ lbp = wbp;
+ } while ((wbp = wbp->b_cont) != NULL);
+
+ if (in_rcv_q && (bp == NULL || bp->b_next == NULL)) {
+ /*
+ * We get here only once to process the sonode dump area
+ * if so_rcv_q_head is NULL or all the mblks have been
+ * successfully uioamove()d.
+ */
+ in_rcv_q = B_FALSE;
+
+ /* move to dump area */
+ bp = so->so_rcv_head;
+ goto again;
+ }
+
+ return;
+
+nouioa:
+ /* No more uioa */
+ uioap->uioa_state &= UIOA_CLR;
+ uioap->uioa_state |= UIOA_FINI;
+
+ /*
+ * If we processed 1 or more mblk_t(s) then we need to split the
+ * current mblk_t chain in 2 so that all the uioamove()ed mblk_t(s)
+ * are in the current chain and the rest are in the following new
+ * chain.
+ */
+ if (lbp != NULL) {
+ /* New end of current chain */
+ lbp->b_cont = NULL;
+
+ /* Insert new chain wbp after bp */
+ if ((wbp->b_next = bp->b_next) == NULL) {
+ /*
+ * No need to grab so_lock, since sod_lockp
+ * points to so_lock.
+ */
+ /* wbp is now the last message on whichever queue bp is on */
+ if (in_rcv_q)
+ so->so_rcv_q_last_head = wbp;
+ else
+ so->so_rcv_last_head = wbp;
+ }
+ bp->b_next = wbp;
+ /* wbp inherits bp's old b_prev; bp's b_prev now points at lbp */
+ bp->b_next->b_prev = bp->b_prev;
+ bp->b_prev = lbp;
+ }
+}
+
+/*
+ * Attach a freshly allocated sodirect structure to a socket.
+ *
+ * so        - socket to enable sodirect on; must not have one yet
+ * stp       - stream head, or NULL for a socket without one; when
+ *             given, its read queue and sd_sodirect are wired up too
+ * enq_func  - enqueue callback stored in sod_enqueue
+ * wake_func - wakeup callback stored in sod_wakeup
+ * lockp     - mutex stored in sod_lockp
+ */
+void
+sod_sock_init(struct sonode *so, struct stdata *stp, sod_enq_func enq_func,
+    sod_wakeup_func wake_func, kmutex_t *lockp)
+{
+	sodirect_t *sodp;
+
+	ASSERT(so->so_direct == NULL);
+
+	so->so_state |= SS_SODIRECT;
+
+	sodp = kmem_cache_alloc(sock_sod_cache, KM_SLEEP);
+
+	/* Caller-supplied lock and callbacks */
+	sodp->sod_lockp = lockp;
+	sodp->sod_enqueue = enq_func;
+	sodp->sod_wakeup = wake_func;
+
+	/* Enabled, no wakeup pending, nothing wanted, empty free list */
+	sodp->sod_state = SOD_ENABLED | SOD_WAKE_NOT;
+	sodp->sod_want = 0;
+	sodp->sod_uioafh = NULL;
+	sodp->sod_uioaft = NULL;
+
+	/*
+	 * Only the state of the embedded uioa_t is set here; its other
+	 * members stay uninitialized until uioainit() fills them in
+	 * before uioa is enabled.
+	 */
+	sodp->sod_uioa.uioa_state = UIOA_ALLOC;
+
+	if (stp == NULL) {
+		sodp->sod_q = NULL;
+		so->so_direct = sodp;
+	} else {
+		sodp->sod_q = RD(stp->sd_wrq);
+		so->so_direct = sodp;
+		stp->sd_sodirect = sodp;
+	}
+}
+
+/*
+ * Create the sodirect_t kmem cache (sock_sod_cache) while sockfs is
+ * loading.  The cache is created without constructor, destructor or
+ * reclaim callbacks; sod_sock_init() initializes each object itself.
+ */
+void
+sod_init(void)
+{
+	/* Allocate sodirect_t kmem_cache */
+	sock_sod_cache = kmem_cache_create("sock_sod_cache",
+	    sizeof (sodirect_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+}
+
+/*
+ * Retire uioamove()d data on an sodirect socket and report how many
+ * bytes the uioa has moved.
+ *
+ * If `mp' is NULL the head message of so_rcv_q_head (if any) is
+ * dequeued and used instead.  The message is passed to
+ * sod_uioa_mblk_done(); then, if so_rcv_q_head is empty and the head of
+ * so_rcv_head is a DBLK_UIOA-flagged M_DATA message, that one is
+ * dequeued and passed to sod_uioa_mblk_done() as well.
+ *
+ * The socket must have sodirect enabled with the uioa in the
+ * UIOA_ENABLED or UIOA_FINI state, and the sodirect lock must be held
+ * (all asserted).  Returns sod_uioa.uioa_mbytes.
+ */
+ssize_t
+sod_uioa_mblk(struct sonode *so, mblk_t *mp)
+{
+ sodirect_t *sodp = so->so_direct;
+
+ ASSERT(sodp != NULL);
+ ASSERT(MUTEX_HELD(sodp->sod_lockp));
+
+ ASSERT(sodp->sod_state & SOD_ENABLED);
+ ASSERT(sodp->sod_uioa.uioa_state != (UIOA_ALLOC|UIOA_INIT));
+
+ ASSERT(sodp->sod_uioa.uioa_state & (UIOA_ENABLED|UIOA_FINI));
+
+ /* No message supplied: take the first one off so_rcv_q_head */
+ if (mp == NULL && so->so_rcv_q_head != NULL) {
+ mp = so->so_rcv_q_head;
+ ASSERT(mp->b_prev != NULL);
+ mp->b_prev = NULL;
+ so->so_rcv_q_head = mp->b_next;
+ if (so->so_rcv_q_head == NULL) {
+ so->so_rcv_q_last_head = NULL;
+ }
+ mp->b_next = NULL;
+ }
+
+ sod_uioa_mblk_done(sodp, mp);
+
+ if (so->so_rcv_q_head == NULL && so->so_rcv_head != NULL &&
+ DB_TYPE(so->so_rcv_head) == M_DATA &&
+ (DB_FLAGS(so->so_rcv_head) & DBLK_UIOA)) {
+ /* more arrived */
+ ASSERT(so->so_rcv_q_head == NULL);
+ mp = so->so_rcv_head;
+ so->so_rcv_head = mp->b_next;
+ if (so->so_rcv_head == NULL)
+ so->so_rcv_last_head = NULL;
+ mp->b_prev = mp->b_next = NULL;
+ sod_uioa_mblk_done(sodp, mp);
+ }
+
+#ifdef DEBUG
+ /* Neither queue may still have a DBLK_UIOA message at a b_next link */
+ if (so->so_rcv_q_head != NULL) {
+ mblk_t *m = so->so_rcv_q_head;
+ while (m != NULL) {
+ if (DB_FLAGS(m) & DBLK_UIOA) {
+ cmn_err(CE_PANIC, "Unexpected I/OAT mblk %p"
+ " in so_rcv_q_head.\n", (void *)m);
+ }
+ m = m->b_next;
+ }
+ }
+ if (so->so_rcv_head != NULL) {
+ mblk_t *m = so->so_rcv_head;
+ while (m != NULL) {
+ if (DB_FLAGS(m) & DBLK_UIOA) {
+ cmn_err(CE_PANIC, "Unexpected I/OAT mblk %p"
+ " in so_rcv_head.\n", (void *)m);
+ }
+ m = m->b_next;
+ }
+ }
+#endif
+ return (sodp->sod_uioa.uioa_mbytes);
+}
diff --git a/usr/src/uts/common/fs/sockfs/sockcommon.h b/usr/src/uts/common/fs/sockfs/sockcommon.h
new file mode 100644
index 0000000000..fb4512c874
--- /dev/null
+++ b/usr/src/uts/common/fs/sockfs/sockcommon.h
@@ -0,0 +1,246 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SOCKCOMMON_H_
+#define _SOCKCOMMON_H_
+
+#pragma ident "@(#)sockcommon.h 1.1 07/06/14 SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/filio.h>
+#include <sys/socket_proto.h>
+
+struct sonode;
+
+extern kmem_cache_t *socket_cache;
+
+/*
+ * Socket access functions
+ *
+ * The following functions should only be used by sockfs, and are common
+ * functions that can be used both by kernel sockets (i.e., no file
+ * descriptors should ever be expected, or created), and to implement
+ * the socket system calls.
+ */
+extern struct sonode *socket_create(int, int, int, char *, char *, int, int,
+ struct cred *, int *);
+extern struct sonode *socket_newconn(struct sonode *, sock_lower_handle_t,
+ sock_downcalls_t *, int, int *);
+extern int socket_bind(struct sonode *, struct sockaddr *, socklen_t, int,
+ struct cred *);
+extern int socket_accept(struct sonode *, int, struct cred *, struct sonode **);
+extern int socket_listen(struct sonode *, int, struct cred *);
+extern int socket_connect(struct sonode *, const struct sockaddr *,
+ socklen_t, int, int, struct cred *);
+extern int socket_getpeername(struct sonode *, struct sockaddr *, socklen_t *,
+ boolean_t, struct cred *);
+extern int socket_getsockname(struct sonode *, struct sockaddr *, socklen_t *,
+ struct cred *);
+extern int socket_shutdown(struct sonode *, int, struct cred *);
+extern int socket_getsockopt(struct sonode *, int, int, void *, socklen_t *,
+ int, struct cred *);
+extern int socket_setsockopt(struct sonode *, int, int, const void *,
+ socklen_t, struct cred *);
+extern int socket_recvmsg(struct sonode *, struct nmsghdr *, struct uio *,
+ struct cred *);
+extern int socket_sendmsg(struct sonode *, struct nmsghdr *, struct uio *,
+ struct cred *);
+extern int socket_sendmblk(struct sonode *, struct nmsghdr *, int,
+ struct cred *, mblk_t **);
+extern int socket_ioctl(struct sonode *, int, intptr_t, int, struct cred *,
+ int32_t *);
+extern int socket_poll(struct sonode *, short, int, short *,
+ struct pollhead **);
+extern int socket_close(struct sonode *, int, struct cred *);
+extern void socket_destroy(struct sonode *);
+
+/*
+ * Cancel the socket push timer.
+ *
+ * Must be entered with so_lock held (asserted).  If a timer is pending,
+ * its id is cleared from so_rcv_timer_tid first, and so_lock is dropped
+ * around the untimeout() call and re-acquired afterwards -- so state
+ * protected by so_lock may have changed once the macro completes.
+ *
+ * The macro expands to a brace-enclosed block, not a single expression.
+ */
+#define SOCKET_TIMER_CANCEL(so) { \
+ timeout_id_t tid; \
+ \
+ ASSERT(MUTEX_HELD(&(so)->so_lock)); \
+ if ((so)->so_rcv_timer_tid != 0) { \
+ tid = (so)->so_rcv_timer_tid; \
+ (so)->so_rcv_timer_tid = 0; \
+ mutex_exit(&(so)->so_lock); \
+ \
+ (void) untimeout(tid); \
+ \
+ mutex_enter(&(so)->so_lock); \
+ } \
+}
+
+/*
+ * Start the socket push timer.
+ *
+ * Must be entered with so_lock held (asserted).  Unless the socket's
+ * so_rcv_timer_interval is SOCKET_NO_RCVTIMER, schedules
+ * so_timer_callback() with the socket as its argument after
+ * so_rcv_timer_interval (converted with MSEC_TO_TICK()) and records the
+ * timeout id in so_rcv_timer_tid.  Any id already stored there is
+ * overwritten without being cancelled.
+ *
+ * The macro expands to a brace-enclosed block, not a single expression.
+ */
+#define SOCKET_TIMER_START(so) { \
+ ASSERT(MUTEX_HELD(&(so)->so_lock)); \
+ if ((so)->so_rcv_timer_interval != SOCKET_NO_RCVTIMER) { \
+ (so)->so_rcv_timer_tid = timeout(so_timer_callback, \
+ (so), MSEC_TO_TICK((so)->so_rcv_timer_interval)); \
+ } \
+}
+
+/* Common sonode ops for operations that are not supported */
+extern int so_listen_notsupp(struct sonode *, int, struct cred *);
+extern int so_accept_notsupp(struct sonode *, int, struct cred *,
+ struct sonode **);
+extern int so_getpeername_notsupp(struct sonode *, struct sockaddr *,
+ socklen_t *, boolean_t, struct cred *);
+extern int so_shutdown_notsupp(struct sonode *, int, struct cred *);
+extern int so_sendmblk_notsupp(struct sonode *, struct nmsghdr *,
+ int, struct cred *, mblk_t **);
+
+/* Common sonode ops */
+extern int so_init(struct sonode *, struct sonode *, struct cred *, int);
+extern int so_accept(struct sonode *, int, struct cred *, struct sonode **);
+extern int so_bind(struct sonode *, struct sockaddr *, socklen_t, int,
+ struct cred *);
+extern int so_listen(struct sonode *, int, struct cred *);
+extern int so_connect(struct sonode *, const struct sockaddr *,
+ socklen_t, int, int, struct cred *);
+extern int so_getsockopt(struct sonode *, int, int, void *,
+ socklen_t *, int, struct cred *);
+extern int so_setsockopt(struct sonode *, int, int, const void *,
+ socklen_t, struct cred *);
+extern int so_getpeername(struct sonode *, struct sockaddr *,
+ socklen_t *, boolean_t, struct cred *);
+extern int so_getsockname(struct sonode *, struct sockaddr *,
+ socklen_t *, struct cred *);
+extern int so_ioctl(struct sonode *, int, intptr_t, int, struct cred *,
+ int32_t *);
+extern int so_poll(struct sonode *, short, int, short *,
+ struct pollhead **);
+extern int so_sendmsg(struct sonode *, struct nmsghdr *, struct uio *,
+ struct cred *);
+extern int so_sendmblk(struct sonode *, struct nmsghdr *, int,
+ struct cred *, mblk_t **);
+extern int so_recvmsg(struct sonode *, struct nmsghdr *, struct uio *,
+ struct cred *);
+extern int so_shutdown(struct sonode *, int, struct cred *);
+extern int so_close(struct sonode *, int, struct cred *);
+
+extern int so_tpi_fallback(struct sonode *, struct cred *);
+
+/* Common upcalls */
+extern sock_upper_handle_t so_newconn(sock_upper_handle_t,
+ sock_lower_handle_t, sock_downcalls_t *, struct cred *, pid_t,
+ sock_upcalls_t **);
+extern void so_set_prop(sock_upper_handle_t,
+ struct sock_proto_props *);
+extern ssize_t so_queue_msg(sock_upper_handle_t, mblk_t *, size_t, int,
+ int *, boolean_t *);
+extern void so_signal_oob(sock_upper_handle_t, ssize_t);
+
+extern void so_connected(sock_upper_handle_t, sock_connid_t, struct cred *,
+ pid_t);
+extern int so_disconnected(sock_upper_handle_t, sock_connid_t, int);
+extern void so_txq_full(sock_upper_handle_t, boolean_t);
+extern void so_opctl(sock_upper_handle_t, sock_opctl_action_t, uintptr_t);
+/* Common misc. functions */
+
+ /* accept queue */
+extern int so_acceptq_enqueue(struct sonode *, struct sonode *);
+extern int so_acceptq_enqueue_locked(struct sonode *, struct sonode *);
+extern int so_acceptq_dequeue(struct sonode *, boolean_t,
+ struct sonode **);
+extern void so_acceptq_flush(struct sonode *);
+
+ /* connect */
+extern int so_wait_connected(struct sonode *, boolean_t, sock_connid_t);
+
+ /* send */
+extern int so_snd_wait_qnotfull(struct sonode *, boolean_t);
+extern void so_snd_qfull(struct sonode *so);
+extern void so_snd_qnotfull(struct sonode *so);
+
+extern int socket_chgpgrp(struct sonode *, pid_t);
+extern void socket_sendsig(struct sonode *, int);
+extern int so_dequeue_msg(struct sonode *, mblk_t **, struct uio *,
+ rval_t *, int);
+extern void so_enqueue_msg(struct sonode *, mblk_t *, size_t);
+
+extern mblk_t *socopyinuio(uio_t *, ssize_t, size_t, ssize_t, size_t, int *);
+extern mblk_t *socopyoutuio(mblk_t *, struct uio *, ssize_t, int *);
+
+extern boolean_t somsghasdata(mblk_t *);
+extern void so_rcv_flush(struct sonode *);
+extern int sorecvoob(struct sonode *, struct nmsghdr *, struct uio *,
+ int, boolean_t);
+
+extern void so_timer_callback(void *);
+
+extern struct sonode *socket_sonode_create(struct sockparams *, int, int, int,
+ int, int, int *, struct cred *);
+
+extern void socket_sonode_destroy(struct sonode *);
+extern int socket_init_common(struct sonode *, struct sonode *, int flags,
+ struct cred *);
+extern int socket_getopt_common(struct sonode *, int, int, void *, socklen_t *);
+extern int socket_ioctl_common(struct sonode *, int, intptr_t, int,
+ struct cred *, int32_t *);
+extern int socket_strioc_common(struct sonode *, int, intptr_t, int,
+ struct cred *, int32_t *);
+
+extern int so_zcopy_wait(struct sonode *);
+extern int so_get_mod_version(struct sockparams *);
+
+/* Notification functions */
+extern void so_notify_connected(struct sonode *);
+extern void so_notify_disconnecting(struct sonode *);
+extern void so_notify_disconnected(struct sonode *, int);
+extern void so_notify_writable(struct sonode *);
+extern void so_notify_data(struct sonode *, size_t);
+extern void so_notify_oobsig(struct sonode *);
+extern void so_notify_oobdata(struct sonode *, boolean_t);
+extern void so_notify_eof(struct sonode *);
+extern void so_notify_newconn(struct sonode *);
+extern void so_notify_shutdown(struct sonode *);
+extern void so_notify_error(struct sonode *);
+
+/* Common sonode functions */
+extern int sonode_constructor(void *, void *, int);
+extern void sonode_destructor(void *, void *);
+extern void sonode_init(struct sonode *, struct sockparams *,
+ int, int, int, sonodeops_t *);
+extern void sonode_fini(struct sonode *);
+
+/*
+ * Event flags to socket_sendsig().
+ */
+#define SOCKETSIG_WRITE 0x1
+#define SOCKETSIG_READ 0x2
+#define SOCKETSIG_URG 0x4
+
+extern sonodeops_t so_sonodeops;
+extern sock_upcalls_t so_upcalls;
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* _SOCKCOMMON_H_ */
diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c
new file mode 100644
index 0000000000..e8fc18552d
--- /dev/null
+++ b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c
@@ -0,0 +1,1696 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "@(#)sockcommon_sops.c 1.1 07/06/14 SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/debug.h>
+#include <sys/cmn_err.h>
+
+#include <sys/stropts.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+
+#define _SUN_TPI_VERSION 2
+#include <sys/tihdr.h>
+#include <sys/sockio.h>
+#include <sys/sodirect.h>
+#include <sys/kmem_impl.h>
+
+#include <sys/strsubr.h>
+#include <sys/strsun.h>
+#include <sys/ddi.h>
+#include <netinet/in.h>
+#include <inet/ip.h>
+
+#include <fs/sockfs/sockcommon.h>
+
+#include <sys/socket_proto.h>
+
+#include <fs/sockfs/socktpi_impl.h>
+#include <sys/tihdr.h>
+#include <fs/sockfs/nl7c.h>
+#include <inet/kssl/ksslapi.h>
+
+
+/*
+ * Tunables defined elsewhere.  When xnet_skip_checks is non-zero the
+ * X/Open state checks in this file (bind, shutdown, getpeername,
+ * setsockopt) are bypassed; when xnet_check_print is non-zero a message
+ * is printed each time one of those checks rejects a request.
+ */
+extern int xnet_skip_checks;
+extern int xnet_check_print;
+
+/* Stores or enqueues an out-of-band mblk; used by so_queue_msg(). */
+static void so_queue_oob(sock_upper_handle_t, mblk_t *, size_t);
+
+
+/*
+ * accept() stub: ignores all of its arguments, leaves *nsop untouched and
+ * unconditionally returns EOPNOTSUPP.
+ */
+/*ARGSUSED*/
+int
+so_accept_notsupp(struct sonode *lso, int fflag, struct cred *cr,
+    struct sonode **nsop)
+{
+	return (EOPNOTSUPP);
+}
+
+/*
+ * listen() stub: ignores its arguments and unconditionally returns
+ * EOPNOTSUPP.
+ */
+/*ARGSUSED*/
+int
+so_listen_notsupp(struct sonode *so, int backlog, struct cred *cr)
+{
+	return (EOPNOTSUPP);
+}
+
+/*
+ * getsockname() stub: neither `sa' nor `*len' is written; the call
+ * unconditionally returns EOPNOTSUPP.
+ */
+/*ARGSUSED*/
+int
+so_getsockname_notsupp(struct sonode *so, struct sockaddr *sa, socklen_t *len,
+    struct cred *cr)
+{
+	return (EOPNOTSUPP);
+}
+
+/*
+ * getpeername() stub: neither `addr' nor `*addrlen' is written; the call
+ * unconditionally returns EOPNOTSUPP.
+ */
+/*ARGSUSED*/
+int
+so_getpeername_notsupp(struct sonode *so, struct sockaddr *addr,
+    socklen_t *addrlen, boolean_t accept, struct cred *cr)
+{
+	return (EOPNOTSUPP);
+}
+
+/*
+ * shutdown() stub: ignores its arguments and unconditionally returns
+ * EOPNOTSUPP.
+ */
+/*ARGSUSED*/
+int
+so_shutdown_notsupp(struct sonode *so, int how, struct cred *cr)
+{
+	return (EOPNOTSUPP);
+}
+
+/*
+ * sendmblk stub: the mblk chain at *mpp is neither consumed nor modified;
+ * the call unconditionally returns EOPNOTSUPP.
+ */
+/*ARGSUSED*/
+int
+so_sendmblk_notsupp(struct sonode *so, struct msghdr *msg, int fflag,
+    struct cred *cr, mblk_t **mpp)
+{
+	return (EOPNOTSUPP);
+}
+
+/*
+ * Generic Socket Ops
+ */
+
+/*
+ * Generic init operation.  All of the work is done by
+ * socket_init_common(); note that it takes `flags' before `cr', the
+ * reverse of this function's own parameter order.
+ */
+/* ARGSUSED */
+int
+so_init(struct sonode *so, struct sonode *pso, struct cred *cr, int flags)
+{
+	int error;
+
+	error = socket_init_common(so, pso, flags, cr);
+
+	return (error);
+}
+
+/*
+ * Bind the socket to `name'.
+ *
+ * For AF_INET and AF_INET6 the address length and family are validated
+ * locally (EINVAL / EAFNOSUPPORT on failure); addresses of any other
+ * family, and a NULL `name', are handed to the protocol unchecked.
+ *
+ * For the two inet families the address is then checked against NL7C
+ * and, for SOCK_STREAM sockets, KSSL.  If either claims the address the
+ * socket is converted with so_tpi_fallback() and the bind is reissued
+ * through SOP_BIND(); otherwise the protocol's sd_bind downcall is used.
+ *
+ * `flags' must be _SOBIND_XPG4_2 or _SOBIND_SOCKBSD (asserted).
+ * Returns the errno value produced by whichever path was taken.
+ */
+int
+so_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
+ int flags, struct cred *cr)
+{
+ int error;
+
+ SO_BLOCK_FALLBACK(so, SOP_BIND(so, name, namelen, flags, cr));
+
+ ASSERT(flags == _SOBIND_XPG4_2 || flags == _SOBIND_SOCKBSD);
+
+ /* X/Open requires this check */
+ if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
+ if (xnet_check_print) {
+ printf("sockfs: X/Open bind state check "
+ "caused EINVAL\n");
+ }
+ error = EINVAL;
+ goto done;
+ }
+
+ /*
+ * a bind to a NULL address is interpreted as unbind. So just
+ * do the downcall.
+ */
+ if (name == NULL)
+ goto dobind;
+
+ switch (so->so_family) {
+ case AF_INET:
+ /* A wrong length is reported as EAFNOSUPPORT if the family is wrong too */
+ if ((size_t)namelen != sizeof (sin_t)) {
+ error = name->sa_family != so->so_family ?
+ EAFNOSUPPORT : EINVAL;
+ eprintsoline(so, error);
+ goto done;
+ }
+
+ if ((flags & _SOBIND_XPG4_2) &&
+ (name->sa_family != so->so_family)) {
+ /*
+ * This check has to be made for X/Open
+ * sockets however application failures have
+ * been observed when it is applied to
+ * all sockets.
+ */
+ error = EAFNOSUPPORT;
+ eprintsoline(so, error);
+ goto done;
+ }
+ /*
+ * Force a zero sa_family to match so_family.
+ *
+ * Some programs like inetd(1M) don't set the
+ * family field. Other programs leave
+ * sin_family set to garbage - SunOS 4.X does
+ * not check the family field on a bind.
+ * We use the family field that
+ * was passed in to the socket() call.
+ */
+ name->sa_family = so->so_family;
+ break;
+
+ case AF_INET6: {
+#ifdef DEBUG
+ sin6_t *sin6 = (sin6_t *)name;
+#endif
+ if ((size_t)namelen != sizeof (sin6_t)) {
+ error = name->sa_family != so->so_family ?
+ EAFNOSUPPORT : EINVAL;
+ eprintsoline(so, error);
+ goto done;
+ }
+
+ if (name->sa_family != so->so_family) {
+ /*
+ * With IPv6 we require the family to match
+ * unlike in IPv4.
+ */
+ error = EAFNOSUPPORT;
+ eprintsoline(so, error);
+ goto done;
+ }
+#ifdef DEBUG
+ /*
+ * Verify that apps don't forget to clear
+ * sin6_scope_id etc
+ */
+ if (sin6->sin6_scope_id != 0 &&
+ !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
+ zcmn_err(getzoneid(), CE_WARN,
+ "bind with uninitialized sin6_scope_id "
+ "(%d) on socket. Pid = %d\n",
+ (int)sin6->sin6_scope_id,
+ (int)curproc->p_pid);
+ }
+ if (sin6->__sin6_src_id != 0) {
+ zcmn_err(getzoneid(), CE_WARN,
+ "bind with uninitialized __sin6_src_id "
+ "(%d) on socket. Pid = %d\n",
+ (int)sin6->__sin6_src_id,
+ (int)curproc->p_pid);
+ }
+#endif /* DEBUG */
+
+ break;
+ }
+ default:
+ /* Just pass the request to the protocol */
+ goto dobind;
+ }
+
+ /*
+ * First we check if either NCA or KSSL has been enabled for
+ * the requested address, and if so, we fall back to TPI.
+ * If neither of those two services are enabled, then we just
+ * pass the request to the protocol.
+ *
+ * Note that KSSL can only be enabled on a socket if NCA is NOT
+ * enabled for that socket, hence the else-statement below.
+ */
+ if (nl7c_enabled && ((so->so_family == AF_INET ||
+ so->so_family == AF_INET6) &&
+ nl7c_lookup_addr(name, namelen) != NULL)) {
+ /*
+ * NL7C is not supported in non-global zones,
+ * we enforce this restriction here.
+ */
+ if (so->so_zoneid == GLOBAL_ZONEID) {
+ /* NCA should be used, so fall back to TPI */
+ error = so_tpi_fallback(so, cr);
+ /* Unblock before reissuing the bind through SOP_BIND() */
+ SO_UNBLOCK_FALLBACK(so);
+ if (error)
+ return (error);
+ else
+ return (SOP_BIND(so, name, namelen, flags, cr));
+ }
+ } else if (so->so_type == SOCK_STREAM) {
+ /* Check if KSSL has been configured for this address */
+ kssl_ent_t ent;
+ kssl_endpt_type_t type;
+ struct T_bind_req bind_req;
+ mblk_t *mp;
+
+ /*
+ * TODO: Check with KSSL team if we could add a function call
+ * that only queries whether KSSL is enabled for the given
+ * address.
+ */
+ /* Build a throw-away T_BIND_REQ just to drive kssl_check_proxy() */
+ bind_req.PRIM_type = T_BIND_REQ;
+ bind_req.ADDR_length = namelen;
+ bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
+ mp = soallocproto2(&bind_req, sizeof (bind_req),
+ name, namelen, 0, _ALLOC_SLEEP);
+
+ type = kssl_check_proxy(mp, so, &ent);
+ freemsg(mp);
+
+ if (type != KSSL_NO_PROXY) {
+ /*
+ * KSSL has been configured for this address, so
+ * we must fall back to TPI.
+ */
+ kssl_release_ent(ent, so, type);
+ error = so_tpi_fallback(so, cr);
+ SO_UNBLOCK_FALLBACK(so);
+ if (error)
+ return (error);
+ else
+ return (SOP_BIND(so, name, namelen, flags, cr));
+ }
+ }
+
+dobind:
+ error = (*so->so_downcalls->sd_bind)
+ (so->so_proto_handle, name, namelen, cr);
+done:
+ SO_UNBLOCK_FALLBACK(so);
+
+ return (error);
+}
+
+/*
+ * Generic listen operation: pass the backlog straight to the protocol's
+ * sd_listen downcall and return its result.  so_lock must not be held by
+ * the caller (asserted).
+ */
+int
+so_listen(struct sonode *so, int backlog, struct cred *cr)
+{
+	int error = 0;
+
+	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
+	SO_BLOCK_FALLBACK(so, SOP_LISTEN(so, backlog, cr));
+
+	error = (*so->so_downcalls->sd_listen)
+	    (so->so_proto_handle, backlog, cr);
+	SO_UNBLOCK_FALLBACK(so);
+
+	return (error);
+}
+
+
+/*
+ * Generic connect operation.
+ *
+ * A pending socket error (for example one left behind by an earlier
+ * non-blocking operation) is collected with sogeterr() and returned
+ * before the protocol is asked to connect.  If the protocol's sd_connect
+ * downcall answers EINPROGRESS, so_wait_connected() is called with the
+ * FNONBLOCK/FNDELAY bits of `fflag' and the connection id the protocol
+ * handed back.  so_lock must not be held by the caller (asserted).
+ */
+int
+so_connect(struct sonode *so, const struct sockaddr *name,
+    socklen_t namelen, int fflag, int flags, struct cred *cr)
+{
+	sock_connid_t id;
+	int error = 0;
+
+	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
+	SO_BLOCK_FALLBACK(so, SOP_CONNECT(so, name, namelen, fflag, flags, cr));
+
+	/* Pick up any error that is already pending on the socket. */
+	if (so->so_error != 0) {
+		mutex_enter(&so->so_lock);
+		error = sogeterr(so, B_TRUE);
+		mutex_exit(&so->so_lock);
+	}
+
+	if (error == 0) {
+		error = (*so->so_downcalls->sd_connect)(so->so_proto_handle,
+		    name, namelen, &id, cr);
+
+		if (error == EINPROGRESS) {
+			error = so_wait_connected(so,
+			    fflag & (FNONBLOCK|FNDELAY), id);
+		}
+	}
+
+	SO_UNBLOCK_FALLBACK(so);
+	return (error);
+}
+
+/*
+ * Generic accept operation.
+ *
+ * *nsop is cleared up front and is only set when a connection has been
+ * dequeued from the accept queue and the protocol's sd_accept downcall has
+ * succeeded.  A socket that is not accepting connections gets EOPNOTSUPP
+ * (SOCK_DGRAM / SOCK_RAW) or EINVAL (any other type).  If sd_accept
+ * fails, the dequeued socket is closed and destroyed here.
+ */
+/*ARGSUSED*/
+int
+so_accept(struct sonode *so, int fflag, struct cred *cr, struct sonode **nsop)
+{
+	int error = 0;
+	struct sonode *nso;
+
+	*nsop = NULL;
+
+	SO_BLOCK_FALLBACK(so, SOP_ACCEPT(so, fflag, cr, nsop));
+
+	if (!(so->so_state & SS_ACCEPTCONN)) {
+		SO_UNBLOCK_FALLBACK(so);
+		if (so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW)
+			return (EOPNOTSUPP);
+		return (EINVAL);
+	}
+
+	error = so_acceptq_dequeue(so, (fflag & (FNONBLOCK|FNDELAY)), &nso);
+	if (error != 0) {
+		SO_UNBLOCK_FALLBACK(so);
+		return (error);
+	}
+	ASSERT(nso != NULL);
+
+	/* Let the protocol complete the accept on the dequeued socket. */
+	error = (*so->so_downcalls->sd_accept)(so->so_proto_handle,
+	    nso->so_proto_handle, (sock_upper_handle_t)nso, cr);
+	if (error == 0) {
+		*nsop = nso;
+	} else {
+		(void) socket_close(nso, 0, cr);
+		socket_destroy(nso);
+	}
+
+	SO_UNBLOCK_FALLBACK(so);
+	return (error);
+}
+
+/*
+ * Generic sendmsg operation.
+ *
+ * Data described by `uiop' is passed to the protocol either directly
+ * (sd_send_uio, when the protocol provides it) or after being copied into
+ * mblks with socopyinuio() (sd_send).  The loop repeats until uio_resid
+ * reaches zero or an error occurs.
+ *
+ * Errors generated here:
+ *   EOPNOTSUPP  control data supplied without MSG_XPG4_2, or MSG_OOB on a
+ *               socket whose mode lacks SM_EXDATA
+ *   EMSGSIZE    atomic (SM_ATOMIC) send larger than sopp_maxpsz, unless
+ *               sopp_maxpsz is -1
+ *   EPIPE       SS_CANTSENDMORE is set
+ * Any pending socket error and any error from the wait or the downcalls
+ * is returned as well.
+ */
+int
+so_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
+ struct cred *cr)
+{
+ int error, flags;
+ boolean_t dontblock;
+ ssize_t orig_resid;
+ mblk_t *mp;
+
+ SO_BLOCK_FALLBACK(so, SOP_SENDMSG(so, msg, uiop, cr));
+
+ flags = msg->msg_flags;
+ error = 0;
+ /* Non-blocking if requested per call or through the file mode */
+ dontblock = (flags & MSG_DONTWAIT) ||
+ (uiop->uio_fmode & (FNONBLOCK|FNDELAY));
+
+ if (!(flags & MSG_XPG4_2) && msg->msg_controllen != 0) {
+ /*
+ * Old way of passing fd's is not supported
+ */
+ SO_UNBLOCK_FALLBACK(so);
+ return (EOPNOTSUPP);
+ }
+
+ /* A maxpsz of -1 disables the size limit for atomic sends */
+ if ((so->so_mode & SM_ATOMIC) &&
+ uiop->uio_resid > so->so_proto_props.sopp_maxpsz &&
+ so->so_proto_props.sopp_maxpsz != -1) {
+ SO_UNBLOCK_FALLBACK(so);
+ return (EMSGSIZE);
+ }
+
+ /*
+ * For atomic sends we will only do one iteration.
+ */
+ do {
+ if (so->so_state & SS_CANTSENDMORE) {
+ error = EPIPE;
+ break;
+ }
+
+ if (so->so_error != 0) {
+ mutex_enter(&so->so_lock);
+ error = sogeterr(so, B_TRUE);
+ mutex_exit(&so->so_lock);
+ if (error != 0)
+ break;
+ }
+
+ /*
+ * Send down OOB messages even if the send path is being
+ * flow controlled (assuming the protocol supports OOB data).
+ */
+ if (flags & MSG_OOB) {
+ if ((so->so_mode & SM_EXDATA) == 0) {
+ error = EOPNOTSUPP;
+ break;
+ }
+ } else if (so->so_snd_qfull) {
+ /*
+ * Need to wait until the protocol is ready to receive
+ * more data for transmission.
+ */
+ if ((error = so_snd_wait_qnotfull(so, dontblock)) != 0)
+ break;
+ }
+
+ /*
+ * Time to send data to the protocol. We either copy the
+ * data into mblks or pass the uio directly to the protocol.
+ * We decide what to do based on the available down calls.
+ */
+ if (so->so_downcalls->sd_send_uio != NULL) {
+ error = (*so->so_downcalls->sd_send_uio)
+ (so->so_proto_handle, uiop, msg, cr);
+ if (error != 0)
+ break;
+ } else {
+ /* save the resid in case of failure */
+ orig_resid = uiop->uio_resid;
+
+ /* On a NULL return socopyinuio() has stored the reason in error */
+ if ((mp = socopyinuio(uiop,
+ so->so_proto_props.sopp_maxpsz,
+ so->so_proto_props.sopp_wroff,
+ so->so_proto_props.sopp_maxblk,
+ so->so_proto_props.sopp_tail, &error)) == NULL) {
+ break;
+ }
+ ASSERT(uiop->uio_resid >= 0);
+
+ error = (*so->so_downcalls->sd_send)
+ (so->so_proto_handle, mp, msg, cr);
+ if (error != 0) {
+ /*
+ * The send failed. We do not have to free the
+ * mblks, because that is the protocol's
+ * responsibility. However, uio_resid must
+ * remain accurate, so adjust that here.
+ */
+ uiop->uio_resid = orig_resid;
+ break;
+ }
+ }
+ } while (uiop->uio_resid > 0);
+
+ SO_UNBLOCK_FALLBACK(so);
+
+ return (error);
+}
+
+/*
+ * Send an already-built mblk chain (*mpp) through the protocol's sd_send
+ * downcall.
+ *
+ * The chain is cut at b_cont boundaries so that each piece handed to the
+ * protocol stays within sopp_maxpsz where possible (a single mblk that is
+ * larger than the limit is still sent whole).  *mpp is advanced as pieces
+ * are sent, so on return it points at whatever part of the chain was not
+ * passed down: NULL after a complete send, the remainder after an error.
+ *
+ * Errors generated here: EOPNOTSUPP (protocol has no sd_send), EMSGSIZE
+ * (atomic socket and the message exceeds sopp_maxpsz, unless that is -1),
+ * EPIPE (SS_CANTSENDMORE).  Pending socket errors and errors from the
+ * wait or the downcall are returned as well.
+ */
+int
+so_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
+ struct cred *cr, mblk_t **mpp)
+{
+ int error;
+ boolean_t dontblock;
+ size_t size;
+ mblk_t *mp = *mpp;
+
+ SO_BLOCK_FALLBACK(so, SOP_SENDMBLK(so, msg, fflag, cr, mpp));
+
+ error = 0;
+ dontblock = (msg->msg_flags & MSG_DONTWAIT) ||
+ (fflag & (FNONBLOCK|FNDELAY));
+ size = msgdsize(mp);
+
+ if (so->so_downcalls->sd_send == NULL) {
+ SO_UNBLOCK_FALLBACK(so);
+ return (EOPNOTSUPP);
+ }
+
+ if ((so->so_mode & SM_ATOMIC) &&
+ size > so->so_proto_props.sopp_maxpsz &&
+ so->so_proto_props.sopp_maxpsz != -1) {
+ SO_UNBLOCK_FALLBACK(so);
+ return (EMSGSIZE);
+ }
+
+ while (mp != NULL) {
+ mblk_t *nmp, *last_mblk;
+ size_t mlen;
+
+ if (so->so_state & SS_CANTSENDMORE) {
+ error = EPIPE;
+ break;
+ }
+ if (so->so_error != 0) {
+ mutex_enter(&so->so_lock);
+ error = sogeterr(so, B_TRUE);
+ mutex_exit(&so->so_lock);
+ if (error != 0)
+ break;
+ }
+ if (so->so_snd_qfull) {
+ /*
+ * Need to wait until the protocol is ready to receive
+ * more data for transmission.
+ */
+ if ((error = so_snd_wait_qnotfull(so, dontblock)) != 0)
+ break;
+ }
+
+ /*
+ * We only allow so_maxpsz of data to be sent down to
+ * the protocol at a time.
+ */
+ mlen = MBLKL(mp);
+ nmp = mp->b_cont;
+ last_mblk = mp;
+ /*
+ * Walk the chain; once adding nmp would exceed the limit, cut
+ * the chain in front of it.  nmp is then the unsent remainder.
+ */
+ while (nmp != NULL) {
+ mlen += MBLKL(nmp);
+ if (mlen > so->so_proto_props.sopp_maxpsz) {
+ last_mblk->b_cont = NULL;
+ break;
+ }
+ last_mblk = nmp;
+ nmp = nmp->b_cont;
+ }
+
+ error = (*so->so_downcalls->sd_send)
+ (so->so_proto_handle, mp, msg, cr);
+ if (error != 0) {
+ /*
+ * The send failed. The protocol will free the mblks
+ * that were sent down. Let the caller deal with the
+ * rest.
+ */
+ *mpp = nmp;
+ break;
+ }
+
+ *mpp = mp = nmp;
+ }
+
+ SO_UNBLOCK_FALLBACK(so);
+
+ return (error);
+}
+
+/*
+ * Generic shutdown operation.
+ *
+ * A socket that is not connected is never passed to the protocol: it
+ * gets ENOTCONN, or 0 when the X/Open checks are disabled through
+ * xnet_skip_checks.  Otherwise the protocol's sd_shutdown downcall is
+ * invoked and, if it succeeds and the receive side is being shut down
+ * (how != SHUT_WR), the receive buffer is flushed once any active reader
+ * has finished.
+ *
+ * Returns 0 on success or an errno value.
+ */
+int
+so_shutdown(struct sonode *so, int how, struct cred *cr)
+{
+	/*
+	 * Must start out as 0: the not-connected path with xnet_skip_checks
+	 * set jumps to `done' without assigning error, which would
+	 * otherwise return an indeterminate value to the caller.
+	 */
+	int error = 0;
+
+	SO_BLOCK_FALLBACK(so, SOP_SHUTDOWN(so, how, cr));
+
+	/*
+	 * SunOS 4.X has no check for datagram sockets.
+	 * 5.X checks that it is connected (ENOTCONN)
+	 * X/Open requires that we check the connected state.
+	 */
+	if (!(so->so_state & SS_ISCONNECTED)) {
+		if (!xnet_skip_checks) {
+			error = ENOTCONN;
+			if (xnet_check_print) {
+				printf("sockfs: X/Open shutdown check "
+				    "caused ENOTCONN\n");
+			}
+		}
+		goto done;
+	}
+
+	error = ((*so->so_downcalls->sd_shutdown)(so->so_proto_handle,
+	    how, cr));
+
+	/*
+	 * Protocol agreed to shutdown. We need to flush the
+	 * receive buffer if the receive side is being shutdown.
+	 */
+	if (error == 0 && how != SHUT_WR) {
+		mutex_enter(&so->so_lock);
+		/* wait for active reader to finish */
+		(void) so_lock_read(so, 0);
+
+		so_rcv_flush(so);
+
+		so_unlock_read(so);
+		mutex_exit(&so->so_lock);
+	}
+
+done:
+	SO_UNBLOCK_FALLBACK(so);
+	return (error);
+}
+
+/*
+ * Generic getsockname operation: the request is answered entirely by the
+ * protocol's sd_getsockname downcall, whose result is returned unchanged.
+ */
+int
+so_getsockname(struct sonode *so, struct sockaddr *addr,
+    socklen_t *addrlen, struct cred *cr)
+{
+	int error;
+
+	SO_BLOCK_FALLBACK(so, SOP_GETSOCKNAME(so, addr, addrlen, cr));
+
+	error = (*so->so_downcalls->sd_getsockname)(so->so_proto_handle,
+	    addr, addrlen, cr);
+	SO_UNBLOCK_FALLBACK(so);
+
+	return (error);
+}
+
+/*
+ * Generic getpeername operation.
+ *
+ * When `accept' is set the protocol is asked directly, with no state
+ * checks.  Otherwise the socket must be connected (ENOTCONN if not) and,
+ * unless xnet_skip_checks is set, must still be able to send (EINVAL if
+ * SS_CANTSENDMORE is set) before the sd_getpeername downcall is made.
+ */
+int
+so_getpeername(struct sonode *so, struct sockaddr *addr,
+    socklen_t *addrlen, boolean_t accept, struct cred *cr)
+{
+	int error;
+
+	SO_BLOCK_FALLBACK(so, SOP_GETPEERNAME(so, addr, addrlen, accept, cr));
+
+	if (!accept && !(so->so_state & SS_ISCONNECTED)) {
+		error = ENOTCONN;
+	} else if (!accept && (so->so_state & SS_CANTSENDMORE) &&
+	    !xnet_skip_checks) {
+		/* Added this check for X/Open */
+		error = EINVAL;
+		if (xnet_check_print)
+			printf("sockfs: X/Open getpeername check => EINVAL\n");
+	} else {
+		error = (*so->so_downcalls->sd_getpeername)
+		    (so->so_proto_handle, addr, addrlen, cr);
+	}
+
+	SO_UNBLOCK_FALLBACK(so);
+	return (error);
+}
+
+/*
+ * Generic getsockopt operation.
+ *
+ * socket_getopt_common() gets the first chance; a negative return from
+ * it means the option was not handled there and the protocol's
+ * sd_getsockopt downcall is used instead.  If the protocol answers
+ * ENOPROTOOPT for one of the SOL_SOCKET options listed below, a zeroed
+ * value of the proper size is returned rather than an error.
+ *
+ * In every fallback case the caller's buffer length (*optlenp) is
+ * validated before optval is written; EINVAL is returned if the buffer
+ * is too small.  so_lock must not be held by the caller (asserted).
+ */
+int
+so_getsockopt(struct sonode *so, int level, int option_name,
+    void *optval, socklen_t *optlenp, int flags, struct cred *cr)
+{
+	int error = 0;
+
+	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
+	SO_BLOCK_FALLBACK(so,
+	    SOP_GETSOCKOPT(so, level, option_name, optval, optlenp, flags, cr));
+
+	error = socket_getopt_common(so, level, option_name, optval,
+	    optlenp);
+	if (error < 0) {
+		error = (*so->so_downcalls->sd_getsockopt)
+		    (so->so_proto_handle, level, option_name, optval, optlenp,
+		    cr);
+		if (error == ENOPROTOOPT && level == SOL_SOCKET) {
+			/*
+			 * If a protocol does not support a particular
+			 * socket option, set can fail (not allowed)
+			 * but get can not fail. This is the previous
+			 * sockfs behavior.
+			 */
+			switch (option_name) {
+			case SO_LINGER:
+				if (*optlenp < (t_uscalar_t)
+				    sizeof (struct linger)) {
+					error = EINVAL;
+					break;
+				}
+				error = 0;
+				bzero(optval, sizeof (struct linger));
+				*optlenp = sizeof (struct linger);
+				break;
+			case SO_RCVTIMEO:
+			case SO_SNDTIMEO:
+				if (*optlenp < (t_uscalar_t)
+				    sizeof (struct timeval)) {
+					error = EINVAL;
+					break;
+				}
+				error = 0;
+				bzero(optval, sizeof (struct timeval));
+				*optlenp = sizeof (struct timeval);
+				break;
+			case SO_SND_BUFINFO:
+				if (*optlenp < (t_uscalar_t)
+				    sizeof (struct so_snd_bufinfo)) {
+					error = EINVAL;
+					break;
+				}
+				error = 0;
+				bzero(optval,
+				    sizeof (struct so_snd_bufinfo));
+				*optlenp =
+				    sizeof (struct so_snd_bufinfo);
+				break;
+			case SO_DEBUG:
+			case SO_REUSEADDR:
+			case SO_KEEPALIVE:
+			case SO_DONTROUTE:
+			case SO_BROADCAST:
+			case SO_USELOOPBACK:
+			case SO_OOBINLINE:
+			case SO_DGRAM_ERRIND:
+			case SO_SNDBUF:
+			case SO_RCVBUF:
+				/*
+				 * Same length check as the cases above:
+				 * never store an int32_t into a buffer
+				 * the caller said is shorter than that.
+				 */
+				if (*optlenp < (t_uscalar_t)
+				    sizeof (int32_t)) {
+					error = EINVAL;
+					break;
+				}
+				error = 0;
+				*((int32_t *)optval) = 0;
+				*optlenp = sizeof (int32_t);
+				break;
+			default:
+				/* leave ENOPROTOOPT for anything else */
+				break;
+			}
+		}
+	}
+
+	SO_UNBLOCK_FALLBACK(so);
+	return (error);
+}
+
+/*
+ * Generic setsockopt operation.
+ *
+ * SOL_SOCKET SO_RCVTIMEO and SO_SNDTIMEO are handled here: the supplied
+ * struct timeval is converted to microseconds, then to ticks with
+ * drv_usectohz(), and stored in so_rcvtimeo / so_sndtimeo under so_lock.
+ * Every other option is passed to the protocol's sd_setsockopt downcall.
+ *
+ * Returns EINVAL if the X/Open state check fails (SS_CANTSENDMORE set and
+ * xnet_skip_checks clear) or if a timeout option has the wrong length.
+ */
+int
+so_setsockopt(struct sonode *so, int level, int option_name,
+    const void *optval, socklen_t optlen, struct cred *cr)
+{
+	int error = 0;
+
+	SO_BLOCK_FALLBACK(so,
+	    SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr));
+
+	/* X/Open requires this check */
+	if ((so->so_state & SS_CANTSENDMORE) != 0 && !xnet_skip_checks) {
+		SO_UNBLOCK_FALLBACK(so);
+		if (xnet_check_print)
+			printf("sockfs: X/Open setsockopt check => EINVAL\n");
+		return (EINVAL);
+	}
+
+	if (level != SOL_SOCKET ||
+	    (option_name != SO_RCVTIMEO && option_name != SO_SNDTIMEO)) {
+		/* Not a timeout option; the protocol deals with it. */
+		error = (*so->so_downcalls->sd_setsockopt)
+		    (so->so_proto_handle, level, option_name, optval, optlen,
+		    cr);
+		SO_UNBLOCK_FALLBACK(so);
+		return (error);
+	}
+
+	if (optlen != (t_uscalar_t)sizeof (struct timeval)) {
+		error = EINVAL;
+	} else {
+		struct timeval *tvp = (struct timeval *)optval;
+		clock_t usecs = tvp->tv_sec * 1000 * 1000 + tvp->tv_usec;
+
+		mutex_enter(&so->so_lock);
+		if (option_name == SO_SNDTIMEO)
+			so->so_sndtimeo = drv_usectohz(usecs);
+		else
+			so->so_rcvtimeo = drv_usectohz(usecs);
+		mutex_exit(&so->so_lock);
+	}
+
+	SO_UNBLOCK_FALLBACK(so);
+	return (error);
+}
+
+/*
+ * Generic ioctl operation.
+ *
+ * A pending socket error is collected and returned first.  Otherwise the
+ * command is offered, in order, to socket_ioctl_common(),
+ * socket_strioc_common() and finally the protocol's sd_ioctl downcall;
+ * each later handler is tried only when the one before it returned a
+ * negative value.
+ */
+int
+so_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
+    struct cred *cr, int32_t *rvalp)
+{
+	int error = 0;
+
+	SO_BLOCK_FALLBACK(so, SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
+
+	/*
+	 * Report an error that is already pending on the socket, e.g.
+	 * one caused by an earlier non-blocking operation.
+	 */
+	if (so->so_error != 0) {
+		mutex_enter(&so->so_lock);
+		error = sogeterr(so, B_TRUE);
+		mutex_exit(&so->so_lock);
+	}
+
+	if (error == 0) {
+		error = socket_ioctl_common(so, cmd, arg, mode, cr, rvalp);
+
+		/*
+		 * calling strioc can result in the socket falling back to
+		 * TPI, if that is supported.
+		 */
+		if (error < 0) {
+			error = socket_strioc_common(so, cmd, arg, mode, cr,
+			    rvalp);
+		}
+		if (error < 0) {
+			error = (*so->so_downcalls->sd_ioctl)
+			    (so->so_proto_handle, cmd, arg, mode, rvalp, cr);
+		}
+	}
+
+	SO_UNBLOCK_FALLBACK(so);
+
+	return (error);
+}
+
+/*
+ * Generic poll operation.
+ *
+ * Stores in *reventsp the subset of `events' that is currently ready.
+ * If nothing is ready and `anyyet' is zero, *phpp is set to the socket's
+ * poll list; before that, when read events were requested, readability is
+ * re-checked under so_lock and SO_POLLEV_IN is recorded in so_pollev if
+ * there is still nothing to read.  Always returns 0.
+ */
+int
+so_poll(struct sonode *so, short events, int anyyet, short *reventsp,
+ struct pollhead **phpp)
+{
+ /* Snapshot of so_state, taken without holding so_lock */
+ int state = so->so_state;
+ *reventsp = 0;
+
+ /* A pending error makes every requested read/write event ready */
+ if (so->so_error != 0 &&
+ ((POLLIN|POLLRDNORM|POLLOUT) & events) != 0) {
+ *reventsp = (POLLIN|POLLRDNORM|POLLOUT) & events;
+ return (0);
+ }
+
+ /*
+ * As long as there is buffer to send data, and the socket is
+ * in a state where it can send data (i.e., connected for
+ * connection oriented protocols), then turn on POLLOUT events
+ */
+ if (!so->so_snd_qfull && ((so->so_mode & SM_CONNREQUIRED) == 0 ||
+ state & SS_ISCONNECTED)) {
+ *reventsp |= POLLOUT & events;
+ }
+
+ /*
+ * Turn on POLLIN whenever there is data on the receive queue,
+ * or the socket is in a state where no more data will be received.
+ * Also, if the socket is accepting connections, flip the bit if
+ * there is something on the queue.
+ */
+
+ /* Pending connections */
+ if (so->so_acceptq_len > 0)
+ *reventsp |= (POLLIN|POLLRDNORM) & events;
+
+ /* Data */
+ /* so_downcalls is null for sctp */
+ if (so->so_downcalls != NULL && so->so_downcalls->sd_poll != NULL) {
+ /* The protocol reports the SO_PROTO_POLLEV events itself */
+ *reventsp |= (*so->so_downcalls->sd_poll)
+ (so->so_proto_handle, events & SO_PROTO_POLLEV, anyyet,
+ CRED()) & events;
+ ASSERT((*reventsp & ~events) == 0);
+ /* do not recheck events */
+ events &= ~SO_PROTO_POLLEV;
+ } else {
+ if (SO_HAVE_DATA(so))
+ *reventsp |= (POLLIN|POLLRDNORM) & events;
+
+ /* Urgent data */
+ if ((state & SS_OOBPEND) != 0)
+ *reventsp |= (POLLRDBAND) & events;
+ }
+
+ if (!*reventsp && !anyyet) {
+ /* Check for read events again, but this time under lock */
+ if (events & (POLLIN|POLLRDNORM)) {
+ mutex_enter(&so->so_lock);
+ if (SO_HAVE_DATA(so) || so->so_acceptq_len > 0) {
+ mutex_exit(&so->so_lock);
+ *reventsp |= (POLLIN|POLLRDNORM) & events;
+ return (0);
+ } else {
+ /* Nothing to read yet: note that a reader is polling */
+ so->so_pollev |= SO_POLLEV_IN;
+ mutex_exit(&so->so_lock);
+ }
+ }
+ *phpp = &so->so_poll_list;
+ }
+ return (0);
+}
+
+/*
+ * Generic Upcalls
+ */
+/*
+ * Upcall: the protocol reports that connection `id' is established.
+ *
+ * Records the connection id and, when a peer credential is supplied,
+ * replaces the socket's current one (a hold is taken on the new
+ * credential) together with the peer pid.  so_lock is entered here and
+ * not exited; NOTE(review): so_notify_connected() is presumed to drop it,
+ * as the notify functions are described as doing elsewhere in this file.
+ */
+void
+so_connected(sock_upper_handle_t sock_handle, sock_connid_t id,
+    cred_t *peer_cred, pid_t peer_cpid)
+{
+	struct sonode *so;
+
+	so = (struct sonode *)sock_handle;
+
+	mutex_enter(&so->so_lock);
+	ASSERT(so->so_proto_handle != NULL);
+
+	so->so_proto_connid = id;
+
+	if (peer_cred != NULL) {
+		/* Drop the previous peer credential, if there was one. */
+		if (so->so_peercred != NULL)
+			crfree(so->so_peercred);
+		crhold(peer_cred);
+		so->so_peercred = peer_cred;
+		so->so_cpid = peer_cpid;
+	}
+
+	soisconnected(so);
+
+	/* Wake up anyone waiting for the connection to be established. */
+	so_notify_connected(so);
+}
+
+/*
+ * Upcall: the protocol reports that connection `id' is gone, with `error'
+ * as the reason.  Records the connection id, marks the socket
+ * disconnected and notifies waiters.  so_lock is entered here and not
+ * exited; NOTE(review): so_notify_disconnected() is presumed to drop it.
+ * Always returns 0.
+ */
+int
+so_disconnected(sock_upper_handle_t sock_handle, sock_connid_t id, int error)
+{
+	struct sonode *so;
+
+	so = (struct sonode *)sock_handle;
+
+	mutex_enter(&so->so_lock);
+	so->so_proto_connid = id;
+	soisdisconnected(so, error);
+
+	so_notify_disconnected(so, error);
+	return (0);
+}
+
+/*
+ * Upcall: the protocol asks for a change of socket state.
+ *
+ *   SOCK_OPCTL_SHUT_SEND    mark the socket unable to send and notify
+ *   SOCK_OPCTL_SHUT_RECV    mark the socket unable to receive and notify
+ *   SOCK_OPCTL_ENAB_ACCEPT  set SS_ACCEPTCONN; `arg' is the backlog
+ *
+ * For the two shutdown actions so_lock is entered here and not exited;
+ * NOTE(review): the notify call is presumed to drop it.  Any other action
+ * trips an assertion.
+ */
+void
+so_opctl(sock_upper_handle_t sock_handle, sock_opctl_action_t action,
+    uintptr_t arg)
+{
+	struct sonode *so = (struct sonode *)sock_handle;
+
+	if (action == SOCK_OPCTL_SHUT_SEND) {
+		mutex_enter(&so->so_lock);
+		socantsendmore(so);
+		so_notify_disconnecting(so);
+	} else if (action == SOCK_OPCTL_SHUT_RECV) {
+		mutex_enter(&so->so_lock);
+		socantrcvmore(so);
+		so_notify_eof(so);
+	} else if (action == SOCK_OPCTL_ENAB_ACCEPT) {
+		mutex_enter(&so->so_lock);
+		so->so_state |= SS_ACCEPTCONN;
+		so->so_backlog = (unsigned int)arg;
+		mutex_exit(&so->so_lock);
+	} else {
+		ASSERT(0);
+	}
+}
+
+/*
+ * Upcall: the protocol reports a change in its transmit queue state.
+ * When the queue is no longer full, writers are notified as well; so_lock
+ * is entered for that and not exited here (NOTE(review):
+ * so_notify_writable() is presumed to drop it).
+ */
+void
+so_txq_full(sock_upper_handle_t sock_handle, boolean_t qfull)
+{
+	struct sonode *so = (struct sonode *)sock_handle;
+
+	if (!qfull) {
+		so_snd_qnotfull(so);
+		mutex_enter(&so->so_lock);
+		so_notify_writable(so);
+	} else {
+		so_snd_qfull(so);
+	}
+}
+
+/*
+ * Upcall: the protocol announces a new incoming connection on the
+ * listener `parenthandle'.
+ *
+ * Returns NULL, leaving *sock_upcallsp untouched, if the listener is not
+ * accepting connections, its accept queue has reached the backlog, or no
+ * sonode could be created without sleeping.  Otherwise the new sonode is
+ * placed on the listener's accept queue, waiters are notified,
+ * *sock_upcallsp is pointed at so_upcalls and the new socket's upper
+ * handle is returned.  A hold is taken on `peer_cred' when it is non-NULL.
+ */
+sock_upper_handle_t
+so_newconn(sock_upper_handle_t parenthandle,
+    sock_lower_handle_t proto_handle, sock_downcalls_t *sock_downcalls,
+    struct cred *peer_cred, pid_t peer_cpid, sock_upcalls_t **sock_upcallsp)
+{
+	struct sonode *so = (struct sonode *)parenthandle;
+	struct sonode *nso;
+	int error;
+
+	ASSERT(proto_handle != NULL);
+
+	/* Not listening. */
+	if ((so->so_state & SS_ACCEPTCONN) == 0)
+		return (NULL);
+
+	/* Accept queue has reached the backlog. */
+	if (so->so_acceptq_len >= so->so_backlog)
+		return (NULL);
+
+	nso = socket_newconn(so, proto_handle, sock_downcalls, SOCKET_NOSLEEP,
+	    &error);
+	if (nso == NULL)
+		return (NULL);
+
+	if (peer_cred != NULL) {
+		crhold(peer_cred);
+		nso->so_peercred = peer_cred;
+		nso->so_cpid = peer_cpid;
+	}
+
+	(void) so_acceptq_enqueue(so, nso);
+
+	/* NOTE(review): so_notify_newconn() is presumed to drop so_lock. */
+	mutex_enter(&so->so_lock);
+	so_notify_newconn(so);
+
+	*sock_upcallsp = &so_upcalls;
+
+	return ((sock_upper_handle_t)nso);
+}
+
+/*
+ * Upcall: the protocol updates socket properties.
+ *
+ * Each bit set in soppp->sopp_flags selects one field of *soppp to copy
+ * into so->so_proto_props; the copy is done under so_lock.  The zero-copy
+ * flags are translated (ZCVMSAFE and ZCVMUNSAFE are mutually exclusive,
+ * COPYCACHED is only ever turned on).
+ *
+ * On DEBUG kernels the known bits are then cleared from soppp->sopp_flags
+ * -- note that this writes to the caller's structure -- and the function
+ * asserts that no unknown bit is left.
+ */
+void
+so_set_prop(sock_upper_handle_t sock_handle, struct sock_proto_props *soppp)
+{
+	struct sonode *so = (struct sonode *)sock_handle;
+	struct sock_proto_props *props = &so->so_proto_props;
+
+	mutex_enter(&so->so_lock);
+
+	if (soppp->sopp_flags & SOCKOPT_MAXBLK)
+		props->sopp_maxblk = soppp->sopp_maxblk;
+	if (soppp->sopp_flags & SOCKOPT_WROFF)
+		props->sopp_wroff = soppp->sopp_wroff;
+	if (soppp->sopp_flags & SOCKOPT_TAIL)
+		props->sopp_tail = soppp->sopp_tail;
+	if (soppp->sopp_flags & SOCKOPT_RCVHIWAT)
+		props->sopp_rxhiwat = soppp->sopp_rxhiwat;
+	if (soppp->sopp_flags & SOCKOPT_RCVLOWAT)
+		props->sopp_rxlowat = soppp->sopp_rxlowat;
+	if (soppp->sopp_flags & SOCKOPT_MAXPSZ)
+		props->sopp_maxpsz = soppp->sopp_maxpsz;
+	if (soppp->sopp_flags & SOCKOPT_MINPSZ)
+		props->sopp_minpsz = soppp->sopp_minpsz;
+
+	if (soppp->sopp_flags & SOCKOPT_ZCOPY) {
+		if (soppp->sopp_zcopyflag & ZCVMSAFE) {
+			props->sopp_zcopyflag |= STZCVMSAFE;
+			props->sopp_zcopyflag &= ~STZCVMUNSAFE;
+		} else if (soppp->sopp_zcopyflag & ZCVMUNSAFE) {
+			props->sopp_zcopyflag |= STZCVMUNSAFE;
+			props->sopp_zcopyflag &= ~STZCVMSAFE;
+		}
+
+		if (soppp->sopp_zcopyflag & COPYCACHED)
+			props->sopp_zcopyflag |= STRCOPYCACHED;
+	}
+
+	if (soppp->sopp_flags & SOCKOPT_OOBINLINE)
+		props->sopp_oobinline = soppp->sopp_oobinline;
+	if (soppp->sopp_flags & SOCKOPT_RCVTIMER)
+		props->sopp_rcvtimer = soppp->sopp_rcvtimer;
+	if (soppp->sopp_flags & SOCKOPT_RCVTHRESH)
+		props->sopp_rcvthresh = soppp->sopp_rcvthresh;
+	if (soppp->sopp_flags & SOCKOPT_MAXADDRLEN)
+		props->sopp_maxaddrlen = soppp->sopp_maxaddrlen;
+
+	mutex_exit(&so->so_lock);
+
+#ifdef DEBUG
+	/* Every flag handled above is cleared; anything left is unknown. */
+	soppp->sopp_flags &= ~(SOCKOPT_MAXBLK | SOCKOPT_WROFF | SOCKOPT_TAIL |
+	    SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | SOCKOPT_MAXPSZ |
+	    SOCKOPT_ZCOPY | SOCKOPT_OOBINLINE | SOCKOPT_RCVTIMER |
+	    SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ);
+	ASSERT(soppp->sopp_flags == 0);
+#endif
+}
+
+/*
+ * Upcall: the protocol hands received data to the socket, or queries it.
+ *
+ *   mp == NULL, msg_size > 0   data is held by the protocol (which must
+ *                              provide sd_recv_uio); only notify readers.
+ *                              Returns 0.
+ *   mp == NULL, msg_size == 0  receive space query.
+ *   mp != NULL                 queue the message; MSG_OOB data is routed
+ *                              to so_queue_oob() and 0 is returned.
+ *
+ * Return value and *errorp:
+ *   - normally the space left in the receive buffer (so_rcvbuf minus
+ *     so_rcv_queued);
+ *   - -1 with *errorp = ENOSPC when no space is left; so_flowctrld is set;
+ *   - -1 with *errorp = EOPNOTSUPP when a TPI fallback is pending or
+ *     complete; mp is NOT freed in that case;
+ *   - 0 when the socket cannot receive any more (mp is freed).
+ *
+ * An M_PROTO message whose read pointer is not aligned for TPI is logged
+ * and dropped.
+ *
+ * If force_pushp is non-NULL it supplies the initial "push" decision and
+ * is set to B_TRUE when readers were notified.
+ */
+/* ARGSUSED */
+ssize_t
+so_queue_msg(sock_upper_handle_t sock_handle, mblk_t *mp,
+    size_t msg_size, int flags, int *errorp, boolean_t *force_pushp)
+{
+	struct sonode *so = (struct sonode *)sock_handle;
+	boolean_t force_push = B_TRUE;
+	int space_left;
+	sodirect_t *sodp = so->so_direct;
+
+	ASSERT(errorp != NULL);
+	*errorp = 0;
+	if (mp == NULL) {
+		if (msg_size > 0) {
+			ASSERT(so->so_downcalls->sd_recv_uio != NULL);
+			mutex_enter(&so->so_lock);
+			/* the notify functions will drop the lock */
+			if (flags & MSG_OOB)
+				so_notify_oobdata(so, IS_SO_OOB_INLINE(so));
+			else
+				so_notify_data(so, msg_size);
+			return (0);
+		}
+		/*
+		 * recv space check
+		 */
+		mutex_enter(&so->so_lock);
+		space_left = so->so_rcvbuf - so->so_rcv_queued;
+		if (space_left <= 0) {
+			so->so_flowctrld = B_TRUE;
+			*errorp = ENOSPC;
+			space_left = -1;
+		}
+		goto done_unlock;
+	}
+
+	ASSERT(mp->b_next == NULL);
+	ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_PROTO);
+	ASSERT(msg_size == msgdsize(mp));
+
+	if (flags & MSG_OOB) {
+		so_queue_oob(sock_handle, mp, msg_size);
+		return (0);
+	}
+
+	if (force_pushp != NULL)
+		force_push = *force_pushp;
+
+	if (DB_TYPE(mp) == M_PROTO && !__TPI_PRIM_ISALIGNED(mp->b_rptr)) {
+		/* The read pointer is not aligned correctly for TPI */
+		zcmn_err(getzoneid(), CE_WARN,
+		    "sockfs: Unaligned TPI message received. rptr = %p\n",
+		    (void *)mp->b_rptr);
+		freemsg(mp);
+		/*
+		 * so_direct may be NULL (this function tests for that
+		 * further down), so do not reach through sodp for the
+		 * lock unless it is actually there.
+		 */
+		if (sodp != NULL) {
+			mutex_enter(sodp->sod_lockp);
+			SOD_UIOAFINI(sodp);
+			mutex_exit(sodp->sod_lockp);
+		}
+
+		return (so->so_rcvbuf - so->so_rcv_queued);
+	}
+
+	mutex_enter(&so->so_lock);
+	if (so->so_state & (SS_FALLBACK_PENDING | SS_FALLBACK_COMP)) {
+		/* mp stays with the caller on this path */
+		SOD_DISABLE(sodp);
+		mutex_exit(&so->so_lock);
+		*errorp = EOPNOTSUPP;
+		return (-1);
+	}
+	if (so->so_state & SS_CANTRCVMORE) {
+		freemsg(mp);
+		SOD_DISABLE(sodp);
+		mutex_exit(&so->so_lock);
+		return (0);
+	}
+
+	/* process the mblk via I/OAT if capable */
+	if (sodp != NULL && (sodp->sod_state & SOD_ENABLED)) {
+		if (DB_TYPE(mp) == M_DATA) {
+			(void) sod_uioa_mblk_init(sodp, mp, msg_size);
+		} else {
+			SOD_UIOAFINI(sodp);
+		}
+	}
+
+	if (mp->b_next == NULL) {
+		so_enqueue_msg(so, mp, msg_size);
+	} else {
+		/* A b_next chain: enqueue each message on its own */
+		do {
+			mblk_t *nmp;
+
+			if ((nmp = mp->b_next) != NULL) {
+				mp->b_next = NULL;
+			}
+			so_enqueue_msg(so, mp, msgdsize(mp));
+			mp = nmp;
+		} while (mp != NULL);
+	}
+
+	space_left = so->so_rcvbuf - so->so_rcv_queued;
+	if (space_left <= 0) {
+		so->so_flowctrld = B_TRUE;
+		*errorp = ENOSPC;
+		space_left = -1;
+	}
+
+	if (force_push || so->so_rcv_queued >= so->so_rcv_thresh ||
+	    so->so_rcv_queued >= so->so_rcv_wanted ||
+	    (sodp != NULL && so->so_rcv_queued >= sodp->sod_want)) {
+		SOCKET_TIMER_CANCEL(so);
+		/*
+		 * so_notify_data will release the lock
+		 */
+		so_notify_data(so, so->so_rcv_queued);
+
+		if (force_pushp != NULL)
+			*force_pushp = B_TRUE;
+		goto done;
+	} else if (so->so_rcv_timer_tid == 0) {
+		/* Make sure the recv push timer is running */
+		SOCKET_TIMER_START(so);
+	}
+
+done_unlock:
+	mutex_exit(&so->so_lock);
+done:
+	return (space_left);
+}
+
+/*
+ * Upcall: urgent data is on its way.
+ *
+ * Records the position of the out-of-band mark, `offset' bytes past the
+ * data currently queued on the socket, discards any previously stored OOB
+ * message and raises the OOB signal notification (SIGURG).  so_lock is
+ * entered here and not exited; NOTE(review): so_notify_oobsig() is
+ * presumed to drop it.
+ */
+void
+so_signal_oob(sock_upper_handle_t sock_handle, ssize_t offset)
+{
+	struct sonode *so = (struct sonode *)sock_handle;
+
+	ASSERT(offset >= 0);
+	mutex_enter(&so->so_lock);
+	SOD_UIOAFINI(so->so_direct);
+
+	/*
+	 * Forget about any old urgent data and record that new urgent
+	 * data is pending.
+	 */
+	so->so_state = (so->so_state & ~(SS_HAVEOOBDATA|SS_HADOOBDATA)) |
+	    SS_OOBPEND;
+
+	if (so->so_oobmsg != NULL) {
+		dprintso(so, 1, ("sock: discarding old oob\n"));
+		freemsg(so->so_oobmsg);
+		so->so_oobmsg = NULL;
+	}
+
+	/* Position of the urgent byte within the receive stream. */
+	so->so_oobmark = so->so_rcv_queued + offset;
+	if (so->so_oobmark != 0)
+		so->so_state &= ~SS_RCVATMARK;
+	else
+		so->so_state |= SS_RCVATMARK;
+
+	so_notify_oobsig(so);
+}
+
+/*
+ * Store received out-of-band data.
+ *
+ * With OOB-inline enabled the message goes onto the normal receive queue;
+ * otherwise it is kept in so_oobmsg and SS_HAVEOOBDATA is set.  so_lock
+ * is entered here and not exited; NOTE(review): so_notify_oobdata() is
+ * presumed to drop it.
+ */
+static void
+so_queue_oob(sock_upper_handle_t sock_handle, mblk_t *mp, size_t len)
+{
+	struct sonode *so = (struct sonode *)sock_handle;
+
+	mutex_enter(&so->so_lock);
+	SOD_UIOAFINI(so->so_direct);
+
+	ASSERT(mp != NULL);
+	if (IS_SO_OOB_INLINE(so)) {
+		so_enqueue_msg(so, mp, len);
+	} else {
+		so->so_oobmsg = mp;
+		so->so_state |= SS_HAVEOOBDATA;
+	}
+
+	so_notify_oobdata(so, IS_SO_OOB_INLINE(so));
+}
+
+/*
+ * Generic close operation: close the protocol end through sd_close, then
+ * flush whatever is left on the receive queue.  The receive queue is
+ * flushed regardless of the downcall's result, which is what this
+ * function returns.
+ */
+int
+so_close(struct sonode *so, int flag, struct cred *cr)
+{
+	int rc;
+
+	rc = (*so->so_downcalls->sd_close)(so->so_proto_handle, flag, cr);
+
+	/* The protocol makes no more upcalls from this point on. */
+	mutex_enter(&so->so_lock);
+	so_rcv_flush(so);
+	mutex_exit(&so->so_lock);
+
+	return (rc);
+}
+
+/*
+ * Upcall: zero-copy notification from the protocol.  Sets STZCNOTIFY in
+ * so_copyflag and wakes every thread waiting on so_copy_cv, all under
+ * so_lock.
+ */
+void
+so_zcopy_notify(sock_upper_handle_t sock_handle)
+{
+	struct sonode *so;
+
+	so = (struct sonode *)sock_handle;
+
+	mutex_enter(&so->so_lock);
+	so->so_copyflag |= STZCNOTIFY;
+	cv_broadcast(&so->so_copy_cv);
+	mutex_exit(&so->so_lock);
+}
+
+/*
+ * Upcall: the protocol reports an asynchronous error on the socket.  The
+ * error is recorded with soseterror() and waiters are notified.  so_lock
+ * is entered here and not exited; NOTE(review): so_notify_error() is
+ * presumed to drop it.
+ */
+void
+so_set_error(sock_upper_handle_t sock_handle, int error)
+{
+	struct sonode *so;
+
+	so = (struct sonode *)sock_handle;
+
+	mutex_enter(&so->so_lock);
+	soseterror(so, error);
+	so_notify_error(so);
+}
+
+/*
+ * so_recvmsg - read data from the socket
+ *
+ * There are two ways of obtaining data; either we ask the protocol to
+ * copy directly into the supplied buffer, or we copy data from the
+ * sonode's receive queue. The decision which one to use depends on
+ * whether the protocol has a sd_recv_uio down call.
+ *
+ * Arguments:
+ *   so   - socket to read from
+ *   msg  - on entry msg_flags carries the MSG_* request flags, and a
+ *          non-zero msg_namelen / msg_controllen means the caller wants
+ *          the source address / control data.  On return from the
+ *          receive-queue path msg_flags holds result flags (MSG_EOR,
+ *          MSG_TRUNC, MSG_CTRUNC) and msg_name / msg_control, when set,
+ *          point at kmem-allocated buffers of msg_namelen /
+ *          msg_controllen bytes (presumably freed by the caller --
+ *          confirm against the callers).
+ *   uiop - destination for the data
+ *   cr   - credentials, passed through to the protocol
+ *
+ * Returns 0 on success or an errno value.  When an error is returned no
+ * name or control buffer is left attached to msg.
+ */
+int
+so_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
+    struct cred *cr)
+{
+	rval_t rval;
+	int flags = 0;
+	t_uscalar_t controllen, namelen;
+	int error = 0;
+	int ret;
+	mblk_t *mctlp = NULL;
+	union T_primitives *tpr;
+	void *control;
+	ssize_t saved_resid;
+	struct uio *suiop;
+
+	SO_BLOCK_FALLBACK(so, SOP_RECVMSG(so, msg, uiop, cr));
+
+	if ((so->so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
+	    (so->so_mode & SM_CONNREQUIRED)) {
+		SO_UNBLOCK_FALLBACK(so);
+		return (ENOTCONN);
+	}
+
+	/* Waiting for the full amount makes no sense when only peeking. */
+	if (msg->msg_flags & MSG_PEEK)
+		msg->msg_flags &= ~MSG_WAITALL;
+
+	/* Atomic (datagram) sockets discard whatever does not fit. */
+	if (so->so_mode & SM_ATOMIC)
+		msg->msg_flags |= MSG_TRUNC;
+
+	if (msg->msg_flags & MSG_OOB) {
+		if ((so->so_mode & SM_EXDATA) == 0) {
+			error = EOPNOTSUPP;
+		} else if (so->so_downcalls->sd_recv_uio != NULL) {
+			error = (*so->so_downcalls->sd_recv_uio)
+			    (so->so_proto_handle, uiop, msg, cr);
+		} else {
+			error = sorecvoob(so, msg, uiop, msg->msg_flags,
+			    IS_SO_OOB_INLINE(so));
+		}
+		SO_UNBLOCK_FALLBACK(so);
+		return (error);
+	}
+
+	/*
+	 * If the protocol has the recv down call, then pass the request
+	 * down.
+	 */
+	if (so->so_downcalls->sd_recv_uio != NULL) {
+		error = (*so->so_downcalls->sd_recv_uio)
+		    (so->so_proto_handle, uiop, msg, cr);
+		SO_UNBLOCK_FALLBACK(so);
+		return (error);
+	}
+
+	/*
+	 * Reading data from the socket buffer
+	 */
+	flags = msg->msg_flags;
+	msg->msg_flags = 0;
+
+	/*
+	 * Set msg_controllen and msg_namelen to zero here to make it
+	 * simpler in the cases that no control or name is returned.
+	 */
+	controllen = msg->msg_controllen;
+	namelen = msg->msg_namelen;
+	msg->msg_controllen = 0;
+	msg->msg_namelen = 0;
+
+	mutex_enter(&so->so_lock);
+	/* Set SOREADLOCKED */
+	error = so_lock_read_intr(so,
+	    uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
+	mutex_exit(&so->so_lock);
+	if (error) {
+		SO_UNBLOCK_FALLBACK(so);
+		return (error);
+	}
+
+	suiop = sod_rcv_init(so, flags, &uiop);
+retry:
+	/* Remember the starting resid so progress can be detected below. */
+	saved_resid = uiop->uio_resid;
+	error = so_dequeue_msg(so, &mctlp, uiop, &rval, flags);
+	if (error != 0) {
+		goto out;
+	}
+	/*
+	 * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
+	 * For non-datagrams MOREDATA is used to set MSG_EOR.
+	 */
+	ASSERT(!(rval.r_val1 & MORECTL));
+	if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
+		msg->msg_flags |= MSG_TRUNC;
+	if (mctlp == NULL) {
+		dprintso(so, 1, ("so_recvmsg: got M_DATA\n"));
+
+		mutex_enter(&so->so_lock);
+		/* Set MSG_EOR based on MOREDATA */
+		if (!(rval.r_val1 & MOREDATA)) {
+			if (so->so_state & SS_SAVEDEOR) {
+				msg->msg_flags |= MSG_EOR;
+				so->so_state &= ~SS_SAVEDEOR;
+			}
+		}
+		/*
+		 * If some data was received (i.e. not EOF) and the
+		 * read/recv* has not been satisfied wait for some more.
+		 */
+		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
+		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
+			mutex_exit(&so->so_lock);
+			goto retry;
+		}
+
+		goto out_locked;
+	}
+	/* strsock_proto has already verified length and alignment */
+	tpr = (union T_primitives *)mctlp->b_rptr;
+	dprintso(so, 1, ("so_recvmsg: type %d\n", tpr->type));
+	switch (tpr->type) {
+	case T_DATA_IND: {
+		/*
+		 * Set msg_flags to MSG_EOR based on
+		 * MORE_flag and MOREDATA.
+		 */
+		mutex_enter(&so->so_lock);
+		so->so_state &= ~SS_SAVEDEOR;
+		if (!(tpr->data_ind.MORE_flag & 1)) {
+			if (!(rval.r_val1 & MOREDATA))
+				msg->msg_flags |= MSG_EOR;
+			else
+				so->so_state |= SS_SAVEDEOR;
+		}
+		freemsg(mctlp);
+		/*
+		 * If some data was received (i.e. not EOF) and the
+		 * read/recv* has not been satisfied wait for some more.
+		 */
+		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
+		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
+			mutex_exit(&so->so_lock);
+			goto retry;
+		}
+		goto out_locked;
+	}
+	case T_UNITDATA_IND: {
+		void *addr;
+		t_uscalar_t addrlen;
+		void *abuf;
+		t_uscalar_t optlen;
+		void *opt;
+
+		if (namelen != 0) {
+			/* Caller wants source address */
+			addrlen = tpr->unitdata_ind.SRC_length;
+			addr = sogetoff(mctlp, tpr->unitdata_ind.SRC_offset,
+			    addrlen, 1);
+			if (addr == NULL) {
+				freemsg(mctlp);
+				error = EPROTO;
+				eprintsoline(so, error);
+				goto out;
+			}
+			ASSERT(so->so_family != AF_UNIX);
+		}
+		optlen = tpr->unitdata_ind.OPT_length;
+		if (optlen != 0) {
+			t_uscalar_t ncontrollen;
+
+			/*
+			 * Extract any source address option.
+			 * Determine how large cmsg buffer is needed.
+			 */
+			opt = sogetoff(mctlp, tpr->unitdata_ind.OPT_offset,
+			    optlen, __TPI_ALIGN_SIZE);
+
+			if (opt == NULL) {
+				freemsg(mctlp);
+				error = EPROTO;
+				eprintsoline(so, error);
+				goto out;
+			}
+			if (so->so_family == AF_UNIX)
+				so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
+			ncontrollen = so_cmsglen(mctlp, opt, optlen,
+			    !(flags & MSG_XPG4_2));
+			/*
+			 * Only build control data when the caller asked for
+			 * it; otherwise just report that some was dropped.
+			 */
+			if (controllen != 0)
+				controllen = ncontrollen;
+			else if (ncontrollen != 0)
+				msg->msg_flags |= MSG_CTRUNC;
+		} else {
+			controllen = 0;
+		}
+
+		if (namelen != 0) {
+			/*
+			 * Return address to caller.
+			 * Caller handles truncation if length
+			 * exceeds msg_namelen.
+			 * NOTE: AF_UNIX NUL termination is ensured by
+			 * the sender's copyin_name().
+			 */
+			abuf = kmem_alloc(addrlen, KM_SLEEP);
+
+			bcopy(addr, abuf, addrlen);
+			msg->msg_name = abuf;
+			msg->msg_namelen = addrlen;
+		}
+
+		if (controllen != 0) {
+			/*
+			 * Return control msg to caller.
+			 * Caller handles truncation if length
+			 * exceeds msg_controllen.
+			 */
+			control = kmem_zalloc(controllen, KM_SLEEP);
+
+			error = so_opt2cmsg(mctlp, opt, optlen,
+			    !(flags & MSG_XPG4_2), control, controllen);
+			if (error) {
+				freemsg(mctlp);
+				if (msg->msg_namelen != 0) {
+					kmem_free(msg->msg_name,
+					    msg->msg_namelen);
+					/*
+					 * Do not hand a freed buffer back
+					 * to the caller.
+					 */
+					msg->msg_name = NULL;
+					msg->msg_namelen = 0;
+				}
+				kmem_free(control, controllen);
+				eprintsoline(so, error);
+				goto out;
+			}
+			msg->msg_control = control;
+			msg->msg_controllen = controllen;
+		}
+
+		freemsg(mctlp);
+		goto out;
+	}
+	case T_OPTDATA_IND: {
+		void *opt;
+		t_uscalar_t optlen;
+
+		/* Read the message through its own T_optdata_ind layout. */
+		optlen = tpr->optdata_ind.OPT_length;
+		if (optlen != 0) {
+			t_uscalar_t ncontrollen;
+			/*
+			 * Determine how large cmsg buffer is needed.
+			 */
+			opt = sogetoff(mctlp,
+			    tpr->optdata_ind.OPT_offset, optlen,
+			    __TPI_ALIGN_SIZE);
+
+			if (opt == NULL) {
+				freemsg(mctlp);
+				error = EPROTO;
+				eprintsoline(so, error);
+				goto out;
+			}
+
+			ncontrollen = so_cmsglen(mctlp, opt, optlen,
+			    !(flags & MSG_XPG4_2));
+			if (controllen != 0)
+				controllen = ncontrollen;
+			else if (ncontrollen != 0)
+				msg->msg_flags |= MSG_CTRUNC;
+		} else {
+			controllen = 0;
+		}
+
+		if (controllen != 0) {
+			/*
+			 * Return control msg to caller.
+			 * Caller handles truncation if length
+			 * exceeds msg_controllen.
+			 */
+			control = kmem_zalloc(controllen, KM_SLEEP);
+
+			error = so_opt2cmsg(mctlp, opt, optlen,
+			    !(flags & MSG_XPG4_2), control, controllen);
+			if (error) {
+				freemsg(mctlp);
+				kmem_free(control, controllen);
+				eprintsoline(so, error);
+				goto out;
+			}
+			msg->msg_control = control;
+			msg->msg_controllen = controllen;
+		}
+
+		/*
+		 * Set msg_flags to MSG_EOR based on
+		 * DATA_flag and MOREDATA.
+		 */
+		mutex_enter(&so->so_lock);
+		so->so_state &= ~SS_SAVEDEOR;
+		if (!(tpr->optdata_ind.DATA_flag & 1)) {
+			if (!(rval.r_val1 & MOREDATA))
+				msg->msg_flags |= MSG_EOR;
+			else
+				so->so_state |= SS_SAVEDEOR;
+		}
+		freemsg(mctlp);
+		/*
+		 * If some data was received (i.e. not EOF) and the
+		 * read/recv* has not been satisfied wait for some more.
+		 * Not possible to wait if control info was received.
+		 */
+		if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
+		    controllen == 0 &&
+		    uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
+			mutex_exit(&so->so_lock);
+			goto retry;
+		}
+		goto out_locked;
+	}
+	default:
+		cmn_err(CE_CONT, "so_recvmsg bad type %x \n",
+		    tpr->type);
+		freemsg(mctlp);
+		error = EPROTO;
+		ASSERT(0);
+	}
+out:
+	mutex_enter(&so->so_lock);
+out_locked:
+	/* The sod_lockp points to the sonode so_lock */
+	ret = sod_rcv_done(so, suiop, uiop);
+	/* An earlier error takes precedence over the one from sod_rcv_done */
+	if (ret != 0 && error == 0)
+		error = ret;
+
+	so_unlock_read(so);	/* Clear SOREADLOCKED */
+	mutex_exit(&so->so_lock);
+
+	SO_UNBLOCK_FALLBACK(so);
+
+	return (error);
+}
+
+/*
+ * Socket operations vector for this socket implementation.  The
+ * initializer is positional, so the entries must stay in the order of
+ * the sonodeops_t members named in the trailing comments.
+ */
+sonodeops_t so_sonodeops = {
+ so_init, /* sop_init */
+ so_accept, /* sop_accept */
+ so_bind, /* sop_bind */
+ so_listen, /* sop_listen */
+ so_connect, /* sop_connect */
+ so_recvmsg, /* sop_recvmsg */
+ so_sendmsg, /* sop_sendmsg */
+ so_sendmblk, /* sop_sendmblk */
+ so_getpeername, /* sop_getpeername */
+ so_getsockname, /* sop_getsockname */
+ so_shutdown, /* sop_shutdown */
+ so_getsockopt, /* sop_getsockopt */
+ so_setsockopt, /* sop_setsockopt */
+ so_ioctl, /* sop_ioctl */
+ so_poll, /* sop_poll */
+ so_close, /* sop_close */
+};
+
+/*
+ * Upcall vector: the so_* routines collected here take a
+ * sock_upper_handle_t and cast it back to the sonode.  The initializer
+ * is positional, so the order must match the member order of
+ * sock_upcalls_t.
+ */
+sock_upcalls_t so_upcalls = {
+ so_newconn,
+ so_connected,
+ so_disconnected,
+ so_opctl,
+ so_queue_msg,
+ so_set_prop,
+ so_txq_full,
+ so_signal_oob,
+ so_zcopy_notify,
+ so_set_error
+};
diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c
new file mode 100644
index 0000000000..c1cfa6bf5f
--- /dev/null
+++ b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c
@@ -0,0 +1,1970 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/signal.h>
+#include <sys/cmn_err.h>
+
+#include <sys/stropts.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sockio.h>
+#include <sys/sodirect.h>
+#include <sys/strsubr.h>
+#include <sys/strsun.h>
+#include <sys/atomic.h>
+
+#include <fs/sockfs/sockcommon.h>
+#include <fs/sockfs/socktpi.h>
+#include <sys/ddi.h>
+#include <inet/ip.h>
+#include <sys/time.h>
+#include <sys/cmn_err.h>
+
+/* Test hooks, only declared when built with SOCK_TEST. */
+#ifdef SOCK_TEST
+extern int do_useracc;
+extern clock_t sock_test_timelimit;
+#endif /* SOCK_TEST */
+
+/*
+ * so_dequeue_msg() pulls a message up into a single mblk before copying
+ * it out when the leading mblk holds fewer than so_mblk_pull_len bytes.
+ */
+#define MBLK_PULL_LEN 64
+uint32_t so_mblk_pull_len = MBLK_PULL_LEN;
+
+/*
+ * DEBUG only: when so_debug_length is set, the receive queue routines
+ * ASSERT so_check_length() before and after touching the queues.
+ */
+#ifdef DEBUG
+boolean_t so_debug_length = B_FALSE;
+static boolean_t so_check_length(sonode_t *so);
+#endif
+
+/*
+ * Append nso to the tail of so's accept queue and wake one waiter.
+ * The caller must hold so_acceptq_lock.  Returns the queue length,
+ * counting the connection that was just added.
+ */
+int
+so_acceptq_enqueue_locked(struct sonode *so, struct sonode *nso)
+{
+	int qlen;
+
+	ASSERT(MUTEX_HELD(&so->so_acceptq_lock));
+	ASSERT(nso->so_acceptq_next == NULL);
+
+	/* so_acceptq_tail always addresses the link to be filled in next. */
+	*so->so_acceptq_tail = nso;
+	so->so_acceptq_tail = &nso->so_acceptq_next;
+	qlen = ++so->so_acceptq_len;
+
+	/* A single new entry only needs a single consumer. */
+	cv_signal(&so->so_acceptq_cv);
+
+	return (qlen);
+}
+
+/*
+ * int so_acceptq_enqueue(struct sonode *so, struct sonode *nso)
+ *
+ * Queue an incoming connection on a listening socket, taking
+ * so_acceptq_lock around the operation.
+ *
+ * Arguments:
+ *   so  - listening socket
+ *   nso - new connection
+ *
+ * Returns:
+ *   The number of connections on the queue, the new one included.
+ */
+int
+so_acceptq_enqueue(struct sonode *so, struct sonode *nso)
+{
+	int qlen;
+
+	mutex_enter(&so->so_acceptq_lock);
+	qlen = so_acceptq_enqueue_locked(so, nso);
+	mutex_exit(&so->so_acceptq_lock);
+
+	return (qlen);
+}
+
+/*
+ * Remove the connection at the head of so's accept queue, waiting for
+ * one to arrive unless dontblock is set.  The caller must hold
+ * so_acceptq_lock.  On success *nsop is the dequeued connection and 0
+ * is returned; otherwise *nsop is NULL and the result is EWOULDBLOCK
+ * (queue empty and dontblock set) or EINTR (socket closing or falling
+ * back, or the wait was interrupted by a signal).
+ */
+static int
+so_acceptq_dequeue_locked(struct sonode *so, boolean_t dontblock,
+    struct sonode **nsop)
+{
+	struct sonode *conn;
+
+	*nsop = NULL;
+	ASSERT(MUTEX_HELD(&so->so_acceptq_lock));
+
+	for (;;) {
+		conn = so->so_acceptq_head;
+		if (conn != NULL)
+			break;
+
+		/*
+		 * A listening socket cannot be reset or otherwise
+		 * disconnected, so so_error does not need to be looked
+		 * at; the only question is whether waiting is allowed.
+		 */
+		if (dontblock)
+			return (EWOULDBLOCK);
+		if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
+			return (EINTR);
+
+		if (cv_wait_sig_swap(&so->so_acceptq_cv,
+		    &so->so_acceptq_lock) == 0)
+			return (EINTR);
+	}
+
+	ASSERT(conn != NULL);
+	so->so_acceptq_head = conn->so_acceptq_next;
+	conn->so_acceptq_next = NULL;
+
+	/* The queue just became empty: aim the tail back at the head. */
+	if (so->so_acceptq_head == NULL) {
+		ASSERT(so->so_acceptq_tail == &conn->so_acceptq_next);
+		so->so_acceptq_tail = &so->so_acceptq_head;
+	}
+	ASSERT(so->so_acceptq_len > 0);
+	so->so_acceptq_len--;
+
+	*nsop = conn;
+
+	return (0);
+}
+
+/*
+ * int so_acceptq_dequeue(struct sonode *, boolean_t, struct sonode **)
+ *
+ * Take a connection off the accept queue, acquiring so_acceptq_lock
+ * for the duration.
+ *
+ * Arguments:
+ *   so        - listening socket
+ *   dontblock - B_TRUE if the call must not sleep when the queue is
+ *               empty
+ *   nsop      - value-return argument
+ *
+ * Return values:
+ *   0 when a connection was dequeued, with *nsop pointing at it.
+ *   A non-zero errno otherwise, with *nsop set to NULL.
+ *
+ * Note:
+ *   so_acceptq_dequeue() may return prematurely if the socket is
+ *   falling back to TPI.
+ */
+int
+so_acceptq_dequeue(struct sonode *so, boolean_t dontblock,
+    struct sonode **nsop)
+{
+	int rv;
+
+	mutex_enter(&so->so_acceptq_lock);
+	rv = so_acceptq_dequeue_locked(so, dontblock, nsop);
+	mutex_exit(&so->so_acceptq_lock);
+
+	return (rv);
+}
+
+/*
+ * void so_acceptq_flush(struct sonode *so)
+ *
+ * Destroy every connection still pending on a listening socket and
+ * reset the accept queue to empty.
+ *
+ * Arguments:
+ *   so - listening socket
+ *
+ * Return values:
+ *   None.
+ *
+ * Note:
+ *   No locking is done here.  The caller must make sure that
+ *   so_acceptq_enqueue() and so_acceptq_dequeue() cannot run while the
+ *   queue is being flushed: either the socket is in a state where no
+ *   operations can arrive, or so_lock is held.
+ */
+void
+so_acceptq_flush(struct sonode *so)
+{
+	struct sonode *nso, *next;
+
+	for (nso = so->so_acceptq_head; nso != NULL; nso = next) {
+		next = nso->so_acceptq_next;
+		nso->so_acceptq_next = NULL;
+		/*
+		 * A socket that is still on the accept queue has exactly
+		 * one reference; drop it and destroy the socket.
+		 */
+		ASSERT(nso->so_count == 1);
+		nso->so_count--;
+		socket_destroy(nso);
+	}
+
+	so->so_acceptq_head = NULL;
+	so->so_acceptq_tail = &so->so_acceptq_head;
+	so->so_acceptq_len = 0;
+}
+
+/*
+ * Wait, with so_lock held, until the protocol's connection id has caught
+ * up with id.  Returns 0 if the socket ended up connected, EINPROGRESS
+ * if nonblock is set and the attempt is still pending, EINTR if the
+ * socket is closing / falling back or the wait was interrupted, and
+ * otherwise the error left by the connection attempt.
+ */
+int
+so_wait_connected_locked(struct sonode *so, boolean_t nonblock,
+    sock_connid_t id)
+{
+	ASSERT(MUTEX_HELD(&so->so_lock));
+
+	/*
+	 * A new connection attempt is under way according to the
+	 * protocol.  Errors left behind by earlier attempts must be
+	 * cleared before we start waiting for its outcome.
+	 */
+	if (so->so_error != 0) {
+		if (SOCK_CONNID_LT(so->so_proto_connid, id))
+			so->so_error = 0;
+	}
+
+	for (;;) {
+		if (!SOCK_CONNID_LT(so->so_proto_connid, id))
+			break;
+
+		if (nonblock)
+			return (EINPROGRESS);
+
+		if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
+			return (EINTR);
+
+		if (cv_wait_sig_swap(&so->so_state_cv, &so->so_lock) == 0)
+			return (EINTR);
+	}
+
+	if (so->so_error != 0)
+		return (sogeterr(so, B_TRUE));
+
+	/*
+	 * A failed connect normally leaves its error in so_error, but
+	 * another thread may already have come in and consumed it.  Make
+	 * up a sensible error for that case.
+	 */
+	return ((so->so_state & SS_ISCONNECTED) ? 0 : ECONNREFUSED);
+}
+
+/*
+ * int so_wait_connected(struct sonode *so, boolean_t nonblock,
+ *    sock_connid_t id)
+ *
+ * Wait until the socket is connected or an error has occurred.
+ *
+ * Arguments:
+ *   so       - socket
+ *   nonblock - B_TRUE if the call must not sleep while the connection
+ *              is still being established
+ *   id       - connection id that the protocol returned when the
+ *              operation was started
+ *
+ * Returns:
+ *   0 if the connection attempt succeeded, otherwise an error saying
+ *   why it did not.
+ */
+int
+so_wait_connected(struct sonode *so, boolean_t nonblock, sock_connid_t id)
+{
+	int rv;
+
+	mutex_enter(&so->so_lock);
+	rv = so_wait_connected_locked(so, nonblock, id);
+	mutex_exit(&so->so_lock);
+
+	return (rv);
+}
+
+/*
+ * Wait, with so_lock held, until the send queue is no longer marked
+ * full (so_snd_qfull).
+ *
+ * Arguments:
+ *   so        - socket
+ *   dontblock - B_TRUE if the call must not sleep
+ *
+ * Returns:
+ *   0           the queue is not full
+ *   EPIPE       the socket can no longer send (SS_CANTSENDMORE)
+ *   EWOULDBLOCK the queue is full and dontblock is set
+ *   EINTR       the socket is closing or falling back, or the wait was
+ *               interrupted by a signal
+ *   EAGAIN      the send timeout (so_sndtimeo) expired; this is the
+ *               error POSIX specifies for an expired SO_SNDTIMEO
+ */
+int
+so_snd_wait_qnotfull_locked(struct sonode *so, boolean_t dontblock)
+{
+	int error;
+
+	ASSERT(MUTEX_HELD(&so->so_lock));
+	while (so->so_snd_qfull) {
+		if (so->so_state & SS_CANTSENDMORE)
+			return (EPIPE);
+		if (dontblock)
+			return (EWOULDBLOCK);
+
+		if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
+			return (EINTR);
+
+		if (so->so_sndtimeo == 0) {
+			/*
+			 * Zero means disable timeout.
+			 */
+			error = cv_wait_sig(&so->so_snd_cv, &so->so_lock);
+		} else {
+			clock_t now;
+
+			time_to_wait(&now, so->so_sndtimeo);
+			error = cv_timedwait_sig(&so->so_snd_cv, &so->so_lock,
+			    now);
+		}
+		/*
+		 * 0 means a signal arrived and -1 means the timeout
+		 * expired; any other value is a normal wakeup, so loop
+		 * and look at the queue again.
+		 */
+		if (error == 0)
+			return (EINTR);
+		else if (error == -1)
+			return (EAGAIN);
+	}
+	return (0);
+}
+
+/*
+ * int so_snd_wait_qnotfull(struct sonode *so, boolean_t dontblock)
+ *
+ * Wait for the transport to tell us that send buffers have become
+ * available.  so_snd_wakeup is set for the duration of the wait.
+ * Returns 0, or the error from so_snd_wait_qnotfull_locked().
+ */
+int
+so_snd_wait_qnotfull(struct sonode *so, boolean_t dontblock)
+{
+	int rv;
+
+	mutex_enter(&so->so_lock);
+	if (!so->so_snd_qfull) {
+		/* Room is available already; nothing to wait for. */
+		mutex_exit(&so->so_lock);
+		return (0);
+	}
+
+	so->so_snd_wakeup = B_TRUE;
+	rv = so_snd_wait_qnotfull_locked(so, dontblock);
+	so->so_snd_wakeup = B_FALSE;
+	mutex_exit(&so->so_lock);
+
+	return (rv);
+}
+
+/*
+ * Mark the socket's send queue as full.
+ */
+void
+so_snd_qfull(struct sonode *so)
+{
+	kmutex_t *lockp = &so->so_lock;
+
+	mutex_enter(lockp);
+	so->so_snd_qfull = B_TRUE;
+	mutex_exit(lockp);
+}
+
+/*
+ * Mark the socket's send queue as no longer full and wake all senders
+ * sleeping on so_snd_cv.
+ */
+void
+so_snd_qnotfull(struct sonode *so)
+{
+	kmutex_t *lockp = &so->so_lock;
+
+	mutex_enter(lockp);
+	so->so_snd_qfull = B_FALSE;
+	/* every thread waiting for buffers gets another try */
+	cv_broadcast(&so->so_snd_cv);
+	mutex_exit(lockp);
+}
+
+/*
+ * Set the process or process group that receives SIGIO for this socket.
+ * The caller must hold so_lock.  Returns 0, or the error from the
+ * permission check, in which case so_pgrp is left unchanged.
+ */
+int
+socket_chgpgrp(struct sonode *so, pid_t pid)
+{
+	int error = 0;
+
+	ASSERT(MUTEX_HELD(&so->so_lock));
+
+	/*
+	 * Sending signal 0 serves as the permission check.  Note that a
+	 * failing kill does a set_errno, which makes the system call
+	 * fail.
+	 */
+	if (pid != 0)
+		error = kill(pid, 0);
+
+	if (error == 0)
+		so->so_pgrp = pid;
+
+	return (error);
+}
+
+
+/*
+ * Post the signals for a socket event to one process.
+ *
+ * A 'writable' event (SOCKETSIG_WRITE) queues SIGPOLL with a siginfo
+ * structure whose code is POLL_OUT; a read event (SOCKETSIG_READ) sends
+ * a plain SIGPOLL and an urgent event (SOCKETSIG_URG) a plain SIGURG.
+ * event is a bit mask, so more than one signal may be posted.
+ *
+ * socket_sendsig() calls this with proc->p_lock held.
+ */
+/*ARGSUSED*/
+static void
+socket_sigproc(proc_t *proc, int event)
+{
+	k_siginfo_t info;
+
+	ASSERT(event & (SOCKETSIG_WRITE | SOCKETSIG_READ | SOCKETSIG_URG));
+
+	if (event & SOCKETSIG_WRITE) {
+		/*
+		 * Only a handful of fields are filled in below.  Clear the
+		 * whole structure first so that no uninitialized stack
+		 * contents are handed to sigaddq() along with them.
+		 */
+		bzero(&info, sizeof (info));
+		info.si_signo = SIGPOLL;
+		info.si_code = POLL_OUT;
+		info.si_errno = 0;
+		info.si_fd = 0;
+		info.si_band = 0;
+		sigaddq(proc, NULL, &info, KM_NOSLEEP);
+	}
+	if (event & SOCKETSIG_READ) {
+		sigtoproc(proc, NULL, SIGPOLL);
+	}
+	if (event & SOCKETSIG_URG) {
+		sigtoproc(proc, NULL, SIGURG);
+	}
+}
+
+/*
+ * Signal the owner of the socket (so_pgrp) about an event.  A positive
+ * so_pgrp names a single process, a negative one a process group.
+ * Nothing is sent when no owner is set, or when the socket is not in
+ * SS_ASYNC mode and the event is anything other than SOCKETSIG_URG.
+ * The caller must hold so_lock.
+ */
+void
+socket_sendsig(struct sonode *so, int event)
+{
+	proc_t *proc;
+
+	ASSERT(MUTEX_HELD(&so->so_lock));
+
+	if (so->so_pgrp == 0)
+		return;
+	if (event != SOCKETSIG_URG && !(so->so_state & SS_ASYNC))
+		return;
+
+	dprint(3, ("sending sig %d to %d\n", event, so->so_pgrp));
+
+	if (so->so_pgrp < 0) {
+		/*
+		 * Process group: pidlock stays held across all the
+		 * socket_sigproc() calls while the member list is walked.
+		 */
+		pid_t pgrp = -so->so_pgrp;
+
+		mutex_enter(&pidlock);
+		for (proc = pgfind(pgrp); proc != NULL;
+		    proc = proc->p_pglink) {
+			mutex_enter(&proc->p_lock);
+			socket_sigproc(proc, event);
+			mutex_exit(&proc->p_lock);
+		}
+		mutex_exit(&pidlock);
+		return;
+	}
+
+	/*
+	 * Single process.
+	 * XXX A signal is unfortunately still generated when a fd has
+	 * been closed but the proc is active.
+	 */
+	mutex_enter(&pidlock);
+	proc = prfind(so->so_pgrp);
+	if (proc == NULL) {
+		mutex_exit(&pidlock);
+		return;
+	}
+	mutex_enter(&proc->p_lock);
+	mutex_exit(&pidlock);
+	socket_sigproc(proc, event);
+	mutex_exit(&proc->p_lock);
+}
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+/*
+ * Copy user data into a newly allocated chain of mblks.
+ *
+ * At most iosize bytes are taken from uiop (INFPSZ, or a value larger
+ * than uio_resid, means everything that is left), in blocks of at most
+ * maxblk bytes (INFPSZ means a single block).  Every block is allocated
+ * with wroff bytes of headroom in front of the data and tail_len spare
+ * bytes behind it.
+ *
+ * Returns the chain with *errorp set to 0 on success.  If an allocation
+ * fails, *errorp is ENOMEM and whatever was built so far (possibly
+ * NULL) is returned.  If the copy itself fails, the chain is freed,
+ * *errorp is the uiomove() error and NULL is returned.
+ */
+mblk_t *
+socopyinuio(uio_t *uiop, ssize_t iosize, size_t wroff, ssize_t maxblk,
+    size_t tail_len, int *errorp)
+{
+	mblk_t *chain = NULL;
+	mblk_t **linkp = &chain;
+	mblk_t *bp;
+	ssize_t chunk;
+
+	ASSERT(iosize == INFPSZ || iosize > 0);
+
+	if (iosize == INFPSZ || iosize > uiop->uio_resid)
+		iosize = uiop->uio_resid;
+
+	if (maxblk == INFPSZ)
+		maxblk = iosize;
+
+	/* In these cases there is nothing to do. */
+	if (iosize < 0 || maxblk < 0 || (maxblk == 0 && iosize > 0)) {
+		*errorp = 0;
+		return (NULL);
+	}
+
+	/*
+	 * The loop body also runs once when iosize is 0: it allocates an
+	 * empty block and calls uiomove(9F), which simply returns.  An
+	 * extra check could avoid that, at the price of slowing down the
+	 * far more common case of iosize being larger than 0.
+	 */
+	for (;;) {
+		chunk = MIN(iosize, maxblk);
+		ASSERT(chunk >= 0);
+
+		bp = allocb(wroff + chunk + tail_len, BPRI_MED);
+		if (bp == NULL) {
+			*errorp = ENOMEM;
+			return (chain);
+		}
+		bp->b_rptr += wroff;
+		bp->b_wptr = bp->b_rptr + chunk;
+
+		*linkp = bp;
+		linkp = &bp->b_cont;
+
+		/* uiomove(9F) returns either 0 or EFAULT */
+		*errorp = uiomove(bp->b_rptr, (size_t)chunk, UIO_WRITE, uiop);
+		if (*errorp != 0) {
+			ASSERT(*errorp != ENOMEM);
+			freemsg(chain);
+			return (NULL);
+		}
+
+		iosize -= chunk;
+		if (iosize <= 0)
+			break;
+	}
+
+	*errorp = 0;
+	return (chain);
+}
+
+/*
+ * Copy data from the mblk chain mp out through uiop, freeing every mblk
+ * that has been completely consumed.
+ *
+ * max_read limits the number of bytes copied (INFPSZ, or a value larger
+ * than uio_resid, means as much as uiop has room for).
+ *
+ * Returns what is left of the chain (NULL once it is used up) with
+ * *errorp set to 0.  If uiomove() fails the remaining chain is freed,
+ * *errorp holds the error and NULL is returned.
+ */
+mblk_t *
+socopyoutuio(mblk_t *mp, struct uio *uiop, ssize_t max_read, int *errorp)
+{
+	ptrdiff_t n;
+	mblk_t *spent;
+	int rv;
+
+	ASSERT(mp->b_wptr >= mp->b_rptr);
+
+	/*
+	 * max_read is the offset of the oobmark; a read can not go past
+	 * the oobmark.
+	 */
+	if (max_read == INFPSZ || max_read > uiop->uio_resid)
+		max_read = uiop->uio_resid;
+
+	for (;;) {
+		n = MIN(max_read, MBLKL(mp));
+		if (n != 0) {
+			ASSERT(n > 0);
+
+			rv = uiomove(mp->b_rptr, n, UIO_READ, uiop);
+			if (rv != 0) {
+				freemsg(mp);
+				*errorp = rv;
+				return (NULL);
+			}
+		}
+
+		mp->b_rptr += n;
+		max_read -= n;
+
+		/* drop every mblk that has no data left in it */
+		while (mp != NULL && (mp->b_rptr >= mp->b_wptr)) {
+			spent = mp;
+			mp = mp->b_cont;
+			freeb(spent);
+		}
+
+		if (mp == NULL || max_read <= 0)
+			break;
+	}
+
+	*errorp = 0;
+	return (mp);
+}
+
+/*
+ * Put mp back at the front of the processing queue (so_rcv_q_head).
+ * last_tail is stored in mp->b_prev, the field in which the head of a
+ * queued message keeps the pointer to its last mblk.
+ */
+static void
+so_prepend_msg(struct sonode *so, mblk_t *mp, mblk_t *last_tail)
+{
+	ASSERT(last_tail != NULL);
+	mp->b_next = so->so_rcv_q_head;
+	mp->b_prev = last_tail;
+	ASSERT(!(DB_FLAGS(mp) & DBLK_UIOA));
+
+	if (so->so_rcv_q_head == NULL) {
+		/* The queue was empty, so mp is its last message too. */
+		ASSERT(so->so_rcv_q_last_head == NULL);
+		so->so_rcv_q_last_head = mp;
+	} else {
+		ASSERT(!(DB_FLAGS(so->so_rcv_q_head) & DBLK_UIOA));
+	}
+	so->so_rcv_q_head = mp;
+
+#ifdef DEBUG
+	if (so_debug_length) {
+		mutex_enter(&so->so_lock);
+		ASSERT(so_check_length(so));
+		mutex_exit(&so->so_lock);
+	}
+#endif
+}
+
+/*
+ * Move a batch of newly arrived messages onto the end of the processing
+ * queue (so_rcv_q_head / so_rcv_q_last_head).  mp_head is the first
+ * message of the batch and mp_last_head the last; messages are linked
+ * through b_next and each head's b_prev points at its last mblk.
+ *
+ * Where possible the first new message is merged (through b_cont) into
+ * the last queued one instead of being linked in as a message of its
+ * own.
+ */
+static void
+process_new_message(struct sonode *so, mblk_t *mp_head, mblk_t *mp_last_head)
+{
+	mblk_t *qlast;
+	boolean_t same_uioa;
+
+	ASSERT(mp_head->b_prev != NULL);
+
+	if (so->so_rcv_q_head == NULL) {
+		/* Nothing queued yet: the batch becomes the queue. */
+		so->so_rcv_q_head = mp_head;
+		so->so_rcv_q_last_head = mp_last_head;
+		ASSERT(so->so_rcv_q_last_head->b_prev != NULL);
+		return;
+	}
+
+	qlast = so->so_rcv_q_last_head;
+	same_uioa = ((DB_FLAGS(mp_head) & DBLK_UIOA) ==
+	    (DB_FLAGS(qlast) & DBLK_UIOA));
+
+	if (mp_head->b_next == NULL && DB_TYPE(mp_head) == M_DATA &&
+	    DB_TYPE(qlast) == M_DATA && same_uioa) {
+		/* One M_DATA message: chain it onto the last M_DATA one. */
+		qlast->b_prev->b_cont = mp_head;
+		qlast->b_prev = mp_head->b_prev;
+		mp_head->b_prev = NULL;
+		return;
+	}
+
+	if (same_uioa && (DB_FLAGS(mp_head) & DBLK_UIOA)) {
+		/*
+		 * Several messages, and both mp_head and the last queued
+		 * message are I/OAT mblks: merge mp_head into the last
+		 * queued message and link in the rest of the batch.
+		 */
+		ASSERT(mp_head->b_next != NULL);
+		qlast->b_prev->b_cont = mp_head;
+		qlast->b_prev = mp_head->b_prev;
+		mp_head->b_prev = NULL;
+
+		qlast->b_next = mp_head->b_next;
+		mp_head->b_next = NULL;
+		so->so_rcv_q_last_head = mp_last_head;
+		return;
+	}
+
+#ifdef DEBUG
+	{
+		mblk_t *chk;
+
+		for (chk = mp_head; chk != NULL; chk = chk->b_next)
+			ASSERT(chk->b_prev != NULL);
+	}
+#endif
+	/* No merge possible: link the batch in as separate messages. */
+	qlast->b_next = mp_head;
+	so->so_rcv_q_last_head = mp_last_head;
+}
+
+/*
+ * so_dequeue_msg - take the next message off the receive queue and copy
+ * its data out through uiop, sleeping for data if none is queued.
+ *
+ * Arguments:
+ *   so    - socket
+ *   mctlp - value-return: the leading M_PROTO/M_PCPROTO part of the
+ *           message, or NULL if the message was plain M_DATA
+ *   uiop  - destination for the data
+ *   rvalp - value-return: r_val1 has MOREDATA set if data blocks of the
+ *           message were left over after the copy
+ *   flags - MSG_* flags; MSG_PEEK, MSG_TRUNC, MSG_DUPCTRL and
+ *           MSG_DONTWAIT are looked at here
+ *
+ * Returns 0 on success (also at end of file, when nothing is copied),
+ * otherwise an errno value: a pending socket error, EWOULDBLOCK for a
+ * non-blocking read with no data, EINTR if the socket is closing or
+ * falling back or the wait was interrupted, EAGAIN if the receive
+ * timeout (so_rcvtimeo) expired, or the error from copying out or from
+ * waiting for buffers.
+ *
+ * There can only be one reader at a time, which is why so_lock is not
+ * held while the processing queue (so_rcv_q_head) is manipulated.
+ */
+int
+so_dequeue_msg(struct sonode *so, mblk_t **mctlp, struct uio *uiop,
+    rval_t *rvalp, int flags)
+{
+	mblk_t *mp, *nmp;
+	mblk_t *savemp, *savemptail;
+	mblk_t *new_msg_head;
+	mblk_t *new_msg_last_head;
+	mblk_t *last_tail;
+	boolean_t partial_read;
+	boolean_t reset_atmark = B_FALSE;
+	int more = 0;
+	int error;
+	ssize_t oobmark;
+	sodirect_t *sodp = so->so_direct;
+
+	partial_read = B_FALSE;
+	*mctlp = NULL;
+again:
+	mutex_enter(&so->so_lock);
+again1:
+#ifdef DEBUG
+	if (so_debug_length) {
+		ASSERT(so_check_length(so));
+	}
+#endif
+	/*
+	 * First move messages from the dump area to processing area
+	 */
+	if (sodp != NULL) {
+		/* No need to grab sod_lockp since it points to so_lock */
+		if (sodp->sod_state & SOD_ENABLED) {
+			ASSERT(sodp->sod_lockp == &so->so_lock);
+
+			if (sodp->sod_uioa.uioa_state & UIOA_ALLOC) {
+				/* nothing to uioamove */
+				sodp = NULL;
+			} else if (sodp->sod_uioa.uioa_state & UIOA_INIT) {
+				sodp->sod_uioa.uioa_state &= UIOA_CLR;
+				sodp->sod_uioa.uioa_state |= UIOA_ENABLED;
+				/*
+				 * try to uioamove() the data that
+				 * has already queued.
+				 */
+				sod_uioa_so_init(so, sodp, uiop);
+			}
+		} else {
+			sodp = NULL;
+		}
+	}
+	new_msg_head = so->so_rcv_head;
+	new_msg_last_head = so->so_rcv_last_head;
+	so->so_rcv_head = NULL;
+	so->so_rcv_last_head = NULL;
+	oobmark = so->so_oobmark;
+	/*
+	 * We can release the lock as there can only be one reader
+	 */
+	mutex_exit(&so->so_lock);
+
+	/* Remember that the read started at the mark; see the end. */
+	if (so->so_state & SS_RCVATMARK) {
+		reset_atmark = B_TRUE;
+	}
+	if (new_msg_head != NULL) {
+		process_new_message(so, new_msg_head, new_msg_last_head);
+	}
+	savemp = savemptail = NULL;
+	rvalp->r_val1 = 0;
+	error = 0;
+	mp = so->so_rcv_q_head;
+
+	if (mp != NULL &&
+	    (so->so_rcv_timer_tid == 0 ||
+	    so->so_rcv_queued >= so->so_rcv_thresh)) {
+		partial_read = B_FALSE;
+
+		if (flags & MSG_PEEK) {
+			/*
+			 * Peeking works on a copy; the original stays on
+			 * the queue.
+			 */
+			if ((nmp = dupmsg(mp)) == NULL &&
+			    (nmp = copymsg(mp)) == NULL) {
+				size_t size = msgsize(mp);
+
+				error = strwaitbuf(size, BPRI_HI);
+				if (error) {
+					return (error);
+				}
+				goto again;
+			}
+			mp = nmp;
+		} else {
+			/* Unlink the first message from the queue. */
+			ASSERT(mp->b_prev != NULL);
+			last_tail = mp->b_prev;
+			mp->b_prev = NULL;
+			so->so_rcv_q_head = mp->b_next;
+			if (so->so_rcv_q_head == NULL) {
+				so->so_rcv_q_last_head = NULL;
+			}
+			mp->b_next = NULL;
+		}
+
+		ASSERT(mctlp != NULL);
+		/*
+		 * First process PROTO or PCPROTO blocks, if any.
+		 */
+		if (DB_TYPE(mp) != M_DATA) {
+			*mctlp = mp;
+			savemp = mp;
+			savemptail = mp;
+			ASSERT(DB_TYPE(mp) == M_PROTO ||
+			    DB_TYPE(mp) == M_PCPROTO);
+			while (mp->b_cont != NULL &&
+			    DB_TYPE(mp->b_cont) != M_DATA) {
+				ASSERT(DB_TYPE(mp->b_cont) == M_PROTO ||
+				    DB_TYPE(mp->b_cont) == M_PCPROTO);
+				mp = mp->b_cont;
+				savemptail = mp;
+			}
+			/* Cut the control part off from the data part. */
+			mp = savemptail->b_cont;
+			savemptail->b_cont = NULL;
+		}
+
+		/*
+		 * NOTE(review): the code below dereferences mp, so a data
+		 * block is assumed to follow every control part -- confirm
+		 * that all producers guarantee this.
+		 */
+		ASSERT(DB_TYPE(mp) == M_DATA);
+		/*
+		 * Now process DATA blocks, if any. Note that for sodirect
+		 * enabled socket, uio_resid can be 0.
+		 */
+		if (uiop->uio_resid >= 0) {
+			ssize_t copied = 0;
+
+			if (sodp != NULL && (DB_FLAGS(mp) & DBLK_UIOA)) {
+				mutex_enter(sodp->sod_lockp);
+				ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
+				copied = sod_uioa_mblk(so, mp);
+				if (copied > 0)
+					partial_read = B_TRUE;
+				mutex_exit(sodp->sod_lockp);
+				/* mark this mblk as processed */
+				mp = NULL;
+			} else {
+				ssize_t oldresid = uiop->uio_resid;
+
+				if (MBLKL(mp) < so_mblk_pull_len) {
+					/*
+					 * After a successful pullup the
+					 * message is a single mblk, which
+					 * is therefore also its tail.
+					 */
+					if (pullupmsg(mp, -1) == 1) {
+						last_tail = mp;
+					}
+				}
+				/*
+				 * Can not read beyond the oobmark
+				 */
+				mp = socopyoutuio(mp, uiop,
+				    oobmark == 0 ? INFPSZ : oobmark, &error);
+				if (error != 0) {
+					freemsg(*mctlp);
+					*mctlp = NULL;
+					more = 0;
+					goto done;
+				}
+				ASSERT(oldresid >= uiop->uio_resid);
+				copied = oldresid - uiop->uio_resid;
+				if (oldresid > uiop->uio_resid)
+					partial_read = B_TRUE;
+			}
+			ASSERT(copied >= 0);
+			if (copied > 0 && !(flags & MSG_PEEK)) {
+				mutex_enter(&so->so_lock);
+				so->so_rcv_queued -= copied;
+				ASSERT(so->so_oobmark >= 0);
+				if (so->so_oobmark > 0) {
+					so->so_oobmark -= copied;
+					ASSERT(so->so_oobmark >= 0);
+					if (so->so_oobmark == 0) {
+						ASSERT(so->so_state &
+						    SS_OOBPEND);
+						so->so_oobmark = 0;
+						so->so_state |= SS_RCVATMARK;
+					}
+				}
+				if (so->so_flowctrld && so->so_rcv_queued <
+				    so->so_rcvlowat) {
+					so->so_flowctrld = B_FALSE;
+					mutex_exit(&so->so_lock);
+					/*
+					 * open up flow control
+					 */
+					(*so->so_downcalls->sd_clr_flowctrl)
+					    (so->so_proto_handle);
+				} else {
+					mutex_exit(&so->so_lock);
+				}
+			}
+		}
+		if (mp != NULL) { /* more data blocks in msg */
+			more |= MOREDATA;
+			if ((flags & (MSG_PEEK|MSG_TRUNC))) {
+				/*
+				 * Peeked copies and truncated remainders
+				 * are discarded; only the latter were
+				 * accounted for in so_rcv_queued.
+				 */
+				if (flags & MSG_TRUNC) {
+					mutex_enter(&so->so_lock);
+					so->so_rcv_queued -= msgdsize(mp);
+					mutex_exit(&so->so_lock);
+				}
+				freemsg(mp);
+			} else if (partial_read && !somsghasdata(mp)) {
+				/*
+				 * Avoid queuing a zero-length tail part of
+				 * a message. partial_read == 1 indicates that
+				 * we read some of the message.
+				 */
+				freemsg(mp);
+				more &= ~MOREDATA;
+			} else {
+				if (savemp != NULL &&
+				    (flags & MSG_DUPCTRL)) {
+					mblk_t *nmp;
+					/*
+					 * There should only be non data mblks
+					 */
+					ASSERT(DB_TYPE(savemp) != M_DATA &&
+					    DB_TYPE(savemptail) != M_DATA);
+try_again:
+					if ((nmp = dupmsg(savemp)) == NULL &&
+					    (nmp = copymsg(savemp)) == NULL) {
+
+						size_t size = msgsize(savemp);
+
+						error = strwaitbuf(size,
+						    BPRI_HI);
+						if (error != 0) {
+							/*
+							 * In case we
+							 * cannot copy
+							 * control data
+							 * free the remaining
+							 * data.
+							 */
+							freemsg(mp);
+							goto done;
+						}
+						goto try_again;
+					}
+
+					ASSERT(nmp != NULL);
+					ASSERT(DB_TYPE(nmp) != M_DATA);
+					/*
+					 * The caller gets the copy; the
+					 * original control part goes back
+					 * on the queue with the data.
+					 */
+					savemptail->b_cont = mp;
+					*mctlp = nmp;
+					mp = savemp;
+				}
+				/*
+				 * putback mp
+				 */
+				so_prepend_msg(so, mp, last_tail);
+			}
+		}
+
+		/* fast check so_rcv_head if there is more data */
+		if (partial_read && !(so->so_state & SS_RCVATMARK) &&
+		    *mctlp == NULL && uiop->uio_resid > 0 &&
+		    !(flags & MSG_PEEK) && so->so_rcv_head != NULL) {
+			goto again;
+		}
+	} else if (!partial_read) {
+		mutex_enter(&so->so_lock);
+		if (so->so_error != 0) {
+			error = sogeterr(so, !(flags & MSG_PEEK));
+			mutex_exit(&so->so_lock);
+			return (error);
+		}
+		/*
+		 * No pending data. Return right away for nonblocking
+		 * socket, otherwise sleep waiting for data.
+		 */
+		if (!(so->so_state & SS_CANTRCVMORE)) {
+			if ((uiop->uio_fmode & (FNDELAY|FNONBLOCK)) ||
+			    (flags & MSG_DONTWAIT)) {
+				error = EWOULDBLOCK;
+			} else {
+				if (so->so_state & (SS_CLOSING |
+				    SS_FALLBACK_PENDING)) {
+					mutex_exit(&so->so_lock);
+					error = EINTR;
+					goto done;
+				}
+
+				/* Data arrived while the lock was dropped */
+				if (so->so_rcv_head != NULL) {
+					goto again1;
+				}
+				so->so_rcv_wakeup = B_TRUE;
+				so->so_rcv_wanted = uiop->uio_resid;
+				if (so->so_rcvtimeo == 0) {
+					/*
+					 * Zero means disable timeout.
+					 */
+					error = cv_wait_sig(&so->so_rcv_cv,
+					    &so->so_lock);
+				} else {
+					clock_t now;
+					time_to_wait(&now, so->so_rcvtimeo);
+					error = cv_timedwait_sig(&so->so_rcv_cv,
+					    &so->so_lock, now);
+				}
+				so->so_rcv_wakeup = B_FALSE;
+				so->so_rcv_wanted = 0;
+
+				/*
+				 * 0 means a signal arrived, -1 that the
+				 * timeout expired (reported as EAGAIN, the
+				 * error POSIX specifies for SO_RCVTIMEO);
+				 * anything else is a wakeup, so look at
+				 * the queue again.
+				 */
+				if (error == 0) {
+					error = EINTR;
+				} else if (error == -1) {
+					error = EAGAIN;
+				} else {
+					goto again1;
+				}
+			}
+		}
+		mutex_exit(&so->so_lock);
+	}
+	if (reset_atmark && partial_read && !(flags & MSG_PEEK)) {
+		/*
+		 * We are past the mark, update state
+		 * 4.3BSD and 4.4BSD clears the mark when peeking across it.
+		 * The draft Posix socket spec states that the mark should
+		 * not be cleared when peeking. We follow the latter.
+		 */
+		mutex_enter(&so->so_lock);
+		ASSERT(so_verify_oobstate(so));
+		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
+		freemsg(so->so_oobmsg);
+		so->so_oobmsg = NULL;
+		ASSERT(so_verify_oobstate(so));
+		mutex_exit(&so->so_lock);
+	}
+	ASSERT(so->so_rcv_wakeup == B_FALSE);
+done:
+	if (sodp != NULL) {
+		mutex_enter(sodp->sod_lockp);
+		if ((sodp->sod_state & SOD_ENABLED) &&
+		    (sodp->sod_uioa.uioa_state & UIOA_ENABLED)) {
+			SOD_UIOAFINI(sodp);
+			if (sodp->sod_uioa.uioa_mbytes > 0) {
+				ASSERT(so->so_rcv_q_head != NULL ||
+				    so->so_rcv_head != NULL);
+				so->so_rcv_queued -= sod_uioa_mblk(so, NULL);
+				/* Data was moved, so this is no failure. */
+				if (error == EWOULDBLOCK)
+					error = 0;
+			}
+		}
+		mutex_exit(sodp->sod_lockp);
+	}
+#ifdef DEBUG
+	if (so_debug_length) {
+		mutex_enter(&so->so_lock);
+		ASSERT(so_check_length(so));
+		mutex_exit(&so->so_lock);
+	}
+#endif
+	rvalp->r_val1 = more;
+	return (error);
+}
+
+/*
+ * Add a message of msg_size bytes to the socket's incoming queue
+ * (so_rcv_head / so_rcv_last_head) and account for it in so_rcv_queued.
+ * The caller must hold so_lock.
+ *
+ * M_DATA is chained (through b_cont) onto a preceding M_DATA message
+ * when both agree on DBLK_UIOA; anything else starts a new message
+ * linked through b_next.  The b_prev field of the last message head is
+ * kept pointing at the final mblk of that message.
+ */
+void
+so_enqueue_msg(struct sonode *so, mblk_t *mp, size_t msg_size)
+{
+	mblk_t *tail;
+
+	ASSERT(MUTEX_HELD(&so->so_lock));
+
+#ifdef DEBUG
+	if (so_debug_length) {
+		ASSERT(so_check_length(so));
+	}
+#endif
+	so->so_rcv_queued += msg_size;
+
+	if (so->so_rcv_head == NULL) {
+		ASSERT(so->so_rcv_last_head == NULL);
+		so->so_rcv_head = mp;
+		so->so_rcv_last_head = mp;
+	} else {
+		mblk_t *last = so->so_rcv_last_head;
+		boolean_t both_data = (DB_TYPE(mp) == M_DATA &&
+		    DB_TYPE(last) == M_DATA);
+		boolean_t same_uioa = ((DB_FLAGS(mp) & DBLK_UIOA) ==
+		    (DB_FLAGS(last) & DBLK_UIOA));
+
+		if (both_data && same_uioa) {
+			/* Grow the last message */
+			ASSERT(so->so_rcv_last_head != NULL);
+			ASSERT(so->so_rcv_last_head->b_prev != NULL);
+			last->b_prev->b_cont = mp;
+		} else {
+			/* Begin a new last message */
+			last->b_next = mp;
+			so->so_rcv_last_head = mp;
+		}
+	}
+
+	/* Find the final mblk of what was added and record it. */
+	for (tail = mp; tail->b_cont != NULL; tail = tail->b_cont)
+		;
+	so->so_rcv_last_head->b_prev = tail;
+#ifdef DEBUG
+	if (so_debug_length) {
+		ASSERT(so_check_length(so));
+	}
+#endif
+}
+
+/*
+ * Walk the b_cont chain of `mp' and report whether any M_DATA block
+ * holds at least one byte (b_wptr beyond b_rptr).
+ *
+ * Returns B_TRUE if such a block exists, B_FALSE otherwise (including
+ * when mp is NULL).
+ */
+boolean_t
+somsghasdata(mblk_t *mp)
+{
+	while (mp != NULL) {
+		if (mp->b_datap->db_type == M_DATA) {
+			ASSERT(mp->b_wptr >= mp->b_rptr);
+			if (mp->b_wptr > mp->b_rptr)
+				return (B_TRUE);
+		}
+		mp = mp->b_cont;
+	}
+	return (B_FALSE);
+}
+
+/*
+ * Flush the read side of sockfs.
+ *
+ * Drops any pending out-of-band message (and the related state bits and
+ * mark), frees every message on both receive lists (so_rcv_q_head and
+ * so_rcv_head) and resets the receive accounting.
+ *
+ * The caller must hold so_lock and must be sure that a reader is not
+ * already active when the buffer is being flushed.
+ */
+void
+so_rcv_flush(struct sonode *so)
+{
+	mblk_t *msg;
+
+	ASSERT(MUTEX_HELD(&so->so_lock));
+
+	if (so->so_oobmsg != NULL) {
+		freemsg(so->so_oobmsg);
+		so->so_oobmsg = NULL;
+		so->so_oobmark = 0;
+		so->so_state &=
+		    ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA|SS_RCVATMARK);
+	}
+
+	/*
+	 * Unlink and free each message; b_next/b_prev are cleared first
+	 * so that freemsg() is handed a stand-alone message.
+	 */
+	while ((msg = so->so_rcv_q_head) != NULL) {
+		so->so_rcv_q_head = msg->b_next;
+		msg->b_next = NULL;
+		msg->b_prev = NULL;
+		freemsg(msg);
+	}
+	while ((msg = so->so_rcv_head) != NULL) {
+		so->so_rcv_head = msg->b_next;
+		msg->b_next = NULL;
+		msg->b_prev = NULL;
+		freemsg(msg);
+	}
+
+	so->so_rcv_queued = 0;
+	so->so_rcv_q_head = NULL;
+	so->so_rcv_q_last_head = NULL;
+	so->so_rcv_head = NULL;
+	so->so_rcv_last_head = NULL;
+}
+
+/*
+ * Handle recv* calls that set MSG_OOB or MSG_OOB together with MSG_PEEK.
+ *
+ * Copies the pending out-of-band message (so_oobmsg) into the caller's
+ * uio. With MSG_PEEK a private copy is made and so_oobmsg stays on the
+ * sonode; otherwise so_oobmsg is detached from the sonode and consumed.
+ * If msg is not NULL its control length, name length and flags are
+ * zeroed.
+ *
+ * Returns:
+ * EINVAL - oob_inline is set, or the state is anything other than
+ * "SS_OOBPEND set and SS_HADOOBDATA clear"
+ * EWOULDBLOCK - SS_HAVEOOBDATA is not set
+ * otherwise 0, or the error returned by uiomove()
+ */
+int
+sorecvoob(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, int flags,
+ boolean_t oob_inline)
+{
+ mblk_t *mp, *nmp;
+ int error;
+
+ dprintso(so, 1, ("sorecvoob(%p, %p, 0x%x)\n", (void *)so, (void *)msg,
+ flags));
+
+ if (msg != NULL) {
+ /*
+ * There is never any oob data with addresses or control since
+ * the T_EXDATA_IND does not carry any options.
+ */
+ msg->msg_controllen = 0;
+ msg->msg_namelen = 0;
+ msg->msg_flags = 0;
+ }
+
+ mutex_enter(&so->so_lock);
+ ASSERT(so_verify_oobstate(so));
+ if (oob_inline ||
+ (so->so_state & (SS_OOBPEND|SS_HADOOBDATA)) != SS_OOBPEND) {
+ dprintso(so, 1, ("sorecvoob: inline or data consumed\n"));
+ mutex_exit(&so->so_lock);
+ return (EINVAL);
+ }
+ if (!(so->so_state & SS_HAVEOOBDATA)) {
+ dprintso(so, 1, ("sorecvoob: no data yet\n"));
+ mutex_exit(&so->so_lock);
+ return (EWOULDBLOCK);
+ }
+ ASSERT(so->so_oobmsg != NULL);
+ mp = so->so_oobmsg;
+ if (flags & MSG_PEEK) {
+ /*
+ * Since recv* can not return ENOBUFS we can not use dupmsg.
+ * Instead we revert to the consolidation private
+ * allocb_wait plus bcopy.
+ */
+ mblk_t *mp1;
+
+ mp1 = allocb_wait(msgdsize(mp), BPRI_MED, STR_NOSIG, NULL);
+ ASSERT(mp1);
+
+ /* Flatten the whole b_cont chain into the single block mp1. */
+ while (mp != NULL) {
+ ssize_t size;
+
+ size = MBLKL(mp);
+ bcopy(mp->b_rptr, mp1->b_wptr, size);
+ mp1->b_wptr += size;
+ ASSERT(mp1->b_wptr <= mp1->b_datap->db_lim);
+ mp = mp->b_cont;
+ }
+ /* From here on mp refers to the private copy. */
+ mp = mp1;
+ } else {
+ /*
+ * Update the state indicating that the data has been consumed.
+ * Keep SS_OOBPEND set until data is consumed past the mark.
+ *
+ * The checks above guarantee SS_HAVEOOBDATA is set and
+ * SS_HADOOBDATA is clear, so the XOR clears the former and
+ * sets the latter.
+ */
+ so->so_oobmsg = NULL;
+ so->so_state ^= SS_HAVEOOBDATA|SS_HADOOBDATA;
+ }
+ ASSERT(so_verify_oobstate(so));
+ mutex_exit(&so->so_lock);
+
+ /* Copy out block by block until the uio is full or data runs out. */
+ error = 0;
+ nmp = mp;
+ while (nmp != NULL && uiop->uio_resid > 0) {
+ ssize_t n = MBLKL(nmp);
+
+ n = MIN(n, uiop->uio_resid);
+ if (n > 0)
+ error = uiomove(nmp->b_rptr, n,
+ UIO_READ, uiop);
+ if (error)
+ break;
+ nmp = nmp->b_cont;
+ }
+ ASSERT(mp->b_next == NULL && mp->b_prev == NULL);
+ /* mp is either the peek copy or the message detached above. */
+ freemsg(mp);
+ return (error);
+}
+
+/*
+ * Allocate and initialize a sonode.
+ *
+ * The upcall/downcall versions advertised by the socket module in `sp'
+ * must match SOCK_UC_VERSION/SOCK_DC_VERSION. The sonode is allocated
+ * from socket_cache, sleeping unless SOCKET_NOSLEEP is set in sflags.
+ *
+ * Returns the new sonode, or NULL with *errorp set to EINVAL (version
+ * mismatch) or ENOMEM (allocation failure).
+ */
+/* ARGSUSED */
+struct sonode *
+socket_sonode_create(struct sockparams *sp, int family, int type,
+    int protocol, int version, int sflags, int *errorp, struct cred *cr)
+{
+	sonode_t *so;
+	int kmflags;
+
+	/*
+	 * Choose the right set of sonodeops based on the upcall and
+	 * down call version that the protocol has provided
+	 */
+	if (SOCK_UC_VERSION != sp->sp_smod_info->smod_uc_version ||
+	    SOCK_DC_VERSION != sp->sp_smod_info->smod_dc_version) {
+		/*
+		 * mismatch
+		 */
+#ifdef DEBUG
+		/*
+		 * CE_CONT does not append a newline, so terminate the
+		 * message explicitly.
+		 */
+		cmn_err(CE_CONT,
+		    "protocol and socket module version mismatch\n");
+#endif
+		*errorp = EINVAL;
+		return (NULL);
+	}
+
+	kmflags = (sflags & SOCKET_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
+
+	so = kmem_cache_alloc(socket_cache, kmflags);
+	if (so == NULL) {
+		*errorp = ENOMEM;
+		return (NULL);
+	}
+
+	sonode_init(so, sp, family, type, protocol, &so_sonodeops);
+
+	if (version == SOV_DEFAULT)
+		version = so_default_version;
+
+	so->so_version = (short)version;
+
+	/*
+	 * Set default protocol properties: the receive water marks use
+	 * the SOCKET_RECV* defaults, maxpsz and maxblk are INFPSZ.
+	 * If a protocol desires it can change the values later.
+	 */
+	so->so_proto_props.sopp_rxhiwat = SOCKET_RECVHIWATER;
+	so->so_proto_props.sopp_rxlowat = SOCKET_RECVLOWATER;
+	so->so_proto_props.sopp_maxpsz = INFPSZ;
+	so->so_proto_props.sopp_maxblk = INFPSZ;
+
+	return (so);
+}
+
+/*
+ * Common initialization of a newly created sonode.
+ *
+ * pso != NULL (passive open): copy state, timeouts, a subset of the
+ * socket level options, the protocol properties and the mode from the
+ * listener `pso', then return 0.
+ *
+ * pso == NULL (active open): ask the socket module to create the lower
+ * protocol handle, activate it with the upcalls and, if the requested
+ * protocol differs from the one in the sockparams entry, issue a
+ * SO_PROTOTYPE setsockopt.
+ *
+ * Returns 0 on success, otherwise an errno value.
+ */
+int
+socket_init_common(struct sonode *so, struct sonode *pso, int flags, cred_t *cr)
+{
+ int error = 0;
+
+ if (pso != NULL) {
+ /*
+ * We have a passive open, so inherit basic state from
+ * the parent (listener).
+ *
+ * No need to grab the new sonode's lock, since there is no
+ * one that can have a reference to it.
+ */
+ mutex_enter(&pso->so_lock);
+
+ so->so_state |= SS_ISCONNECTED | (pso->so_state & SS_ASYNC);
+ so->so_pgrp = pso->so_pgrp;
+ so->so_rcvtimeo = pso->so_rcvtimeo;
+ so->so_sndtimeo = pso->so_sndtimeo;
+ /*
+ * Make note of the socket level options. TCP and IP level
+ * options are already inherited. We could do all this after
+ * accept is successful but doing it here simplifies code and
+ * no harm done for error case.
+ */
+ so->so_options = pso->so_options & (SO_DEBUG|SO_REUSEADDR|
+ SO_KEEPALIVE| SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
+ SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
+ so->so_proto_props = pso->so_proto_props;
+ so->so_mode = pso->so_mode;
+
+ mutex_exit(&pso->so_lock);
+
+ if (uioasync.enabled) {
+ sod_sock_init(so, NULL, NULL, NULL, &so->so_lock);
+ }
+ return (0);
+ } else {
+ struct sockparams *sp = so->so_sockparams;
+ sock_upcalls_t *upcalls_to_use;
+
+ /*
+ * Based on the version number select the right upcalls to
+ * pass down. Currently we only have one version so choose
+ * default
+ */
+ upcalls_to_use = &so_upcalls;
+
+ /* active open, so create a lower handle */
+ so->so_proto_handle =
+ sp->sp_smod_info->smod_proto_create_func(so->so_family,
+ so->so_type, so->so_protocol, &so->so_downcalls,
+ &so->so_mode, &error, flags, cr);
+
+ if (so->so_proto_handle == NULL) {
+ ASSERT(error != 0);
+ /*
+ * To be safe; if a lower handle cannot be created, and
+ * the proto does not give a reason why, assume there
+ * was a lack of memory.
+ */
+ return ((error == 0) ? ENOMEM : error);
+ }
+ ASSERT(so->so_downcalls != NULL);
+ ASSERT(so->so_downcalls->sd_send != NULL ||
+ so->so_downcalls->sd_send_uio != NULL);
+ /*
+ * A protocol that supplies sd_recv_uio must also supply
+ * sd_poll; SO_POLLEV_ALWAYS is set for such sockets.
+ */
+ if (so->so_downcalls->sd_recv_uio != NULL) {
+ ASSERT(so->so_downcalls->sd_poll != NULL);
+ so->so_pollev |= SO_POLLEV_ALWAYS;
+ }
+
+ (*so->so_downcalls->sd_activate)(so->so_proto_handle,
+ (sock_upper_handle_t)so, upcalls_to_use, 0, cr);
+
+ /* Wildcard */
+
+ /*
+ * FIXME No need for this, the protocol can deal with it in
+ * sd_create(). Should update ICMP.
+ */
+ if (so->so_protocol != so->so_sockparams->sp_protocol) {
+ int protocol = so->so_protocol;
+ /*
+ * NOTE(review): this declaration shadows the
+ * function-level `error'.
+ */
+ int error;
+ /*
+ * Issue SO_PROTOTYPE setsockopt.
+ */
+ error = socket_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
+ &protocol, (t_uscalar_t)sizeof (protocol), cr);
+ if (error) {
+ /* Undo: close the lower handle, drop queued data. */
+ (void) (*so->so_downcalls->sd_close)
+ (so->so_proto_handle, 0, cr);
+
+ mutex_enter(&so->so_lock);
+ so_rcv_flush(so);
+ mutex_exit(&so->so_lock);
+ /*
+ * Setsockopt often fails with ENOPROTOOPT but
+ * socket() should fail with
+ * EPROTONOSUPPORT/EPROTOTYPE.
+ */
+ return (EPROTONOSUPPORT);
+ }
+ }
+ return (0);
+ }
+}
+
+/*
+ * int socket_ioctl_common(struct sonode *so, int cmd, intptr_t arg, int mode,
+ * struct cred *cr, int32_t *rvalp)
+ *
+ * Handle ioctls that manipulate basic socket state; non-blocking,
+ * async, etc.
+ *
+ * Returns:
+ * < 0 - ioctl was not handled
+ * >= 0 - ioctl was handled, if > 0, then it is an errno
+ *
+ * Notes:
+ * Assumes the standard receive buffer is used to obtain info for
+ * NREAD.
+ */
+/* ARGSUSED */
+int
+socket_ioctl_common(struct sonode *so, int cmd, intptr_t arg, int mode,
+ struct cred *cr, int32_t *rvalp)
+{
+ switch (cmd) {
+ case FIONBIO: {
+ /* Set or clear SS_NDELAY according to the int32_t argument. */
+ int32_t value;
+
+ if (so_copyin((void *)arg, &value, sizeof (int32_t),
+ (mode & (int)FKIOCTL)))
+ return (EFAULT);
+
+ mutex_enter(&so->so_lock);
+ if (value) {
+ so->so_state |= SS_NDELAY;
+ } else {
+ so->so_state &= ~SS_NDELAY;
+ }
+ mutex_exit(&so->so_lock);
+ return (0);
+ }
+ case FIOASYNC: {
+ /* Set or clear SS_ASYNC according to the int32_t argument. */
+ int32_t value;
+
+ if (so_copyin((void *)arg, &value, sizeof (int32_t),
+ (mode & (int)FKIOCTL)))
+ return (EFAULT);
+
+ mutex_enter(&so->so_lock);
+
+ if (value) {
+ /* Turn on SIGIO */
+ so->so_state |= SS_ASYNC;
+ } else {
+ /* Turn off SIGIO */
+ so->so_state &= ~SS_ASYNC;
+ }
+ mutex_exit(&so->so_lock);
+
+ return (0);
+ }
+
+ case SIOCSPGRP:
+ case FIOSETOWN: {
+ int error;
+ pid_t pid;
+
+ if (so_copyin((void *)arg, &pid, sizeof (pid_t),
+ (mode & (int)FKIOCTL)))
+ return (EFAULT);
+
+ /* socket_chgpgrp() is only called when the value changes. */
+ mutex_enter(&so->so_lock);
+ error = (pid != so->so_pgrp) ? socket_chgpgrp(so, pid) : 0;
+ mutex_exit(&so->so_lock);
+ return (error);
+ }
+ case SIOCGPGRP:
+ case FIOGETOWN:
+ /* so_pgrp is read here without taking so_lock. */
+ if (so_copyout(&so->so_pgrp, (void *)arg,
+ sizeof (pid_t), (mode & (int)FKIOCTL)))
+ return (EFAULT);
+
+ return (0);
+ case SIOCATMARK: {
+ int retval;
+
+ /*
+ * Only protocols that support urgent data can handle ATMARK.
+ */
+ if ((so->so_mode & SM_EXDATA) == 0)
+ return (EINVAL);
+
+ /*
+ * If the protocol is maintaining its own buffer, then the
+ * request must be passed down.
+ */
+ if (so->so_downcalls->sd_recv_uio != NULL)
+ return (-1);
+
+ retval = (so->so_state & SS_RCVATMARK) != 0;
+
+ if (so_copyout(&retval, (void *)arg, sizeof (int),
+ (mode & (int)FKIOCTL))) {
+ return (EFAULT);
+ }
+ return (0);
+ }
+
+ case FIONREAD: {
+ int retval;
+
+ /*
+ * If the protocol is maintaining its own buffer, then the
+ * request must be passed down.
+ */
+ if (so->so_downcalls->sd_recv_uio != NULL)
+ return (-1);
+
+ /* Clamp the queued byte count so that it fits in an int. */
+ retval = MIN(so->so_rcv_queued, INT_MAX);
+
+ if (so_copyout(&retval, (void *)arg,
+ sizeof (retval), (mode & (int)FKIOCTL))) {
+ return (EFAULT);
+ }
+ return (0);
+ }
+
+ case _I_GETPEERCRED: {
+ int error = 0;
+
+ /*
+ * Requests without FKIOCTL are rejected; arg is dereferenced
+ * directly below rather than going through so_copyout().
+ */
+ if ((mode & FKIOCTL) == 0)
+ return (EINVAL);
+
+ mutex_enter(&so->so_lock);
+ if ((so->so_mode & SM_CONNREQUIRED) == 0) {
+ error = ENOTSUP;
+ } else if ((so->so_state & SS_ISCONNECTED) == 0) {
+ error = ENOTCONN;
+ } else if (so->so_peercred != NULL) {
+ k_peercred_t *kp = (k_peercred_t *)arg;
+ kp->pc_cr = so->so_peercred;
+ kp->pc_cpid = so->so_cpid;
+ /* The credential handed back carries its own hold. */
+ crhold(so->so_peercred);
+ } else {
+ error = EINVAL;
+ }
+ mutex_exit(&so->so_lock);
+ return (error);
+ }
+ default:
+ return (-1);
+ }
+}
+
+/*
+ * Process STREAMS related ioctls. An I_PUSH or I_POP request makes the
+ * socket fall back to TPI, after which the ioctl is reissued through
+ * SOP_IOCTL(). I_LOOK reports the module name "sockmod". _I_INSERT,
+ * _I_REMOVE, I_FIND and I_LIST are refused with EOPNOTSUPP.
+ *
+ * Returns:
+ * < 0 - ioctl was not handled
+ * >= 0 - ioctl was handled, if > 0, then it is an errno
+ */
+int
+socket_strioc_common(struct sonode *so, int cmd, intptr_t arg, int mode,
+    struct cred *cr, int32_t *rvalp)
+{
+	int error;
+
+	switch (cmd) {
+	case I_PUSH:
+	case I_POP:
+		error = so_tpi_fallback(so, cr);
+		if (error != 0)
+			return (error);
+
+		/* Fallback succeeded: reissue the ioctl */
+		ASSERT(so->so_rcv_q_head == NULL);
+		return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
+
+	case I_LOOK:
+		error = so_copyout("sockmod", (void *)arg,
+		    strlen("sockmod") + 1, (mode & (int)FKIOCTL));
+		return ((error != 0) ? EFAULT : 0);
+
+	case _I_INSERT:
+	case _I_REMOVE:
+	case I_FIND:
+	case I_LIST:
+		return (EOPNOTSUPP);
+
+	default:
+		break;
+	}
+
+	/* Not a STREAMS ioctl dealt with here */
+	return (-1);
+}
+
+/*
+ * Handle the SOL_SOCKET level options that can be answered from the
+ * sonode itself.
+ *
+ * Returns:
+ * 0 - the value was stored in optval and *optlenp was updated
+ * EINVAL - the supplied buffer (*optlenp) is too small for the option
+ * -1 - not answered here: the level is not SOL_SOCKET, the option is
+ * unknown, or only the buffer length was validated
+ */
+int
+socket_getopt_common(struct sonode *so, int level, int option_name,
+ void *optval, socklen_t *optlenp)
+{
+ if (level != SOL_SOCKET)
+ return (-1);
+
+ switch (option_name) {
+ case SO_ERROR:
+ case SO_DOMAIN:
+ case SO_TYPE:
+ case SO_ACCEPTCONN: {
+ int32_t value;
+ socklen_t optlen = *optlenp;
+
+ if (optlen < (t_uscalar_t)sizeof (int32_t)) {
+ return (EINVAL);
+ }
+
+ /* The outer case labels ensure one of these always matches. */
+ switch (option_name) {
+ case SO_ERROR:
+ mutex_enter(&so->so_lock);
+ value = sogeterr(so, B_TRUE);
+ mutex_exit(&so->so_lock);
+ break;
+ case SO_DOMAIN:
+ value = so->so_family;
+ break;
+ case SO_TYPE:
+ value = so->so_type;
+ break;
+ case SO_ACCEPTCONN:
+ if (so->so_state & SS_ACCEPTCONN)
+ value = SO_ACCEPTCONN;
+ else
+ value = 0;
+ break;
+ }
+
+ bcopy(&value, optval, sizeof (value));
+ *optlenp = sizeof (value);
+
+ return (0);
+ }
+ case SO_SNDTIMEO:
+ case SO_RCVTIMEO: {
+ clock_t value;
+ socklen_t optlen = *optlenp;
+
+ if (optlen < (t_uscalar_t)sizeof (struct timeval)) {
+ return (EINVAL);
+ }
+ /* Convert the stored timeout to microseconds, then split it. */
+ if (option_name == SO_RCVTIMEO)
+ value = drv_hztousec(so->so_rcvtimeo);
+ else
+ value = drv_hztousec(so->so_sndtimeo);
+ ((struct timeval *)(optval))->tv_sec = value / (1000 * 1000);
+ ((struct timeval *)(optval))->tv_usec = value % (1000 * 1000);
+ *optlenp = sizeof (struct timeval);
+ return (0);
+ }
+ /*
+ * For the options below only the buffer length is checked; the
+ * function then returns -1 without filling in optval.
+ */
+ case SO_DEBUG:
+ case SO_REUSEADDR:
+ case SO_KEEPALIVE:
+ case SO_DONTROUTE:
+ case SO_BROADCAST:
+ case SO_USELOOPBACK:
+ case SO_OOBINLINE:
+ case SO_SNDBUF:
+ case SO_RCVBUF:
+#ifdef notyet
+ case SO_SNDLOWAT:
+ case SO_RCVLOWAT:
+#endif /* notyet */
+ case SO_DGRAM_ERRIND: {
+ socklen_t optlen = *optlenp;
+
+ if (optlen < (t_uscalar_t)sizeof (int32_t))
+ return (EINVAL);
+ break;
+ }
+ case SO_LINGER: {
+ socklen_t optlen = *optlenp;
+
+ if (optlen < (t_uscalar_t)sizeof (struct linger))
+ return (EINVAL);
+ break;
+ }
+ case SO_SND_BUFINFO: {
+ socklen_t optlen = *optlenp;
+
+ if (optlen < (t_uscalar_t)sizeof (struct so_snd_bufinfo))
+ return (EINVAL);
+ /* Report the send side protocol properties of the sonode. */
+ ((struct so_snd_bufinfo *)(optval))->sbi_wroff =
+ (so->so_proto_props).sopp_wroff;
+ ((struct so_snd_bufinfo *)(optval))->sbi_maxblk =
+ (so->so_proto_props).sopp_maxblk;
+ ((struct so_snd_bufinfo *)(optval))->sbi_maxpsz =
+ (so->so_proto_props).sopp_maxpsz;
+ ((struct so_snd_bufinfo *)(optval))->sbi_tail =
+ (so->so_proto_props).sopp_tail;
+ *optlenp = sizeof (struct so_snd_bufinfo);
+ return (0);
+ }
+ default:
+ break;
+ }
+
+ /* Unknown Option */
+ return (-1);
+}
+
+/*
+ * Tear down a sonode: run sonode_fini() on it and hand the memory back
+ * to socket_cache.
+ */
+void
+socket_sonode_destroy(struct sonode *so)
+{
+ sonode_fini(so);
+ kmem_cache_free(socket_cache, so);
+}
+
+/*
+ * Sleep on so_copy_cv until STZCNOTIFY shows up in so_copyflag.
+ *
+ * Returns 0 once the flag has been seen, or EINTR if the socket is
+ * closing or cv_wait_sig() returned 0. STZCNOTIFY is cleared on every
+ * path except the SS_CLOSING one.
+ */
+int
+so_zcopy_wait(struct sonode *so)
+{
+	int rv = 0;
+
+	mutex_enter(&so->so_lock);
+	for (;;) {
+		if (so->so_copyflag & STZCNOTIFY)
+			break;
+
+		if (so->so_state & SS_CLOSING) {
+			/* Leave so_copyflag untouched on this path */
+			mutex_exit(&so->so_lock);
+			return (EINTR);
+		}
+
+		if (cv_wait_sig(&so->so_copy_cv, &so->so_lock) == 0) {
+			rv = EINTR;
+			break;
+		}
+	}
+	so->so_copyflag &= ~STZCNOTIFY;
+	mutex_exit(&so->so_lock);
+
+	return (rv);
+}
+
+/*
+ * Receive timer callback; arg is the sonode.
+ *
+ * Clears so_rcv_timer_tid and, if data is queued, reports the queued
+ * byte count through so_notify_data().
+ */
+void
+so_timer_callback(void *arg)
+{
+ struct sonode *so = (struct sonode *)arg;
+
+ mutex_enter(&so->so_lock);
+
+ so->so_rcv_timer_tid = 0;
+ if (so->so_rcv_queued > 0) {
+ /*
+ * NOTE(review): no mutex_exit() on this path; presumably
+ * so_notify_data() drops so_lock itself - confirm.
+ */
+ so_notify_data(so, so->so_rcv_queued);
+ } else {
+ mutex_exit(&so->so_lock);
+ }
+}
+
+#ifdef DEBUG
+/*
+ * Verify that the length stored in so_rcv_queued and the length of data blocks
+ * queued is same.
+ */
+static boolean_t
+so_check_length(sonode_t *so)
+{
+	mblk_t *msg;
+	int total = 0;
+
+	ASSERT(MUTEX_HELD(&so->so_lock));
+
+	/* Sum the data bytes of every message on both receive lists */
+	for (msg = so->so_rcv_q_head; msg != NULL; msg = msg->b_next)
+		total += msgdsize(msg);
+
+	for (msg = so->so_rcv_head; msg != NULL; msg = msg->b_next)
+		total += msgdsize(msg);
+
+	if (total == so->so_rcv_queued)
+		return (B_TRUE);
+
+	return (B_FALSE);
+}
+#endif
+
+/*
+ * Return the smod_version recorded in the socket module info that `sp'
+ * refers to. Both sp and sp->sp_smod_info must be non-NULL.
+ */
+int
+so_get_mod_version(struct sockparams *sp)
+{
+ ASSERT(sp != NULL && sp->sp_smod_info != NULL);
+ return (sp->sp_smod_info->smod_version);
+}
+
+/*
+ * so_start_fallback()
+ *
+ * Block new socket operations from coming in, and wait for active operations
+ * to complete. Threads that are sleeping will be woken up so they can get
+ * out of the way.
+ *
+ * The caller must be a reader on so_fallback_rwlock.
+ *
+ * Returns B_FALSE, with the lock unchanged, if a fallback is already
+ * pending. Otherwise sets SS_FALLBACK_PENDING and returns B_TRUE with
+ * so_fallback_rwlock held as writer.
+ */
+static boolean_t
+so_start_fallback(struct sonode *so)
+{
+ ASSERT(RW_READ_HELD(&so->so_fallback_rwlock));
+
+ mutex_enter(&so->so_lock);
+ if (so->so_state & SS_FALLBACK_PENDING) {
+ mutex_exit(&so->so_lock);
+ return (B_FALSE);
+ }
+ so->so_state |= SS_FALLBACK_PENDING;
+ /*
+ * Poke all threads that might be sleeping. Any operation that comes
+ * in after the cv_broadcast will observe the fallback pending flag
+ * which causes the call to return where it would normally sleep.
+ */
+ cv_broadcast(&so->so_state_cv); /* threads in connect() */
+ cv_broadcast(&so->so_rcv_cv); /* threads in recvmsg() */
+ cv_broadcast(&so->so_snd_cv); /* threads in sendmsg() */
+ /* so_acceptq_lock is taken while so_lock is still held. */
+ mutex_enter(&so->so_acceptq_lock);
+ cv_broadcast(&so->so_acceptq_cv); /* threads in accept() */
+ mutex_exit(&so->so_acceptq_lock);
+ mutex_exit(&so->so_lock);
+
+ /*
+ * The main reason for the rw_tryupgrade call is to provide
+ * observability during the fallback process. We want to
+ * be able to see if there are pending operations.
+ */
+ if (rw_tryupgrade(&so->so_fallback_rwlock) == 0) {
+ /*
+ * It is safe to drop and reacquire the fallback lock, because
+ * we are guaranteed that another fallback cannot take place.
+ */
+ rw_exit(&so->so_fallback_rwlock);
+ DTRACE_PROBE1(pending__ops__wait, (struct sonode *), so);
+ rw_enter(&so->so_fallback_rwlock, RW_WRITER);
+ DTRACE_PROBE1(pending__ops__complete, (struct sonode *), so);
+ }
+
+ return (B_TRUE);
+}
+
+/*
+ * so_end_fallback()
+ *
+ * Allow socket operations back in: clear SS_FALLBACK_PENDING and
+ * downgrade so_fallback_rwlock from writer to reader.
+ *
+ * The caller must be a writer on so_fallback_rwlock.
+ */
+static void
+so_end_fallback(struct sonode *so)
+{
+ ASSERT(RW_ISWRITER(&so->so_fallback_rwlock));
+
+ mutex_enter(&so->so_lock);
+ so->so_state &= ~SS_FALLBACK_PENDING;
+ mutex_exit(&so->so_lock);
+
+ rw_downgrade(&so->so_fallback_rwlock);
+}
+
+/*
+ * so_quiesced_cb()
+ *
+ * Callback passed to the protocol during fallback. It is called once
+ * the endpoint is quiescent.
+ *
+ * No requests from the user, no notifications from the protocol, so it
+ * is safe to synchronize the state. Data can also be moved without
+ * risk for reordering.
+ *
+ * NOTE: urgent data is dropped on the floor.
+ *
+ * We do not need to hold so_lock, since there can be only one thread
+ * operating on the sonode.
+ */
+static void
+so_quiesced_cb(sock_upper_handle_t sock_handle, queue_t *q,
+ struct T_capability_ack *tcap, struct sockaddr *laddr, socklen_t laddrlen,
+ struct sockaddr *faddr, socklen_t faddrlen, short opts)
+{
+ struct sonode *so = (struct sonode *)sock_handle;
+
+ sotpi_update_state(so, tcap, laddr, laddrlen, faddr, faddrlen, opts);
+
+ mutex_enter(&so->so_lock);
+ SOCKET_TIMER_CANCEL(so);
+ mutex_exit(&so->so_lock);
+ /*
+ * Move data to the STREAM head.
+ *
+ * First append the so_rcv_head list to the end of the so_rcv_q_head
+ * list so that a single loop below can drain everything in order.
+ */
+ if (so->so_rcv_head != NULL) {
+ if (so->so_rcv_q_last_head == NULL)
+ so->so_rcv_q_head = so->so_rcv_head;
+ else
+ so->so_rcv_q_last_head->b_next = so->so_rcv_head;
+ so->so_rcv_q_last_head = so->so_rcv_last_head;
+ }
+
+ /* Unlink each message and pass it up the stream via putnext(). */
+ while (so->so_rcv_q_head != NULL) {
+ mblk_t *mp = so->so_rcv_q_head;
+ size_t mlen = msgdsize(mp);
+
+ so->so_rcv_q_head = mp->b_next;
+ mp->b_next = NULL;
+ mp->b_prev = NULL;
+ so->so_rcv_queued -= mlen;
+ putnext(q, mp);
+ }
+ ASSERT(so->so_rcv_queued == 0);
+ so->so_rcv_head = NULL;
+ so->so_rcv_last_head = NULL;
+ so->so_rcv_q_head = NULL;
+ so->so_rcv_q_last_head = NULL;
+
+#ifdef DEBUG
+ if (so->so_oobmsg != NULL || so->so_oobmark > 0) {
+ cmn_err(CE_NOTE, "losing oob data due to tpi fallback\n");
+ }
+#endif
+ /* Urgent data is not carried over; free it and reset the mark. */
+ if (so->so_oobmsg != NULL) {
+ freemsg(so->so_oobmsg);
+ so->so_oobmsg = NULL;
+ }
+ so->so_oobmark = 0;
+
+ ASSERT(so->so_rcv_queued == 0);
+}
+
+/*
+ * so_tpi_fallback()
+ *
+ * This is the fallback initiation routine; things start here.
+ *
+ * Basic strategy:
+ * o Block new socket operations from coming in
+ * o Allocate/initiate info needed by TPI
+ * o Quiesce the connection, at which point we sync
+ * state and move data
+ * o Change operations (sonodeops) associated with the socket
+ * o Unblock threads waiting for the fallback to finish
+ *
+ * Returns 0 on success, EINVAL if the socket has no device or fallback
+ * function (or the TPI conversion fails), EAGAIN if a fallback is
+ * already pending, or the error from
+ * sockparams_hold_ephemeral_bydev().
+ */
+int
+so_tpi_fallback(struct sonode *so, struct cred *cr)
+{
+ int error;
+ queue_t *q;
+ struct sockparams *sp;
+ struct sockparams *newsp;
+ so_proto_fallback_func_t fbfunc;
+ boolean_t direct;
+
+ error = 0;
+ sp = so->so_sockparams;
+ fbfunc = sp->sp_smod_info->smod_proto_fallback_func;
+
+ /*
+ * Fallback can only happen if there is a device associated
+ * with the sonode, and the socket module has a fallback function.
+ */
+ if (!SOCKPARAMS_HAS_DEVICE(sp) || fbfunc == NULL)
+ return (EINVAL);
+
+ /*
+ * Initiate fallback; upon success we know that no new requests
+ * will come in from the user.
+ */
+ if (!so_start_fallback(so))
+ return (EAGAIN);
+
+ newsp = sockparams_hold_ephemeral_bydev(so->so_family, so->so_type,
+ so->so_protocol, so->so_sockparams->sp_sdev_info.sd_devpath,
+ KM_SLEEP, &error);
+ if (error != 0)
+ goto out;
+
+ /* Turn off sodirect, if present, before converting the sonode. */
+ if (so->so_direct != NULL) {
+ sodirect_t *sodp = so->so_direct;
+ mutex_enter(sodp->sod_lockp);
+
+ so->so_direct->sod_state &= ~SOD_ENABLED;
+ so->so_state &= ~SS_SODIRECT;
+ ASSERT(sodp->sod_uioafh == NULL);
+ mutex_exit(sodp->sod_lockp);
+ }
+
+ /* Turn sonode into a TPI socket */
+ q = sotpi_convert_sonode(so, newsp, &direct, cr);
+ if (q == NULL) {
+ zcmn_err(getzoneid(), CE_WARN,
+ "Failed to convert socket to TPI. Pid = %d\n",
+ curproc->p_pid);
+ /* Drop the reference taken on newsp above. */
+ SOCKPARAMS_DEC_REF(newsp);
+ error = EINVAL;
+ goto out;
+ }
+
+ /*
+ * Now tell the protocol to start using TPI. so_quiesced_cb will be
+ * called once it's safe to synchronize state.
+ */
+ DTRACE_PROBE1(proto__fallback__begin, struct sonode *, so);
+ /* FIXME assumes this cannot fail. TCP can fail to enter squeue */
+ (*fbfunc)(so->so_proto_handle, q, direct, so_quiesced_cb);
+ DTRACE_PROBE1(proto__fallback__end, struct sonode *, so);
+
+ /*
+ * Free all pending connection indications, i.e., socket_accept() has
+ * not yet pulled the connection off the queue. The transport sent
+ * a T_CONN_IND message for each pending connection to the STREAM head.
+ */
+ so_acceptq_flush(so);
+
+ mutex_enter(&so->so_lock);
+ so->so_state |= SS_FALLBACK_COMP;
+ mutex_exit(&so->so_lock);
+
+ /*
+ * Swap the sonode ops. Socket operations that come in once this
+ * is done will proceed without blocking.
+ */
+ so->so_ops = &sotpi_sonodeops;
+
+ /*
+ * Wake up any threads stuck in poll. This is needed since the poll
+ * head changes when the fallback happens (moves from the sonode to
+ * the STREAMS head).
+ */
+ pollwakeup(&so->so_poll_list, POLLERR);
+out:
+ /* Reached on success and failure alike: let operations back in. */
+ so_end_fallback(so);
+
+ return (error);
+}
diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_vnops.c b/usr/src/uts/common/fs/sockfs/sockcommon_vnops.c
new file mode 100644
index 0000000000..ffcecfa7c1
--- /dev/null
+++ b/usr/src/uts/common/fs/sockfs/sockcommon_vnops.c
@@ -0,0 +1,482 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <sys/t_lock.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bitmap.h>
+#include <sys/debug.h>
+#include <sys/errno.h>
+#include <sys/strsubr.h>
+#include <sys/cmn_err.h>
+#include <sys/sysmacros.h>
+#include <sys/filio.h>
+#include <sys/flock.h>
+#include <sys/stat.h>
+#include <sys/share.h>
+
+#include <sys/vfs.h>
+#include <sys/vfs_opreg.h>
+
+#include <sys/sockio.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/strsun.h>
+
+#include <fs/sockfs/sockcommon.h>
+#include <fs/sockfs/socktpi.h>
+
+/*
+ * Generic vnode ops
+ */
+static int socket_vop_open(struct vnode **, int, struct cred *,
+ caller_context_t *);
+static int socket_vop_close(struct vnode *, int, int, offset_t,
+ struct cred *, caller_context_t *);
+static int socket_vop_read(struct vnode *, struct uio *, int,
+ struct cred *, caller_context_t *);
+static int socket_vop_write(struct vnode *, struct uio *, int,
+ struct cred *, caller_context_t *);
+static int socket_vop_ioctl(struct vnode *, int, intptr_t, int,
+ struct cred *, int32_t *, caller_context_t *);
+static int socket_vop_setfl(struct vnode *, int, int, cred_t *,
+ caller_context_t *);
+static int socket_vop_getattr(struct vnode *, struct vattr *, int,
+ struct cred *, caller_context_t *);
+static int socket_vop_setattr(struct vnode *, struct vattr *, int,
+ struct cred *, caller_context_t *);
+static int socket_vop_access(struct vnode *, int, int, struct cred *,
+ caller_context_t *);
+static int socket_vop_fsync(struct vnode *, int, struct cred *,
+ caller_context_t *);
+static void socket_vop_inactive(struct vnode *, struct cred *,
+ caller_context_t *);
+static int socket_vop_fid(struct vnode *, struct fid *,
+ caller_context_t *);
+static int socket_vop_seek(struct vnode *, offset_t, offset_t *,
+ caller_context_t *);
+static int socket_vop_poll(struct vnode *, short, int, short *,
+ struct pollhead **, caller_context_t *);
+
+extern int socket_close_internal(struct sonode *, int, cred_t *);
+extern void socket_destroy_internal(struct sonode *, cred_t *);
+
+/*
+ * Vnode operations vector for sockets, together with the template that
+ * maps each VOPNAME_* to its socket_vop_* handler in this file.
+ * VOPNAME_DISPOSE is routed to fs_error. The NULL, NULL pair terminates
+ * the template.
+ */
+struct vnodeops *socket_vnodeops;
+const fs_operation_def_t socket_vnodeops_template[] = {
+ VOPNAME_OPEN, { .vop_open = socket_vop_open },
+ VOPNAME_CLOSE, { .vop_close = socket_vop_close },
+ VOPNAME_READ, { .vop_read = socket_vop_read },
+ VOPNAME_WRITE, { .vop_write = socket_vop_write },
+ VOPNAME_IOCTL, { .vop_ioctl = socket_vop_ioctl },
+ VOPNAME_SETFL, { .vop_setfl = socket_vop_setfl },
+ VOPNAME_GETATTR, { .vop_getattr = socket_vop_getattr },
+ VOPNAME_SETATTR, { .vop_setattr = socket_vop_setattr },
+ VOPNAME_ACCESS, { .vop_access = socket_vop_access },
+ VOPNAME_FSYNC, { .vop_fsync = socket_vop_fsync },
+ VOPNAME_INACTIVE, { .vop_inactive = socket_vop_inactive },
+ VOPNAME_FID, { .vop_fid = socket_vop_fid },
+ VOPNAME_SEEK, { .vop_seek = socket_vop_seek },
+ VOPNAME_POLL, { .vop_poll = socket_vop_poll },
+ VOPNAME_DISPOSE, { .error = fs_error },
+ NULL, NULL
+};
+
+
+/*
+ * generic vnode ops
+ */
+
+/*
+ * VOP_OPEN for sockets: account for one more open of the sonode by
+ * bumping so_count. Always returns 0.
+ */
+/*ARGSUSED*/
+static int
+socket_vop_open(struct vnode **vpp, int flag, struct cred *cr,
+    caller_context_t *ct)
+{
+	struct vnode *vp = *vpp;
+	struct sonode *so = VTOSO(vp);
+
+	flag &= ~FCREAT;		/* paranoia */
+	mutex_enter(&so->so_lock);
+	so->so_count++;
+	/*
+	 * Check for wraparound while so_lock is still held, so that the
+	 * value examined is the one this thread just wrote.
+	 */
+	ASSERT(so->so_count != 0);
+	mutex_exit(&so->so_lock);
+
+	ASSERT(vp->v_type == VSOCK);
+
+	return (0);
+}
+
+/*
+ * VOP_CLOSE for sockets. Cleans up the calling process' locks and
+ * shares on the vnode (and stream state if v_stream is set). When
+ * count is not greater than 1, so_count is decremented and, once it
+ * reaches zero, the socket is shut down via socket_close_internal().
+ *
+ * Returns 0, or the result of socket_close_internal().
+ */
+/*ARGSUSED*/
+static int
+socket_vop_close(struct vnode *vp, int flag, int count, offset_t offset,
+    struct cred *cr, caller_context_t *ct)
+{
+	struct sonode *so = VTOSO(vp);
+	boolean_t last_ref;
+
+	ASSERT(vp->v_type == VSOCK);
+
+	cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
+	cleanshares(vp, ttoproc(curthread)->p_pid);
+
+	if (vp->v_stream)
+		strclean(vp);
+
+	if (count > 1) {
+		dprint(2, ("socket_vop_close: count %d\n", count));
+		return (0);
+	}
+
+	mutex_enter(&so->so_lock);
+	last_ref = (--so->so_count == 0) ? B_TRUE : B_FALSE;
+	mutex_exit(&so->so_lock);
+
+	if (!last_ref)
+		return (0);
+
+	/* Last reference is gone: initiate connection shutdown. */
+	return (socket_close_internal(so, flag, cr));
+}
+
+/*
+ * VOP_READ for sockets: a read is a socket_recvmsg() with an all-zero
+ * message header.
+ */
+/*ARGSUSED2*/
+static int
+socket_vop_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr,
+    caller_context_t *ct)
+{
+	struct nmsghdr hdr;
+	struct sonode *so;
+
+	so = VTOSO(vp);
+	ASSERT(vp->v_type == VSOCK);
+
+	bzero((void *)&hdr, sizeof (hdr));
+	return (socket_recvmsg(so, &hdr, uiop, cr));
+}
+
+/*
+ * VOP_WRITE for sockets: a write is a socket_sendmsg() with a zeroed
+ * message header; MSG_EOR is set in msg_flags when the socket is not
+ * a byte stream (SM_BYTESTREAM clear in so_mode).
+ */
+/*ARGSUSED2*/
+static int
+socket_vop_write(struct vnode *vp, struct uio *uiop, int ioflag,
+    struct cred *cr, caller_context_t *ct)
+{
+	struct nmsghdr hdr;
+	struct sonode *so;
+
+	so = VTOSO(vp);
+	ASSERT(vp->v_type == VSOCK);
+
+	bzero((void *)&hdr, sizeof (hdr));
+
+	/* Record oriented sockets: each write is a complete record */
+	if ((so->so_mode & SM_BYTESTREAM) == 0)
+		hdr.msg_flags = MSG_EOR;
+
+	return (socket_sendmsg(so, &hdr, uiop, cr));
+}
+
+/*
+ * VOP_IOCTL for sockets: forward the request to socket_ioctl() for the
+ * sonode behind vp and return its result.
+ */
+/*ARGSUSED4*/
+static int
+socket_vop_ioctl(struct vnode *vp, int cmd, intptr_t arg, int mode,
+ struct cred *cr, int32_t *rvalp, caller_context_t *ct)
+{
+ struct sonode *so = VTOSO(vp);
+
+ ASSERT(vp->v_type == VSOCK);
+
+ return (socket_ioctl(so, cmd, arg, mode, cr, rvalp));
+}
+
+/*
+ * Allow any flags. Record FNDELAY and FNONBLOCK so that they can be inherited
+ * from listener to acceptor.
+ *
+ * If the FASYNC setting changes and the socket is not SOV_STREAM, the
+ * change is applied by issuing a FIOASYNC ioctl.
+ *
+ * Returns 0, or the error from socket_ioctl().
+ */
+/* ARGSUSED */
+static int
+socket_vop_setfl(vnode_t *vp, int oflags, int nflags, cred_t *cr,
+    caller_context_t *ct)
+{
+	struct sonode *so = VTOSO(vp);
+	int error = 0;
+
+	ASSERT(vp->v_type == VSOCK);
+
+	mutex_enter(&so->so_lock);
+	if (nflags & FNDELAY)
+		so->so_state |= SS_NDELAY;
+	else
+		so->so_state &= ~SS_NDELAY;
+	if (nflags & FNONBLOCK)
+		so->so_state |= SS_NONBLOCK;
+	else
+		so->so_state &= ~SS_NONBLOCK;
+	/*
+	 * Sample SS_ASYNC before dropping so_lock, so the value is read
+	 * under the same lock that protects updates to so_state.
+	 */
+	if (so->so_state & SS_ASYNC)
+		oflags |= FASYNC;
+	mutex_exit(&so->so_lock);
+
+	/*
+	 * Sets/clears the SS_ASYNC flag based on the presence/absence
+	 * of the FASYNC flag passed to fcntl(F_SETFL).
+	 * This exists solely for BSD fcntl() FASYNC compatibility.
+	 */
+	if (((oflags ^ nflags) & FASYNC) && so->so_version != SOV_STREAM) {
+		int async = nflags & FASYNC;
+		int32_t rv;
+
+		/*
+		 * For non-TPI sockets all we have to do is set/remove the
+		 * SS_ASYNC bit, but for TPI it is more involved. For that
+		 * reason we delegate the job to the protocol's ioctl handler.
+		 */
+		error = socket_ioctl(so, FIOASYNC, (intptr_t)&async, FKIOCTL,
+		    cr, &rv);
+	}
+	return (error);
+}
+
+
+/*
+ * Get the made up attributes for the vnode.
+ * 4.3BSD returns the current time for all the timestamps.
+ * 4.4BSD returns 0 for all the timestamps.
+ * Here we use the access and modified times recorded in the sonode
+ * for STREAMS based sockets, and 0 for all other sockets.
+ *
+ * Just like in BSD there is no effect on the underlying file system node
+ * bound to an AF_UNIX pathname.
+ *
+ * When sockmod has been popped this will act just like a stream. Since
+ * a socket is always a clone there is no need to inspect the attributes
+ * of the "realvp".
+ *
+ * Always returns 0.
+ */
+/* ARGSUSED */
+static int
+socket_vop_getattr(struct vnode *vp, struct vattr *vap, int flags,
+    struct cred *cr, caller_context_t *ct)
+{
+	dev_t fsid;
+	struct sonode *so;
+	static int sonode_shift = 0;
+
+	/*
+	 * Calculate the amount of bitshift to a sonode pointer which will
+	 * still keep it unique. See below.
+	 */
+	if (sonode_shift == 0)
+		sonode_shift = highbit(sizeof (struct sonode));
+	ASSERT(sonode_shift > 0);
+
+	so = VTOSO(vp);
+	fsid = sockdev;
+
+	if (so->so_version == SOV_STREAM) {
+		/*
+		 * The imaginary "sockmod" has been popped - act
+		 * as a stream
+		 */
+		vap->va_type = VCHR;
+		vap->va_mode = 0;
+	} else {
+		vap->va_type = vp->v_type;
+		vap->va_mode = S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|
+		    S_IROTH|S_IWOTH;
+	}
+	vap->va_uid = vap->va_gid = 0;
+	vap->va_fsid = fsid;
+	/*
+	 * If the va_nodeid is > MAX_USHORT, then i386 stats might fail.
+	 * So we shift down the sonode pointer to try and get the most
+	 * uniqueness into 16-bits.
+	 */
+	vap->va_nodeid = ((ino_t)so >> sonode_shift) & 0xFFFF;
+	vap->va_nlink = 0;
+	vap->va_size = 0;
+
+	/*
+	 * We need to zero out the va_rdev to avoid some fstats getting
+	 * EOVERFLOW. This also mimics SunOS 4.x and BSD behavior.
+	 */
+	vap->va_rdev = (dev_t)0;
+	vap->va_blksize = MAXBSIZE;
+	vap->va_nblocks = btod(vap->va_size);
+
+	if (!SOCK_IS_NONSTR(so)) {
+		sotpi_info_t *sti = SOTOTPI(so);
+
+		/* The recorded times are read under so_lock */
+		mutex_enter(&so->so_lock);
+		vap->va_atime.tv_sec = sti->sti_atime;
+		vap->va_mtime.tv_sec = sti->sti_mtime;
+		vap->va_ctime.tv_sec = sti->sti_ctime;
+		mutex_exit(&so->so_lock);
+	} else {
+		vap->va_atime.tv_sec = 0;
+		vap->va_mtime.tv_sec = 0;
+		vap->va_ctime.tv_sec = 0;
+	}
+
+	vap->va_atime.tv_nsec = 0;
+	vap->va_mtime.tv_nsec = 0;
+	vap->va_ctime.tv_nsec = 0;
+	vap->va_seq = 0;
+
+	return (0);
+}
+
+/*
+ * Set attributes.
+ * Just like in BSD there is no effect on the underlying file system node
+ * bound to an AF_UNIX pathname.
+ *
+ * When sockmod has been popped this will act just like a stream. Since
+ * a socket is always a clone there is no need to modify the attributes
+ * of the "realvp".
+ */
+/* ARGSUSED */
+int
+socket_vop_setattr(struct vnode *vp, struct vattr *vap, int flags,
+    struct cred *cr, caller_context_t *ct)
+{
+	struct sonode *so = VTOSO(vp);
+	sotpi_info_t *sti;
+
+	/* Only STREAMS sockets record times; nothing to do for the rest. */
+	if (SOCK_IS_NONSTR(so))
+		return (0);
+
+	sti = SOTOTPI(so);
+
+	mutex_enter(&so->so_lock);
+	if (vap->va_mask & AT_MTIME) {
+		/* A modification also stamps the change time with "now". */
+		sti->sti_mtime = vap->va_mtime.tv_sec;
+		sti->sti_ctime = gethrestime_sec();
+	}
+	if (vap->va_mask & AT_ATIME)
+		sti->sti_atime = vap->va_atime.tv_sec;
+	mutex_exit(&so->so_lock);
+
+	return (0);
+}
+
+/*
+ * Check if user is allowed to access vp. For non-STREAMS based sockets,
+ * there might not be a device attached to the file system. So for those
+ * types of sockets there are no permissions to check.
+ *
+ * XXX Should there be some other mechanism to check access rights?
+ */
+/*ARGSUSED*/
+int
+socket_vop_access(struct vnode *vp, int mode, int flags, struct cred *cr,
+    caller_context_t *ct)
+{
+	struct sonode *so = VTOSO(vp);
+
+	/* Non-STREAMS sockets have no device vnode to check against. */
+	if (SOCK_IS_NONSTR(so))
+		return (0);
+
+	/* Defer the decision to the STREAMS device backing this socket. */
+	ASSERT(so->so_sockparams->sp_sdev_info.sd_vnode != NULL);
+	return (VOP_ACCESS(so->so_sockparams->sp_sdev_info.sd_vnode,
+	    mode, flags, cr, NULL));
+}
+
+/*
+ * 4.3BSD and 4.4BSD fail a fsync on a socket with EINVAL.
+ * This code does the same to be compatible and also to not give an
+ * application the impression that the data has actually been "synced"
+ * to the other end of the connection.
+ */
+/* ARGSUSED */
+int
+socket_vop_fsync(struct vnode *vp, int syncflag, struct cred *cr,
+ caller_context_t *ct)
+{
+ /* Always rejected; none of the arguments are examined. */
+ return (EINVAL);
+}
+
+/*ARGSUSED*/
+static void
+socket_vop_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ct)
+{
+	struct sonode *so = VTOSO(vp);
+
+	ASSERT(vp->v_type == VSOCK);
+
+	mutex_enter(&vp->v_lock);
+
+	/* There must be at least one hold left for us to drop. */
+	if (vp->v_count < 1)
+		cmn_err(CE_PANIC, "socket_inactive: Bad v_count");
+
+	/*
+	 * Drop the temporary hold by vn_rele. If other holds remain the
+	 * vnode is still in use and must not be torn down.
+	 */
+	vp->v_count--;
+	if (vp->v_count != 0) {
+		mutex_exit(&vp->v_lock);
+		return;
+	}
+	mutex_exit(&vp->v_lock);
+
+	ASSERT(!vn_has_cached_data(vp));
+
+	/* That was the last hold: do the socket specific clean-up. */
+	socket_destroy_internal(so, cr);
+}
+
+/*
+ * Always fails with EINVAL; no file identifier is ever produced and fidp
+ * is left untouched.
+ */
+/* ARGSUSED */
+int
+socket_vop_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
+{
+ return (EINVAL);
+}
+
+/*
+ * Sockets are not seekable.
+ * (and there is a bug to fix STREAMS to make them fail this as well).
+ */
+/*ARGSUSED*/
+int
+socket_vop_seek(struct vnode *vp, offset_t ooff, offset_t *noffp,
+ caller_context_t *ct)
+{
+ /* Unconditional failure: the offsets are neither read nor updated. */
+ return (ESPIPE);
+}
+
+/*ARGSUSED*/
+static int
+socket_vop_poll(struct vnode *vp, short events, int anyyet, short *reventsp,
+    struct pollhead **phpp, caller_context_t *ct)
+{
+	ASSERT(vp->v_type == VSOCK);
+
+	/* Thin vnode-op shim: hand the request to the socket layer. */
+	return (socket_poll(VTOSO(vp), events, anyyet, reventsp, phpp));
+}
diff --git a/usr/src/uts/common/fs/sockfs/socknotify.c b/usr/src/uts/common/fs/sockfs/socknotify.c
new file mode 100644
index 0000000000..788efa9ff5
--- /dev/null
+++ b/usr/src/uts/common/fs/sockfs/socknotify.c
@@ -0,0 +1,379 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/stropts.h>
+#include <sys/socketvar.h>
+#include <sys/ksocket.h>
+#include <io/ksocket/ksocket_impl.h>
+#include <fs/sockfs/sockcommon.h>
+
+/*
+ * There can only be a single thread waiting for data (enforced by
+ * so_lock_read()), whereas for write there might be multiple threads
+ * waiting for transmit buffers. So therefore we use cv_broadcast for
+ * write and cv_signal for read.
+ */
+/*
+ * Signal so_rcv_cv, but only when so_rcv_wakeup is set; the flag is cleared
+ * first so the signal is issued once per request.
+ *
+ * Expands to a bare brace block: use it as a stand-alone statement, never
+ * as the body of an `if' that is followed by an `else'.
+ */
+#define SO_WAKEUP_READER(so) { \
+ if ((so)->so_rcv_wakeup) { \
+ (so)->so_rcv_wakeup = B_FALSE; \
+ cv_signal(&(so)->so_rcv_cv); \
+ } \
+}
+
+/*
+ * Broadcast on so_snd_cv, but only when so_snd_wakeup is set; the flag is
+ * cleared first. Same usage restriction as SO_WAKEUP_READER().
+ */
+#define SO_WAKEUP_WRITER(so) { \
+ if ((so)->so_snd_wakeup) { \
+ (so)->so_snd_wakeup = B_FALSE; \
+ cv_broadcast(&(so)->so_snd_cv); \
+ } \
+}
+
+static int i_so_notify_last_rx(struct sonode *, int *, int *);
+static int i_so_notify_last_tx(struct sonode *, int *, int *);
+
+/*
+ * The notification functions must be called with so_lock held,
+ * and they will all *drop* so_lock before returning.
+ */
+
+/*
+ * Wake up anyone waiting for the connection to be established.
+ */
+void
+so_notify_connected(struct sonode *so)
+{
+	ASSERT(MUTEX_HELD(&so->so_lock));
+
+	if (!IS_KERNEL_SOCKET(so)) {
+		/* User socket: signal first, poll wakeup after unlocking. */
+		socket_sendsig(so, SOCKETSIG_WRITE);
+		mutex_exit(&so->so_lock);
+		pollwakeup(&so->so_poll_list, POLLOUT);
+	} else {
+		/* Kernel socket: deliver the callback while still locked. */
+		KSOCKET_CALLBACK(so, connected, 0);
+		mutex_exit(&so->so_lock);
+	}
+
+	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
+}
+
+/*
+ * The socket is disconnecting, so no more data can be sent. Wake up
+ * anyone that is waiting to send data.
+ */
+void
+so_notify_disconnecting(struct sonode *so)
+{
+	int sigev = 0;
+	int pollev = 0;
+
+	ASSERT(MUTEX_HELD(&so->so_lock));
+
+	if (!IS_KERNEL_SOCKET(so)) {
+		/*
+		 * User socket: signal and poll-wake only if this is the
+		 * first time the "last write" notification is produced.
+		 */
+		if (i_so_notify_last_tx(so, &pollev, &sigev)) {
+			socket_sendsig(so, sigev);
+			mutex_exit(&so->so_lock);
+			pollwakeup(&so->so_poll_list, pollev);
+		} else {
+			mutex_exit(&so->so_lock);
+		}
+	} else {
+		SO_WAKEUP_WRITER(so);
+		KSOCKET_CALLBACK(so, cantsendmore, 0);
+		mutex_exit(&so->so_lock);
+	}
+
+	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
+}
+
+/*
+ * The socket is disconnected, so no more data can be sent or received.
+ * Wake up anyone that is waiting to send or receive data.
+ */
+void
+so_notify_disconnected(struct sonode *so, int error)
+{
+	int sigev = 0;
+	int pollev = 0;
+
+	ASSERT(MUTEX_HELD(&so->so_lock));
+
+	/* Both directions are finished; collect the events to deliver. */
+	(void) i_so_notify_last_tx(so, &pollev, &sigev);
+	(void) i_so_notify_last_rx(so, &pollev, &sigev);
+
+	if (!IS_KERNEL_SOCKET(so)) {
+		if (sigev != 0)
+			socket_sendsig(so, sigev);
+		mutex_exit(&so->so_lock);
+		if (pollev != 0)
+			pollwakeup(&so->so_poll_list, pollev);
+	} else {
+		KSOCKET_CALLBACK(so, disconnected, error);
+		mutex_exit(&so->so_lock);
+	}
+
+	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
+}
+
+/*
+ * The socket is writeable. Wake up anyone waiting to send data.
+ */
+void
+so_notify_writable(struct sonode *so)
+{
+	ASSERT(MUTEX_HELD(&so->so_lock));
+
+	/* Blocked senders are woken for both kernel and user sockets. */
+	SO_WAKEUP_WRITER(so);
+
+	if (!IS_KERNEL_SOCKET(so)) {
+		socket_sendsig(so, SOCKETSIG_WRITE);
+		mutex_exit(&so->so_lock);
+		pollwakeup(&so->so_poll_list, POLLOUT);
+	} else {
+		KSOCKET_CALLBACK(so, cansend, 0);
+		mutex_exit(&so->so_lock);
+	}
+
+	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
+}
+
+/*
+ * Data is available, so wake up anyone waiting for data.
+ */
+void
+so_notify_data(struct sonode *so, size_t qlen)
+{
+	ASSERT(MUTEX_HELD(&so->so_lock));
+
+	SO_WAKEUP_READER(so);
+
+	if (!IS_KERNEL_SOCKET(so)) {
+		socket_sendsig(so, SOCKETSIG_READ);
+
+		if ((so->so_pollev & (SO_POLLEV_IN|SO_POLLEV_ALWAYS)) == 0) {
+			/* No poll wakeup was requested. */
+			mutex_exit(&so->so_lock);
+		} else {
+			/* Consume the one-shot request, then wake pollers. */
+			so->so_pollev &= ~SO_POLLEV_IN;
+			mutex_exit(&so->so_lock);
+			pollwakeup(&so->so_poll_list, POLLIN|POLLRDNORM);
+		}
+	} else {
+		KSOCKET_CALLBACK(so, newdata, qlen);
+		mutex_exit(&so->so_lock);
+	}
+
+	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
+}
+
+/*
+ * Transient error. Wake up anyone waiting to send or receive data.
+ */
+void
+so_notify_error(struct sonode *so)
+{
+	ASSERT(MUTEX_HELD(&so->so_lock));
+
+	/* An error concerns both directions, so wake both sides. */
+	SO_WAKEUP_WRITER(so);
+	SO_WAKEUP_READER(so);
+
+	if (!IS_KERNEL_SOCKET(so)) {
+		socket_sendsig(so, SOCKETSIG_WRITE|SOCKETSIG_READ);
+		so->so_pollev &= ~SO_POLLEV_IN;
+		mutex_exit(&so->so_lock);
+		pollwakeup(&so->so_poll_list, POLLOUT|POLLIN|POLLRDNORM);
+	} else {
+		KSOCKET_CALLBACK(so, error, 0);
+		mutex_exit(&so->so_lock);
+	}
+
+	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
+}
+
+/*
+ * Out-of-band data is incoming, notify any interested parties.
+ */
+void
+so_notify_oobsig(struct sonode *so)
+{
+	/*
+	 * Same locking contract as the other notification routines in this
+	 * file: entered with so_lock held, returns with it dropped.
+	 */
+	ASSERT(MUTEX_HELD(&so->so_lock));
+
+	/* Post the urgent-data signal under the lock, poll-wake after it. */
+	socket_sendsig(so, SOCKETSIG_URG);
+	mutex_exit(&so->so_lock);
+	pollwakeup(&so->so_poll_list, POLLRDBAND);
+
+	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
+}
+
+/*
+ * Received out-of-band data. If the OOB data is delivered inline, then
+ * in addition to the regular OOB notification, anyone waiting for normal
+ * data is also notified.
+ */
+void
+so_notify_oobdata(struct sonode *so, boolean_t oob_inline)
+{
+	ASSERT(MUTEX_HELD(&so->so_lock));
+	SOD_UIOAFINI(so->so_direct);
+
+	/*
+	 * Inline OOB data is read like normal data, so a blocked reader has
+	 * to be woken too. This must happen before so_lock is dropped: the
+	 * macro reads and clears so_rcv_wakeup, and every other use of it
+	 * in this file runs with so_lock held.
+	 */
+	if (oob_inline)
+		SO_WAKEUP_READER(so);
+
+	if (IS_KERNEL_SOCKET(so)) {
+		KSOCKET_CALLBACK(so, oobdata, 0);
+		mutex_exit(&so->so_lock);
+	} else if (oob_inline) {
+		/* Report both the urgent band and normal readability. */
+		socket_sendsig(so, SOCKETSIG_READ);
+		so->so_pollev &= ~SO_POLLEV_IN;
+		mutex_exit(&so->so_lock);
+		pollwakeup(&so->so_poll_list,
+		    POLLRDBAND|POLLIN|POLLRDNORM);
+	} else {
+		/* Out-of-line OOB data: only the urgent band is reported. */
+		mutex_exit(&so->so_lock);
+		pollwakeup(&so->so_poll_list, POLLRDBAND);
+	}
+
+	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
+}
+
+/*
+ * End-of-file has been reached, so peer will send no new data. Wake up
+ * anyone that is waiting for data.
+ */
+void
+so_notify_eof(struct sonode *so)
+{
+	int sigev = 0;
+	int pollev = 0;
+
+	ASSERT(MUTEX_HELD(&so->so_lock));
+
+	(void) i_so_notify_last_rx(so, &pollev, &sigev);
+
+	if (!IS_KERNEL_SOCKET(so)) {
+		/* Deliver whatever the last-rx helper asked for, if anything. */
+		if (sigev != 0)
+			socket_sendsig(so, sigev);
+		mutex_exit(&so->so_lock);
+		if (pollev != 0)
+			pollwakeup(&so->so_poll_list, pollev);
+	} else {
+		SO_WAKEUP_READER(so);
+		KSOCKET_CALLBACK(so, cantrecvmore, 0);
+		mutex_exit(&so->so_lock);
+	}
+
+	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
+}
+
+/*
+ * Wake up anyone waiting for a new connection.
+ */
+void
+so_notify_newconn(struct sonode *so)
+{
+	ASSERT(MUTEX_HELD(&so->so_lock));
+
+	if (!IS_KERNEL_SOCKET(so)) {
+		boolean_t wake;
+
+		socket_sendsig(so, SOCKETSIG_READ);
+
+		/* Decide under the lock, act on it after the lock is gone. */
+		wake = (so->so_pollev & (SO_POLLEV_IN|SO_POLLEV_ALWAYS)) != 0;
+		if (wake)
+			so->so_pollev &= ~SO_POLLEV_IN;
+		mutex_exit(&so->so_lock);
+		if (wake)
+			pollwakeup(&so->so_poll_list, POLLIN|POLLRDNORM);
+	} else {
+		KSOCKET_CALLBACK(so, newconn, so->so_rcv_queued);
+		mutex_exit(&so->so_lock);
+	}
+
+	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
+}
+
+/*
+ * User initiated shutdown/close, wake anyone that is trying to do
+ * an operation that is no longer possible.
+ */
+void
+so_notify_shutdown(struct sonode *so)
+{
+	int sigev = 0;
+	int pollev = 0;
+
+	ASSERT(MUTEX_HELD(&so->so_lock));
+	ASSERT(so->so_state & (SS_CANTSENDMORE|SS_CANTRCVMORE));
+
+	/* Gather the final notifications for each direction that was shut. */
+	if ((so->so_state & SS_CANTSENDMORE) != 0)
+		(void) i_so_notify_last_tx(so, &pollev, &sigev);
+	if ((so->so_state & SS_CANTRCVMORE) != 0)
+		(void) i_so_notify_last_rx(so, &pollev, &sigev);
+
+	/* Signals go out under the lock, poll wakeups once it is dropped. */
+	if (sigev != 0)
+		socket_sendsig(so, sigev);
+	mutex_exit(&so->so_lock);
+	if (pollev != 0)
+		pollwakeup(&so->so_poll_list, pollev);
+
+	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
+}
+
+/*
+ * No more data will be coming in, and this will be the last notification
+ * made.
+ */
+static int
+i_so_notify_last_rx(struct sonode *so, int *pollev, int *sigev)
+{
+	/* The final read notification is produced at most once. */
+	if (so->so_state & SS_SENTLASTREADSIG)
+		return (0);
+
+	SOCKET_TIMER_CANCEL(so);
+	SO_WAKEUP_READER(so);
+	so->so_state |= SS_SENTLASTREADSIG;
+	so->so_pollev &= ~SO_POLLEV_IN;
+
+	/* Tell the caller which poll and signal events to deliver. */
+	*pollev |= POLLIN|POLLRDNORM;
+	*sigev |= SOCKETSIG_READ;
+
+	return (1);
+}
+
+/*
+ * The socket is un-writeable. Make one last notification.
+ */
+static int
+i_so_notify_last_tx(struct sonode *so, int *pollev, int *sigev)
+{
+	/* The final write notification is produced at most once. */
+	if (so->so_state & SS_SENTLASTWRITESIG)
+		return (0);
+
+	SO_WAKEUP_WRITER(so);
+	so->so_state |= SS_SENTLASTWRITESIG;
+
+	/* Tell the caller which poll and signal events to deliver. */
+	*pollev |= POLLOUT;
+	*sigev |= SOCKETSIG_WRITE;
+
+	return (1);
+}
diff --git a/usr/src/uts/common/fs/sockfs/sockparams.c b/usr/src/uts/common/fs/sockfs/sockparams.c
new file mode 100644
index 0000000000..2e1d11c64e
--- /dev/null
+++ b/usr/src/uts/common/fs/sockfs/sockparams.c
@@ -0,0 +1,723 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <sys/t_lock.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/cmn_err.h>
+#include <sys/list.h>
+
+#include <sys/stropts.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+
+#include <fs/sockfs/sockcommon.h>
+#include <fs/sockfs/socktpi.h>
+
+/*
+ * Socket Parameters
+ *
+ * Socket parameter (struct sockparams) entries represent the socket types
+ * available on the system.
+ *
+ * Flags (sp_flags):
+ *
+ * SOCKPARAMS_EPHEMERAL: A temporary sockparams entry that will be deleted
+ * as soon as its ref count drops to zero. In addition, ephemeral entries will
+ * never be hooked onto the global sockparams list. Ephemeral entries are
+ * created when application requests to create a socket using an application
+ * supplied device path, or when a socket is falling back to TPI.
+ *
+ * Lock order:
+ * The lock order is splist_lock -> sp_lock.
+ * The lock order is sp_ephem_lock -> sp_lock.
+ */
+extern int kobj_path_exists(char *, int);
+extern void nl7c_init(void);
+extern int sockfs_defer_nl7c_init;
+
+static int sockparams_sdev_init(struct sockparams *, char *, int);
+static void sockparams_sdev_fini(struct sockparams *);
+
+/*
+ * Global sockparams list (populated via soconfig(1M)).
+ */
+static list_t sphead;
+static krwlock_t splist_lock;
+
+/*
+ * List of ephemeral sockparams.
+ */
+static list_t sp_ephem_list;
+static krwlock_t sp_ephem_lock;
+
+/*
+ * Match criteria used by sockparams_find(). The INC_DEV and INC_MOD
+ * variants compare an additional name string supplied by the caller.
+ */
+typedef enum sp_match_criteria {
+ SP_MATCH_EXACT, /* family, type & proto must match */
+ SP_MATCH_WILDCARD, /* family & type must match, proto can be 0 */
+ SP_MATCH_INC_DEV, /* same as exact, but dev must also match */
+ SP_MATCH_INC_MOD /* same as exact, but mod must also match */
+} sp_match_criteria_t;
+
+
+void
+sockparams_init(void)
+{
+	/* Global list, populated through soconfig(). */
+	rw_init(&splist_lock, NULL, RW_DEFAULT, NULL);
+	list_create(&sphead, sizeof (struct sockparams),
+	    offsetof(struct sockparams, sp_node));
+
+	/* List of ephemeral entries; links through the same sp_node field. */
+	rw_init(&sp_ephem_lock, NULL, RW_DEFAULT, NULL);
+	list_create(&sp_ephem_list, sizeof (struct sockparams),
+	    offsetof(struct sockparams, sp_node));
+}
+
+/*
+ * sockparams_create(int family, int type, int protocol, char *modname,
+ * char *devpath, int devpathlen, int flags, int kmflags, int *errorp)
+ *
+ * Create a new sockparams entry.
+ *
+ * Arguments:
+ * family, type, protocol: specifies the socket type
+ * modname: Name of the module associated with the socket type. The
+ * module can be NULL if a device path is given, in which
+ * case the TPI module is used.
+ * devpath: Path to the STREAMS device. May be NULL for non-STREAMS
+ * based transports, or those transports that do not provide
+ * the capability to fallback to STREAMS.
+ * devpathlen: Length of the devpath string. The argument can be 0,
+ * indicating that devpath was allocated statically, and should
+ * not be freed when the sockparams entry is destroyed.
+ *
+ * flags : SOCKPARAMS_EPHEMERAL is the only flag that is allowed.
+ * kmflags: KM_{NO,}SLEEP
+ * errorp : Value-return argument, set when an error occurs.
+ *
+ * Returns:
+ * On success a new sockparams entry is returned, and *errorp is set
+ * to 0. On failure NULL is returned and *errorp is set to indicate the
+ * type of error that occurred.
+ *
+ * Notes:
+ * devpath and modname are freed upon failure.
+ */
+struct sockparams *
+sockparams_create(int family, int type, int protocol, char *modname,
+    char *devpath, int devpathlen, int flags, int kmflags, int *errorp)
+{
+	struct sockparams *sp = NULL;
+
+	ASSERT((flags & ~SOCKPARAMS_EPHEMERAL) == 0);
+
+	/*
+	 * Reject unknown flags, and insist on at least one of a module
+	 * name and a device path.
+	 */
+	if ((flags & ~SOCKPARAMS_EPHEMERAL) != 0 ||
+	    (modname == NULL && devpath == NULL)) {
+		*errorp = EINVAL;
+		goto error;
+	}
+
+	if ((sp = kmem_zalloc(sizeof (*sp), kmflags)) == NULL) {
+		*errorp = ENOMEM;
+		goto error;
+	}
+	sp->sp_family = family;
+	sp->sp_type = type;
+	sp->sp_protocol = protocol;
+	sp->sp_refcnt = 0;
+	sp->sp_flags = flags;
+
+	if (modname == NULL) {
+		/* Only a device was given: use the TPI socket module. */
+		size_t len = strlen(SOTPI_SMOD_NAME) + 1;
+
+		if ((modname = kmem_zalloc(len, kmflags)) == NULL) {
+			*errorp = ENOMEM;
+			goto error;
+		}
+		(void) sprintf(modname, "%s", SOTPI_SMOD_NAME);
+	}
+	sp->sp_smod_name = modname;
+
+	if (devpath != NULL) {
+		/* Set up the device entry. */
+		*errorp = sockparams_sdev_init(sp, devpath, devpathlen);
+		if (*errorp != 0)
+			goto error;
+	}
+
+	mutex_init(&sp->sp_lock, NULL, MUTEX_DEFAULT, NULL);
+	*errorp = 0;
+	return (sp);
+
+error:
+	/*
+	 * The strings are consumed on failure: the module name is always
+	 * freed, the device path only when it was dynamically allocated
+	 * (devpathlen != 0).
+	 */
+	ASSERT(*errorp != 0);
+	if (modname != NULL)
+		kmem_free(modname, strlen(modname) + 1);
+	if (devpathlen != 0)
+		kmem_free(devpath, devpathlen);
+	if (sp != NULL)
+		kmem_free(sp, sizeof (*sp));
+	return (NULL);
+}
+
+/*
+ * Initialize the STREAMS device aspect of the sockparams entry.
+ */
+static int
+sockparams_sdev_init(struct sockparams *sp, char *devpath, int devpathlen)
+{
+	vnode_t *devvp = NULL;
+	int rc;
+
+	ASSERT(devpath != NULL);
+
+	/* Resolve the device path to a vnode. */
+	rc = sogetvp(devpath, &devvp, UIO_SYSSPACE);
+	if (rc != 0) {
+		dprint(0, ("sockparams_sdev_init: vp %s failed with %d\n",
+		    devpath, rc));
+		return (rc);
+	}
+	ASSERT(devvp != NULL);
+
+	/* Record the vnode and the path (with its length) in the entry. */
+	sp->sp_sdev_info.sd_devpathlen = devpathlen;
+	sp->sp_sdev_info.sd_devpath = devpath;
+	sp->sp_sdev_info.sd_vnode = devvp;
+
+	return (0);
+}
+
+/*
+ * sockparams_destroy(struct sockparams *sp)
+ *
+ * Releases all the resources associated with the sockparams entry,
+ * and frees the sockparams entry.
+ *
+ * Arguments:
+ * sp: the sockparams entry to destroy.
+ *
+ * Returns:
+ * Nothing.
+ *
+ * Locking:
+ * The sp_lock of the entry can not be held.
+ */
+void
+sockparams_destroy(struct sockparams *sp)
+{
+	/* Must be unreferenced and already off any list. */
+	ASSERT(sp->sp_refcnt == 0);
+	ASSERT(!list_link_active(&sp->sp_node));
+
+	sockparams_sdev_fini(sp);
+
+	/* Drop the module reference before the module name is freed. */
+	if (sp->sp_smod_info != NULL) {
+		SMOD_DEC_REF(sp, sp->sp_smod_info);
+		sp->sp_smod_info = NULL;
+	}
+
+	kmem_free(sp->sp_smod_name, strlen(sp->sp_smod_name) + 1);
+	sp->sp_smod_name = NULL;
+
+	mutex_destroy(&sp->sp_lock);
+	kmem_free(sp, sizeof (*sp));
+}
+
+/*
+ * Clean up the STREAMS device part of the sockparams entry.
+ */
+static void
+sockparams_sdev_fini(struct sockparams *sp)
+{
+	sdev_info_t *sdp = &sp->sp_sdev_info;
+
+	/* Entries without a STREAMS device have nothing to release. */
+	if (!SOCKPARAMS_HAS_DEVICE(sp))
+		return;
+
+	if (sdp->sd_vnode != NULL) {
+		VN_RELE(sdp->sd_vnode);
+	}
+
+	/* A zero length marks a statically allocated path: do not free. */
+	if (sdp->sd_devpathlen != 0) {
+		kmem_free(sdp->sd_devpath, sdp->sd_devpathlen);
+	}
+
+	sdp->sd_vnode = NULL;
+	sdp->sd_devpath = NULL;
+}
+
+/*
+ * Look for a matching sockparams entry on the given list.
+ *
+ * The caller must hold the associated list lock.
+ */
+static struct sockparams *
+sockparams_find(list_t *list, int family, int type, int protocol,
+    enum sp_match_criteria crit, const char *name)
+{
+	struct sockparams *sp;
+	struct sockparams *fallback = NULL;
+
+	for (sp = list_head(list); sp != NULL; sp = list_next(list, sp)) {
+		if (sp->sp_family != family || sp->sp_type != type)
+			continue;
+
+		if (sp->sp_protocol != protocol) {
+			/*
+			 * Protocol 0 entries act as the default for a
+			 * wildcard search; the last one seen is kept.
+			 */
+			if (crit == SP_MATCH_WILDCARD && sp->sp_protocol == 0)
+				fallback = sp;
+			continue;
+		}
+
+		/* Family, type and protocol all match. */
+		switch (crit) {
+		case SP_MATCH_EXACT:
+		case SP_MATCH_WILDCARD:
+			return (sp);
+		case SP_MATCH_INC_DEV:
+			if (sp->sp_sdev_info.sd_devpath != NULL &&
+			    strcmp(sp->sp_sdev_info.sd_devpath, name) == 0)
+				return (sp);
+			break;
+		case SP_MATCH_INC_MOD:
+			if (strcmp(sp->sp_smod_name, name) == 0)
+				return (sp);
+			break;
+		default:
+			break;
+		}
+	}
+
+	/* No full match; fall back to the wildcard entry, if there was one. */
+	return (fallback);
+}
+
+/*
+ * sockparams_hold_ephemeral()
+ *
+ * Returns an ephemeral sockparams entry of the requested family, type and
+ * protocol. The entry is returned held, and the caller is responsible for
+ * dropping the reference using SOCKPARAMS_DEC_REF() once done.
+ *
+ * All ephemeral entries are on list (sp_ephem_list). If there is an
+ * entry on the list that match the search criteria, then a reference is
+ * placed on that entry. Otherwise, a new entry is created and inserted
+ * in the list. The entry is removed from the list when the last reference
+ * is dropped.
+ *
+ * The tpi flag is used to determine whether name refers to a device or
+ * module name.
+ */
+static struct sockparams *
+sockparams_hold_ephemeral(int family, int type, int protocol,
+    const char *name, boolean_t tpi, int kmflag, int *errorp)
+{
+	struct sockparams *sp;
+	struct sockparams *newsp;
+	sp_match_criteria_t crit = (tpi) ? SP_MATCH_INC_DEV : SP_MATCH_INC_MOD;
+	char *namebuf;
+	int namelen;
+
+	*errorp = 0;
+
+	/*
+	 * First look for an existing entry.
+	 */
+	rw_enter(&sp_ephem_lock, RW_READER);
+	sp = sockparams_find(&sp_ephem_list, family, type, protocol,
+	    crit, name);
+	if (sp != NULL) {
+		SOCKPARAMS_INC_REF(sp);
+		rw_exit(&sp_ephem_lock);
+		return (sp);
+	}
+	rw_exit(&sp_ephem_lock);
+
+	/*
+	 * Nothing suitable exists, so build a new entry. It gets a private
+	 * copy of the name, which sockparams_create() consumes (and frees
+	 * if it fails).
+	 */
+	namelen = strlen(name) + 1;
+	namebuf = kmem_alloc(namelen, kmflag);
+	if (namebuf == NULL) {
+		*errorp = ENOMEM;
+		return (NULL);
+	}
+	/* namelen includes the terminator, so the copy is NUL terminated. */
+	(void) strncpy(namebuf, name, namelen);
+
+	if (tpi) {
+		/* The name is a device path. */
+		newsp = sockparams_create(family, type,
+		    protocol, NULL, namebuf, namelen,
+		    SOCKPARAMS_EPHEMERAL, kmflag, errorp);
+	} else {
+		/* The name is a socket module name. */
+		newsp = sockparams_create(family, type,
+		    protocol, namebuf, NULL, 0,
+		    SOCKPARAMS_EPHEMERAL, kmflag, errorp);
+	}
+
+	if (newsp == NULL) {
+		ASSERT(*errorp != 0);
+		return (NULL);
+	}
+
+	/*
+	 * Time to load the socket module.
+	 */
+	ASSERT(newsp->sp_smod_info == NULL);
+	newsp->sp_smod_info = smod_lookup_byname(newsp->sp_smod_name);
+	if (newsp->sp_smod_info == NULL) {
+		/* Failed to load */
+		sockparams_destroy(newsp);
+		*errorp = ENXIO;
+		return (NULL);
+	}
+
+	/*
+	 * The sockparams entry was created, now try to add it to the list.
+	 * The list lock was dropped above, so repeat the search as a WRITER
+	 * in case a matching entry was inserted in the meantime.
+	 */
+	rw_enter(&sp_ephem_lock, RW_WRITER);
+	sp = sockparams_find(&sp_ephem_list, family, type, protocol,
+	    crit, name);
+	if (sp != NULL) {
+		/*
+		 * Someone has requested a matching entry, so just
+		 * place a hold on it and release the entry we alloc'ed.
+		 */
+		SOCKPARAMS_INC_REF(sp);
+		rw_exit(&sp_ephem_lock);
+
+		sockparams_destroy(newsp);
+	} else {
+		SOCKPARAMS_INC_REF(newsp);
+		list_insert_tail(&sp_ephem_list, newsp);
+		rw_exit(&sp_ephem_lock);
+
+		sp = newsp;
+	}
+	ASSERT(*errorp == 0);
+
+	return (sp);
+}
+
+/*
+ * Hold (creating if needed) the ephemeral entry for the given family, type
+ * and protocol whose device path is `dev'. See sockparams_hold_ephemeral().
+ */
+struct sockparams *
+sockparams_hold_ephemeral_bydev(int family, int type, int protocol,
+ const char *dev, int kmflag, int *errorp)
+{
+ return (sockparams_hold_ephemeral(family, type, protocol, dev, B_TRUE,
+ kmflag, errorp));
+}
+
+/*
+ * Hold (creating if needed) the ephemeral entry for the given family, type
+ * and protocol whose socket module name is `mod'. See
+ * sockparams_hold_ephemeral().
+ */
+struct sockparams *
+sockparams_hold_ephemeral_bymod(int family, int type, int protocol,
+ const char *mod, int kmflag, int *errorp)
+{
+ return (sockparams_hold_ephemeral(family, type, protocol, mod, B_FALSE,
+ kmflag, errorp));
+}
+
+/*
+ * Called when the last socket using the ephemeral entry is dropping
+ * its reference. To maintain lock order we must drop the sockparams
+ * lock before calling this function. As a result, a new reference
+ * might be placed on the entry, in which case there is nothing to
+ * do. However, if ref count goes to zero, we delete the entry.
+ */
+void
+sockparams_ephemeral_drop_last_ref(struct sockparams *sp)
+{
+	boolean_t unused;
+
+	ASSERT(sp->sp_flags & SOCKPARAMS_EPHEMERAL);
+	ASSERT(MUTEX_NOT_HELD(&sp->sp_lock));
+
+	/* Lock order: list lock first, then the entry lock. */
+	rw_enter(&sp_ephem_lock, RW_WRITER);
+	mutex_enter(&sp->sp_lock);
+
+	/*
+	 * Drop our reference. If nobody took a new one in the meantime,
+	 * unlink the entry while both locks are still held.
+	 */
+	sp->sp_refcnt--;
+	unused = (sp->sp_refcnt == 0);
+	if (unused)
+		list_remove(&sp_ephem_list, sp);
+
+	mutex_exit(&sp->sp_lock);
+	rw_exit(&sp_ephem_lock);
+
+	/* Destruction requires that sp_lock is not held. */
+	if (unused)
+		sockparams_destroy(sp);
+}
+
+/*
+ * sockparams_add(struct sockparams *sp)
+ *
+ * Tries to add the given sockparams entry to the global list.
+ *
+ * Arguments:
+ * sp: the sockparams entry to add
+ *
+ * Returns:
+ * On success 0, but if an entry already exists, then EEXIST
+ * is returned.
+ *
+ * Locking:
+ * The caller can not be holding splist_lock.
+ */
+static int
+sockparams_add(struct sockparams *sp)
+{
+	int error = 0;
+
+	ASSERT(!(sp->sp_flags & SOCKPARAMS_EPHEMERAL));
+
+	rw_enter(&splist_lock, RW_WRITER);
+	if (sockparams_find(&sphead, sp->sp_family, sp->sp_type,
+	    sp->sp_protocol, SP_MATCH_EXACT, NULL) == NULL) {
+		/* No entry for this exact tuple yet: append ours. */
+		list_insert_tail(&sphead, sp);
+	} else {
+		error = EEXIST;
+	}
+	rw_exit(&splist_lock);
+
+	return (error);
+}
+
+/*
+ * sockparams_delete(int family, int type, int protocol)
+ *
+ * Removes the sockparams entry for a specific family, type and protocol
+ * from the global list and destroys it, provided no one is holding a
+ * reference to it.
+ *
+ * Arguments:
+ * family, type, protocol: the socket type that should be removed.
+ *
+ * Returns:
+ * 0 on success, ENXIO if there is no exactly matching entry, or EBUSY
+ * if the entry is still referenced (in which case it is left in place).
+ *
+ * Locking:
+ * Caller can not be holding splist_lock or the sp_lock of
+ * any sockparams entry.
+ */
+static int
+sockparams_delete(int family, int type, int protocol)
+{
+	struct sockparams *sp;
+	boolean_t busy;
+
+	rw_enter(&splist_lock, RW_WRITER);
+	sp = sockparams_find(&sphead, family, type, protocol, SP_MATCH_EXACT,
+	    NULL);
+	if (sp == NULL) {
+		rw_exit(&splist_lock);
+		return (ENXIO);
+	}
+
+	/* The reference count is sampled under the entry's own lock. */
+	mutex_enter(&sp->sp_lock);
+	busy = (sp->sp_refcnt != 0);
+	mutex_exit(&sp->sp_lock);
+
+	if (busy) {
+		rw_exit(&splist_lock);
+		return (EBUSY);
+	}
+
+	/* Unreferenced: unlink it under the list lock, destroy it after. */
+	list_remove(&sphead, sp);
+	rw_exit(&splist_lock);
+
+	sockparams_destroy(sp);
+	return (0);
+}
+
+/*
+ * soconfig(int family, int type, int protocol,
+ * char *devpath, int devpathlen, char *module)
+ *
+ * Add or delete an entry to the sockparams table.
+ * When devpath and module both are NULL, it will delete an entry.
+ *
+ * Arguments:
+ * family, type, protocol: the tuple in question
+ * devpath: STREAMS device path. Can be NULL for module based sockets.
+ * module : Name of the socket module. Can be NULL for STREAMS
+ * based sockets.
+ * devpathlen: length of the devpath string, or 0 if devpath
+ * was statically allocated.
+ *
+ * Note:
+ * This routine assumes that the caller has kmem_alloced
+ * devpath (if devpathlen > 0) and module for this routine to
+ * consume.
+ */
+int
+soconfig(int family, int type, int protocol,
+    char *devpath, int devpathlen, char *module)
+{
+	struct sockparams *sp;
+	int error = 0;
+
+	/*
+	 * Either string may be NULL (both are for a delete request), so
+	 * substitute a printable placeholder rather than pass NULL to %s.
+	 */
+	dprint(0, ("soconfig(%d,%d,%d,%s,%d,%s)\n",
+	    family, type, protocol,
+	    devpath == NULL ? "NULL" : devpath, devpathlen,
+	    module == NULL ? "NULL" : module));
+
+	if (sockfs_defer_nl7c_init) {
+		nl7c_init();
+		sockfs_defer_nl7c_init = 0;
+	}
+
+	if (devpath == NULL && module == NULL) {
+		/*
+		 * Neither a socket module nor a STREAMS device was given:
+		 * delete the existing entry for this tuple.
+		 */
+		return (sockparams_delete(family, type, protocol));
+	}
+
+	/*
+	 * Adding an entry.
+	 * sockparams_create frees mod name and devpath upon failure, and
+	 * reports the reason through `error'.
+	 */
+	sp = sockparams_create(family, type, protocol, module,
+	    devpath, devpathlen, 0, KM_SLEEP, &error);
+	if (sp != NULL) {
+		/* An entry that cannot be added is destroyed again. */
+		error = sockparams_add(sp);
+		if (error != 0)
+			sockparams_destroy(sp);
+	}
+
+	return (error);
+}
+
+/*
+ * solookup(int family, int type, int protocol, struct sockparams **spp)
+ *
+ * Lookup an entry in the sockparams list based on the triple. The returned
+ * entry either exactly match the given tuple, or it is the 'default' entry
+ * for the given <family, type>. A default entry is one with a protocol
+ * value of zero.
+ *
+ * Arguments:
+ * family, type, protocol: tuple to search for
+ * spp: Value-return argument
+ *
+ * Returns:
+ * If an entry is found, 0 is returned and *spp is set to point to the
+ * entry. In case an entry is not found, *spp is set to NULL, and an
+ * error code is returned. The errors are (in decreasing precedence):
+ * EAFNOSUPPORT - address family not in list
+ * EPROTONOSUPPORT - address family supported but not protocol.
+ * EPROTOTYPE - address family and protocol supported but not socket type.
+ *
+ * TODO: should use ddi_modopen()/ddi_modclose()
+ */
+
+int
+solookup(int family, int type, int protocol, struct sockparams **spp)
+{
+ struct sockparams *sp = NULL;
+ int error = 0;
+
+ *spp = NULL;
+ rw_enter(&splist_lock, RW_READER);
+
+ /*
+ * Search the sockparams list for an appropriate entry.
+ * Hopefully we find an entry that matches the exact family,
+ * type and protocol specified by the user, in which case
+ * we return that entry. However, we also keep track of
+ * the default entry for a specific family and type, the
+ * entry of which would have a protocol value of 0.
+ */
+ sp = sockparams_find(&sphead, family, type, protocol, SP_MATCH_WILDCARD,
+ NULL);
+
+ if (sp == NULL) {
+ /*
+ * 0: family not in list, 1: family present,
+ * 2: family and protocol present (so only the type is wrong).
+ */
+ int found = 0;
+
+ /* Determine correct error code */
+ for (sp = list_head(&sphead); sp != NULL;
+ sp = list_next(&sphead, sp)) {
+ if (sp->sp_family == family && found < 1)
+ found = 1;
+ if (sp->sp_family == family &&
+ sp->sp_protocol == protocol && found < 2)
+ found = 2;
+ }
+ rw_exit(&splist_lock);
+
+ switch (found) {
+ case 0:
+ error = EAFNOSUPPORT;
+ break;
+ case 1:
+ error = EPROTONOSUPPORT;
+ break;
+ case 2:
+ error = EPROTOTYPE;
+ break;
+ }
+ return (error);
+ }
+
+ /*
+ * An entry was found.
+ *
+ * We put a hold on the entry early on, so if the
+ * sockmod is not loaded, and we have to exit
+ * splist_lock to call modload(), we know that the
+ * sockparams entry won't go away. That way we don't
+ * have to look up the entry once we come back from
+ * modload().
+ */
+ SOCKPARAMS_INC_REF(sp);
+ rw_exit(&splist_lock);
+
+ /*
+ * NOTE(review): sp_smod_info is tested and assigned here with no lock
+ * held. Two threads looking up the same entry at once could both call
+ * smod_lookup_byname() and both store the result; if each lookup takes
+ * a module reference (sockparams_destroy() drops exactly one), the
+ * extra reference would never be released. Confirm, and consider
+ * serializing the assignment under sp_lock.
+ */
+ if (sp->sp_smod_info == NULL) {
+ sp->sp_smod_info = smod_lookup_byname(sp->sp_smod_name);
+ if (sp->sp_smod_info == NULL) {
+ /*
+ * We put a hold on the sockparams entry
+ * earlier, hoping everything would work out.
+ * That obviously did not happen, so release
+ * the hold here.
+ */
+ SOCKPARAMS_DEC_REF(sp);
+ /*
+ * We should probably mark the sockparams as
+ * "bad", and redo the lookup skipping the
+ * "bad" entries. I.e., sp->sp_mod_state |= BAD,
+ * return (solookup(...))
+ */
+ return (ENXIO);
+ }
+ }
+
+ /*
+ * Alright, we have a valid sockparams entry. It is returned held;
+ * the hold taken above is passed on to the caller.
+ */
+ *spp = sp;
+ return (0);
+}
diff --git a/usr/src/uts/common/fs/sockfs/socksctp.c b/usr/src/uts/common/fs/sockfs/socksctp.c
deleted file mode 100644
index a5763b0b5f..0000000000
--- a/usr/src/uts/common/fs/sockfs/socksctp.c
+++ /dev/null
@@ -1,2773 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#include <sys/types.h>
-#include <sys/t_lock.h>
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/buf.h>
-#include <sys/vfs.h>
-#include <sys/vfs_opreg.h>
-#include <sys/vnode.h>
-#include <sys/debug.h>
-#include <sys/errno.h>
-#include <sys/stropts.h>
-#include <sys/cmn_err.h>
-#include <sys/sysmacros.h>
-
-#include <sys/project.h>
-#include <sys/tihdr.h>
-#include <sys/strsubr.h>
-
-#include <sys/socket.h>
-#include <sys/socketvar.h>
-#include <sys/strsun.h>
-
-#include <netinet/sctp.h>
-#include <inet/sctp_itf.h>
-#include "socksctp.h"
-
-/*
- * SCTP sockfs sonode operations, 1-1 socket
- */
-static int sosctp_accept(struct sonode *, int, struct sonode **);
-static int sosctp_listen(struct sonode *, int);
-static int sosctp_connect(struct sonode *, const struct sockaddr *, socklen_t,
- int, int);
-static int sosctp_sendmsg(struct sonode *, struct nmsghdr *, struct uio *);
-static int sosctp_getpeername(struct sonode *);
-static int sosctp_getsockname(struct sonode *);
-static int sosctp_shutdown(struct sonode *, int);
-static int sosctp_getsockopt(struct sonode *, int, int, void *, socklen_t *,
- int);
-static int sosctp_setsockopt(struct sonode *, int, int, const void *,
- socklen_t);
-
-/*
- * SCTP sockfs sonode operations, 1-N socket
- */
-static int sosctp_seq_connect(struct sonode *, const struct sockaddr *,
- socklen_t, int, int);
-static int sosctp_seq_sendmsg(struct sonode *, struct nmsghdr *, struct uio *);
-
-/*
- * Socket upcalls, 1-1 socket connection
- */
-static void *sctp_sock_newconn(void *parenthandle, void *connind);
-static void sctp_sock_connected(void *handle);
-static int sctp_sock_disconnected(void *handle, int error);
-static void sctp_sock_disconnecting(void *handle);
-static int sctp_sock_recv(void *handle, mblk_t *mp, int flags);
-static void sctp_sock_xmitted(void *handle, int txqueued);
-static void sctp_sock_properties(void *handle, int wroff, size_t maxblk);
-
-/*
- * Socket association upcalls, 1-N socket connection
- */
-static void *sctp_assoc_newconn(void *parenthandle, void *connind);
-static void sctp_assoc_connected(void *handle);
-static int sctp_assoc_disconnected(void *handle, int error);
-static void sctp_assoc_disconnecting(void *handle);
-static int sctp_assoc_recv(void *handle, mblk_t *mp, int flags);
-static void sctp_assoc_xmitted(void *handle, int txqueued);
-static void sctp_assoc_properties(void *handle, int wroff, size_t maxblk);
-
-/*
- * kmem caches created by sosctp_init(): sosctp_sockcache holds
- * struct sctp_sonode objects and sosctp_assoccache holds
- * struct sctp_soassoc objects.
- */
-static kmem_cache_t *sosctp_sockcache;
-kmem_cache_t *sosctp_assoccache;
-
-/*
- * sonode operations for 1-1 (SOCK_STREAM) SCTP sockets; installed as
- * so_ops by sosctp_makevp().
- */
-sonodeops_t sosctp_sonodeops = {
- sosctp_accept, /* sop_accept */
- sosctp_bind, /* sop_bind */
- sosctp_listen, /* sop_listen */
- sosctp_connect, /* sop_connect */
- sosctp_recvmsg, /* sop_recvmsg */
- sosctp_sendmsg, /* sop_sendmsg */
- sosctp_getpeername, /* sop_getpeername */
- sosctp_getsockname, /* sop_getsockname */
- sosctp_shutdown, /* sop_shutdown */
- sosctp_getsockopt, /* sop_getsockopt */
- sosctp_setsockopt /* sop_setsockopt */
-};
-
-/*
- * sonode operations for 1-N (SOCK_SEQPACKET) SCTP sockets. Identical to
- * sosctp_sonodeops except for the connect and sendmsg entries.
- */
-sonodeops_t sosctp_seq_sonodeops = {
- sosctp_accept, /* sop_accept */
- sosctp_bind, /* sop_bind */
- sosctp_listen, /* sop_listen */
- sosctp_seq_connect, /* sop_connect */
- sosctp_recvmsg, /* sop_recvmsg */
- sosctp_seq_sendmsg, /* sop_sendmsg */
- sosctp_getpeername, /* sop_getpeername */
- sosctp_getsockname, /* sop_getsockname */
- sosctp_shutdown, /* sop_shutdown */
- sosctp_getsockopt, /* sop_getsockopt */
- sosctp_setsockopt /* sop_setsockopt */
-};
-
-/*
- * Upcall table built from the sctp_sock_* handlers (socket upcalls for a
- * 1-1 socket connection).
- */
-sctp_upcalls_t sosctp_sock_upcalls = {
- sctp_sock_newconn,
- sctp_sock_connected,
- sctp_sock_disconnected,
- sctp_sock_disconnecting,
- sctp_sock_recv,
- sctp_sock_xmitted,
- sctp_sock_properties
-};
-
-/*
- * Upcall table built from the sctp_assoc_* handlers (association upcalls
- * for a 1-N socket connection).
- */
-sctp_upcalls_t sosctp_assoc_upcalls = {
- sctp_assoc_newconn,
- sctp_assoc_connected,
- sctp_assoc_disconnected,
- sctp_assoc_disconnecting,
- sctp_assoc_recv,
- sctp_assoc_xmitted,
- sctp_assoc_properties
-};
-
-/*
- * kmem cache constructor for struct sctp_sonode buffers.
- *
- * Resets the pointer fields of the embedded sonode, allocates the vnode
- * that backs the socket and links the two together, then initializes the
- * mutexes and condition variables of the sonode and of the sctp_sonode.
- *
- * Returns 0 on success and -1 if vn_alloc() fails.
- */
-/*ARGSUSED*/
-static int
-sosctp_sock_constructor(void *buf, void *cdrarg, int kmflags)
-{
-	struct sctp_sonode *ssp = buf;
-	struct sonode *sop = &ssp->ss_so;
-	struct vnode *vnp;
-
-	ssp->ss_type = SOSCTP_SOCKET;
-
-	/* Start out with the sonode pointer fields cleared. */
-	sop->so_oobmsg = NULL;
-	sop->so_ack_mp = NULL;
-	sop->so_conn_ind_head = sop->so_conn_ind_tail = NULL;
-	sop->so_discon_ind_mp = NULL;
-	sop->so_ux_bound_vp = NULL;
-	sop->so_unbind_mp = NULL;
-	sop->so_ops = NULL;
-	sop->so_accessvp = NULL;
-	sop->so_priv = NULL;
-
-	sop->so_nl7c_flags = 0;
-	sop->so_nl7c_uri = NULL;
-	sop->so_nl7c_rcv_mp = NULL;
-
-	sop->so_direct = NULL;
-
-	/* The vnode is allocated with the caller's kmflags. */
-	if ((vnp = vn_alloc(kmflags)) == NULL)
-		return (-1);
-
-	/* Cross-link sonode and vnode. */
-	sop->so_vnode = vnp;
-	vnp->v_data = (caddr_t)sop;
-	vn_setops(vnp, socksctp_vnodeops);
-
-	/* Empty receive list: the tail pointer refers back to the head. */
-	ssp->ss_rxdata = NULL;
-	ssp->ss_rxtail = &ssp->ss_rxdata;
-
-	/* Generic sonode synchronization objects. */
-	mutex_init(&sop->so_lock, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&sop->so_plumb_lock, NULL, MUTEX_DEFAULT, NULL);
-	cv_init(&sop->so_state_cv, NULL, CV_DEFAULT, NULL);
-	cv_init(&sop->so_ack_cv, NULL, CV_DEFAULT, NULL);
-	cv_init(&sop->so_connind_cv, NULL, CV_DEFAULT, NULL);
-	cv_init(&sop->so_want_cv, NULL, CV_DEFAULT, NULL);
-
-	/* SCTP-specific condition variables. */
-	cv_init(&ssp->ss_txdata_cv, NULL, CV_DEFAULT, NULL);
-	cv_init(&ssp->ss_rxdata_cv, NULL, CV_DEFAULT, NULL);
-
-	return (0);
-}
-
-/*
- * kmem cache destructor for struct sctp_sonode buffers; the counterpart
- * of sosctp_sock_constructor().
- *
- * Asserts that the sonode is back in its constructed state (pointer
- * fields cleared, so_ops either NULL or one of the two SCTP ops vectors,
- * receive list empty, vnode still wired to this sonode), then frees the
- * vnode and destroys the mutexes and condition variables.
- */
-/*ARGSUSED*/
-static void
-sosctp_sock_destructor(void *buf, void *cdrarg)
-{
- struct sctp_sonode *ss = buf;
- struct sonode *so = &ss->ss_so;
- struct vnode *vp = SOTOV(so);
-
- ASSERT(so->so_direct == NULL);
-
- ASSERT(so->so_nl7c_flags == 0);
- ASSERT(so->so_nl7c_uri == NULL);
- ASSERT(so->so_nl7c_rcv_mp == NULL);
-
- ASSERT(so->so_oobmsg == NULL);
- ASSERT(so->so_ack_mp == NULL);
- ASSERT(so->so_conn_ind_head == NULL);
- ASSERT(so->so_conn_ind_tail == NULL);
- ASSERT(so->so_discon_ind_mp == NULL);
- ASSERT(so->so_ux_bound_vp == NULL);
- ASSERT(so->so_unbind_mp == NULL);
- /* so_ops is NULL until sosctp_makevp() installs an ops vector. */
- ASSERT(so->so_ops == NULL ||
- so->so_ops == &sosctp_sonodeops ||
- so->so_ops == &sosctp_seq_sonodeops);
-
- ASSERT(ss->ss_rxdata == NULL);
-
- ASSERT(vn_matchops(vp, socksctp_vnodeops));
- ASSERT(vp->v_data == (caddr_t)so);
-
- vn_free(vp);
-
- /* Tear down everything the constructor initialized. */
- mutex_destroy(&so->so_lock);
- mutex_destroy(&so->so_plumb_lock);
- cv_destroy(&so->so_state_cv);
- cv_destroy(&so->so_ack_cv);
- cv_destroy(&so->so_connind_cv);
- cv_destroy(&so->so_want_cv);
- cv_destroy(&ss->ss_txdata_cv);
- cv_destroy(&ss->ss_rxdata_cv);
-}
-
-/*
- * Module initialization for SCTP sockets.
- *
- * Builds the "socksctp" vnode operations vector from its template and,
- * if that succeeds, creates the kmem caches for sctp_sonode and
- * sctp_soassoc objects.
- *
- * Returns 0 on success, or the error from vn_make_ops() after logging a
- * warning in the global zone.
- */
-int
-sosctp_init(void)
-{
-	int rv;
-
-	rv = vn_make_ops("socksctp", socksctp_vnodeops_template,
-	    &socksctp_vnodeops);
-	if (rv == 0) {
-		/* Socket cache: objects get the constructor/destructor pair. */
-		sosctp_sockcache = kmem_cache_create("sctpsock",
-		    sizeof (struct sctp_sonode), 0, sosctp_sock_constructor,
-		    sosctp_sock_destructor, NULL, NULL, NULL, 0);
-
-		/* Association cache: plain buffers, no constructor. */
-		sosctp_assoccache = kmem_cache_create("sctp_assoc",
-		    sizeof (struct sctp_soassoc), 0, NULL, NULL, NULL, NULL,
-		    NULL, 0);
-
-		return (0);
-	}
-
-	zcmn_err(GLOBAL_ZONEID, CE_WARN,
-	    "sosctp_init: bad vnode ops template");
-	return (rv);
-}
-
-/*
- * Allocate an sctp_sonode from sosctp_sockcache, initialize it for the
- * given domain/type/protocol and return the vnode that fronts it.
- *
- * accessvp must be non-NULL (its v_rdev is read here); kmflags is passed
- * to the cache allocation and to sosctp_aid_grow(). SOCK_STREAM sockets
- * get the 1-1 ops vector; anything else must be SOCK_SEQPACKET and gets
- * the 1-N ops vector.
- *
- * Returns NULL only when the cache allocation fails.
- */
-static struct vnode *
-sosctp_makevp(struct vnode *accessvp, int domain, int type, int protocol,
-    int kmflags)
-{
-	struct sctp_sonode *ss;
-	struct sonode *so;
-	struct vnode *vp;
-	time_t now;
-
-	ss = kmem_cache_alloc(sosctp_sockcache, kmflags);
-	if (ss == NULL) {
-		return (NULL);
-	}
-	so = &ss->ss_so;
-	/* Remember which cache/object to hand back when the sonode dies. */
-	so->so_cache = sosctp_sockcache;
-	so->so_obj = ss;
-	vp = SOTOV(so);
-	now = gethrestime_sec();
-
-	so->so_flag = 0;
-	so->so_accessvp = accessvp;
-	so->so_dev = accessvp->v_rdev;
-
-	so->so_state = 0;
-	so->so_mode = 0;
-
-	so->so_fsid = sockdev;
-	so->so_atime = now;
-	so->so_mtime = now;
-	so->so_ctime = now;
-	so->so_count = 0;
-
-	so->so_family = domain;
-	so->so_type = type;
-	so->so_protocol = protocol;
-	so->so_pushcnt = 0;
-
-	so->so_options = 0;
-	so->so_linger.l_onoff = 0;
-	so->so_linger.l_linger = 0;
-	so->so_sndbuf = 0;
-	so->so_rcvbuf = 0;
-	so->so_error = 0;
-	so->so_delayed_error = 0;
-
-	ASSERT(so->so_oobmsg == NULL);
-	so->so_oobcnt = 0;
-	so->so_oobsigcnt = 0;
-	so->so_pgrp = 0;
-	so->so_provinfo = NULL;
-
-	/*
-	 * The address buffers are embedded in the sctp_sonode, so the
-	 * sonode just points at them. Both maximum lengths are taken from
-	 * the size of ss_laddr.
-	 */
-	so->so_laddr_sa = (struct sockaddr *)&ss->ss_laddr;
-	so->so_faddr_sa = (struct sockaddr *)&ss->ss_faddr;
-	so->so_laddr_maxlen = so->so_faddr_maxlen = sizeof (ss->ss_laddr);
-	so->so_laddr_len = so->so_faddr_len = 0;
-	so->so_eaddr_mp = NULL;
-
-	so->so_peercred = NULL;
-
-	/* These were cleared by the cache constructor. */
-	ASSERT(so->so_ack_mp == NULL);
-	ASSERT(so->so_conn_ind_head == NULL);
-	ASSERT(so->so_conn_ind_tail == NULL);
-	ASSERT(so->so_ux_bound_vp == NULL);
-	ASSERT(so->so_unbind_mp == NULL);
-
-	vn_reinit(vp);
-	vp->v_vfsp = rootvfs;
-	vp->v_type = VSOCK;
-	vp->v_rdev = so->so_dev;
-
-	ss->ss_maxassoc = 0;
-	ss->ss_assoccnt = 0;
-	ss->ss_assocs = NULL;
-
-	if (type == SOCK_STREAM) {
-		so->so_ops = &sosctp_sonodeops;
-	} else {
-		ASSERT(type == SOCK_SEQPACKET);
-		so->so_ops = &sosctp_seq_sonodeops;
-		/*
-		 * NOTE(review): the result of sosctp_aid_grow() is
-		 * deliberately discarded here; presumably the association
-		 * table is grown again on demand -- confirm.
-		 */
-		mutex_enter(&so->so_lock);
-		(void) sosctp_aid_grow(ss, 1, kmflags);
-		mutex_exit(&so->so_lock);
-	}
-	ss->ss_rxqueued = 0;
-	ss->ss_txqueued = 0;
-	ss->ss_wroff = 0;
-	ss->ss_wrsize = strmsgsz;
-	bzero(&ss->ss_poll_list, sizeof (ss->ss_poll_list));
-
-	vn_exists(vp);
-	return (vp);
-}
-
-/*
- * Creates a sctp socket data structure.
- * tso is non-NULL if it's passive open.
- *
- * SOV_STREAM is rejected with EINVAL. Requests that are not
- * AF_INET/AF_INET6 with SOCK_STREAM/SOCK_SEQPACKET are handed to
- * sotpi_create(). An active open (tso == NULL) allocates with KM_SLEEP;
- * a passive open allocates with KM_NOSLEEP and may fail with ENOMEM.
- *
- * Returns the new sonode, or NULL with *errorp set on failure.
- */
-struct sonode *
-sosctp_create(vnode_t *accessvp, int domain, int type, int protocol,
- int version, struct sonode *tso, int *errorp)
-{
- struct sonode *so;
- vnode_t *vp;
- int error;
- int soflags;
- cred_t *cr;
-
- if (version == SOV_STREAM) {
- *errorp = EINVAL;
- return (NULL);
- }
- ASSERT(accessvp != NULL);
-
- /*
- * We only support two types of SCTP socket. Let sotpi_create()
- * handle all other cases, such as raw socket.
- */
- if (!(domain == AF_INET || domain == AF_INET6) ||
- !(type == SOCK_STREAM || type == SOCK_SEQPACKET)) {
- return (sotpi_create(accessvp, domain, type, protocol, version,
- NULL, errorp));
- }
-
- if (tso == NULL) {
- /* Active open: the allocation may sleep. */
- vp = sosctp_makevp(accessvp, domain, type, protocol, KM_SLEEP);
- ASSERT(vp != NULL);
-
- soflags = FREAD | FWRITE;
- } else {
- /* Passive open: must not sleep, and is flagged as acceptor. */
- vp = sosctp_makevp(accessvp, domain, type, protocol,
- KM_NOSLEEP);
- if (vp == NULL) {
- /*
- * sosctp_makevp() only fails when there is no memory.
- */
- *errorp = ENOMEM;
- return (NULL);
- }
- soflags = FREAD | FWRITE | SO_ACCEPTOR;
- }
- /*
- * This function may be called in interrupt context, and CRED()
- * will be NULL. In this case, pass in kcred to VOP_OPEN().
- */
- if ((cr = CRED()) == NULL)
- cr = kcred;
- if ((error = VOP_OPEN(&vp, soflags, cr, NULL)) != 0) {
- /* Drop the vnode hold; report the open error to the caller. */
- VN_RELE(vp);
- *errorp = error;
- return (NULL);
- }
- so = VTOSO(vp);
-
- dprint(2, ("sosctp_create: %p domain %d type %d\n",
- (void *)so, domain, type));
-
- if (version == SOV_DEFAULT) {
- version = so_default_version;
- }
- so->so_version = (short)version;
-
- return (so);
-}
-
-/*
- * Free SCTP socket data structure.
- * Closes incoming connections which were never accepted, frees
- * resources.
- *
- * Under so_lock this detaches the embedded address buffers from the
- * sonode, frees every message still on the receive list, closes and
- * releases each sonode queued as a connection indication, and frees the
- * association id table. The sonode itself is then passed to sockfree().
- */
-void
-sosctp_free(struct sonode *so)
-{
-	struct sctp_sonode *ss = SOTOSSO(so);
-	struct sonode *nso;
-	mblk_t *mp;
-
-	mutex_enter(&so->so_lock);
-
-	/*
-	 * Need to clear these out so that sockfree() doesn't think that
-	 * there's memory in need of free'ing.
-	 */
-	so->so_laddr_sa = so->so_faddr_sa = NULL;
-	so->so_laddr_len = so->so_laddr_maxlen = 0;
-	so->so_faddr_len = so->so_faddr_maxlen = 0;
-
-	/* Drain the receive list; messages are linked through b_next. */
-	while ((mp = ss->ss_rxdata) != NULL) {
-		ss->ss_rxdata = mp->b_next;
-		mp->b_next = NULL;
-		freemsg(mp);
-	}
-	ss->ss_rxtail = &ss->ss_rxdata;
-
-	/*
-	 * Each connection indication carries a pointer to the sonode of a
-	 * connection that was never accepted. so_lock is dropped while that
-	 * socket is closed and released.
-	 */
-	while ((mp = so->so_conn_ind_head) != NULL) {
-		so->so_conn_ind_head = mp->b_next;
-		mutex_exit(&so->so_lock);
-		mp->b_next = NULL;
-		nso = *(struct sonode **)mp->b_rptr;
-
-		(void) VOP_CLOSE(SOTOV(nso), 0, 1, 0, CRED(), NULL);
-		vn_invalid(SOTOV(nso));
-		VN_RELE(SOTOV(nso));
-
-		freeb(mp);
-		mutex_enter(&so->so_lock);
-	}
-	so->so_conn_ind_tail = NULL;
-	so->so_state &= ~SS_HASCONNIND;
-
-	if (ss->ss_assocs != NULL) {
-		ASSERT(ss->ss_assoccnt == 0);
-		kmem_free(ss->ss_assocs,
-		    ss->ss_maxassoc * sizeof (struct sctp_sa_id));
-	}
-	mutex_exit(&so->so_lock);
-
-	sockfree(so);
-}
-
-/*
- * Accept incoming connection.
- *
- * lso must be a listening SOCK_STREAM socket: EINVAL is returned if
- * SS_ACCEPTCONN is not set, EOPNOTSUPP if the socket is not SOCK_STREAM.
- * The connection indication obtained from sowaitconnind() carries a
- * pointer to the new sonode, which is returned through *nsop once its
- * peer address has been fetched. Returns 0 on success or an errno.
- */
-static int
-sosctp_accept(struct sonode *lso, int fflag, struct sonode **nsop)
-{
- int error = 0;
- mblk_t *mp;
- struct sonode *nso;
-
- if (!(lso->so_state & SS_ACCEPTCONN)) {
- /*
- * Not a listen socket.
- */
- eprintsoline(lso, EINVAL);
- return (EINVAL);
- }
- if (lso->so_type != SOCK_STREAM) {
- /*
- * Cannot accept() connections from SOCK_SEQPACKET type
- * socket.
- */
- eprintsoline(lso, EOPNOTSUPP);
- return (EOPNOTSUPP);
- }
-
- /*
- * Returns right away if socket is nonblocking.
- */
- error = sowaitconnind(lso, fflag, &mp);
- if (error != 0) {
- eprintsoline(lso, error);
- return (error);
- }
- /* The indication's payload is the pointer to the new sonode. */
- nso = *(struct sonode **)mp->b_rptr;
- freeb(mp);
-
- /* One fewer pending item accounted against the listener. */
- mutex_enter(&lso->so_lock);
- ASSERT(SOTOSSO(lso)->ss_rxqueued > 0);
- --SOTOSSO(lso)->ss_rxqueued;
- mutex_exit(&lso->so_lock);
-
- /*
- * accept() needs remote address right away.
- * since sosctp_getpeername() is called with
- * socket lock released, the connection may
- * get aborted before we return from the
- * routine. So, we need to handle aborted
- * socket connection here.
- */
- error = sosctp_getpeername(nso);
- if (error != 0) {
- vnode_t *nvp;
- nvp = SOTOV(nso);
- (void) VOP_CLOSE(nvp, 0, 1, 0, CRED(), NULL);
- VN_RELE(nvp);
-
- /*
- * We can't return ENOTCONN to accept. accept
- * either returns connected socket in case no error
- * has occurred or the connection which is getting
- * accepted is being aborted. This is the reason we
- * return ECONNABORTED in case sosctp_getpeername()
- * returns ENOTCONN.
- */
- return ((error == ENOTCONN) ? ECONNABORTED : error);
- }
-
- dprint(2, ("sosctp_accept: new %p\n", (void *)nso));
-
- *nsop = nso;
- return (0);
-}
-
-/*
- * Bind local endpoint.
- *
- * If _SOBIND_LOCK_HELD is set in flags the caller already holds so_lock
- * with SOLOCKED set, and both are still held on return; otherwise they
- * are taken and released here. so_lock is dropped around the call to
- * sctp_bind(). Returns EINVAL if the socket is already bound, if no
- * address is supplied, or if SS_CANTSENDMORE is set; otherwise the result
- * of sctp_bind().
- */
-int
-sosctp_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
- int flags)
-{
- int error = 0;
-
- if (!(flags & _SOBIND_LOCK_HELD)) {
- mutex_enter(&so->so_lock);
- so_lock_single(so); /* Set SOLOCKED */
- /* LINTED - statement has no conseq */
- } else {
- ASSERT(MUTEX_HELD(&so->so_lock));
- ASSERT(so->so_flag & SOLOCKED);
- }
-
- if ((so->so_state & SS_ISBOUND) || name == NULL || namelen == 0) {
- /*
- * Multiple binds not allowed for any SCTP socket.
- * Also binding with null address is not supported.
- */
- error = EINVAL;
- eprintsoline(so, error);
- goto done;
- }
- /*
- * X/Open requires this check
- */
- if (so->so_state & SS_CANTSENDMORE) {
- error = EINVAL;
- goto done;
- }
-
- /*
- * Protocol module does address family checks.
- */
- mutex_exit(&so->so_lock);
-
- error = sctp_bind(so->so_priv, name, namelen);
-
- mutex_enter(&so->so_lock);
- if (error == 0) {
- so->so_state |= SS_ISBOUND;
- /* LINTED - statement has no conseq */
- } else {
- eprintsoline(so, error);
- }
-done:
- if (!(flags & _SOBIND_LOCK_HELD)) {
- so_unlock_single(so, SOLOCKED);
- mutex_exit(&so->so_lock);
- /* LINTED - statement has no conseq */
- } else {
- /* If the caller held the lock don't release it here */
- ASSERT(MUTEX_HELD(&so->so_lock));
- ASSERT(so->so_flag & SOLOCKED);
- }
- return (error);
-}
-
-/*
- * Turn socket into a listen socket.
- *
- * Returns EINVAL if the socket is connecting, connected, disconnecting
- * or has been shut down in either direction. A negative backlog is
- * treated as 0. If the socket is already listening only so_backlog is
- * updated; otherwise sctp_listen() is called with so_lock dropped and,
- * on success, the socket is marked SS_ACCEPTCONN and SS_ISBOUND.
- */
-static int
-sosctp_listen(struct sonode *so, int backlog)
-{
- int error = 0;
-
- mutex_enter(&so->so_lock);
- so_lock_single(so);
-
- /*
- * If this socket is trying to do connect, or if it has
- * been connected, disallow.
- */
- if (so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED |
- SS_ISDISCONNECTING | SS_CANTRCVMORE | SS_CANTSENDMORE)) {
- error = EINVAL;
- eprintsoline(so, error);
- goto done;
- }
-
- if (backlog < 0) {
- backlog = 0;
- }
-
- /*
- * If listen() is only called to change backlog, we don't
- * need to notify protocol module.
- */
- if (so->so_state & SS_ACCEPTCONN) {
- so->so_backlog = backlog;
- goto done;
- }
-
- /* SOLOCKED stays set while so_lock is dropped for the protocol call. */
- mutex_exit(&so->so_lock);
-
- error = sctp_listen(so->so_priv);
-
- mutex_enter(&so->so_lock);
- if (error == 0) {
- so->so_state |= (SS_ACCEPTCONN|SS_ISBOUND);
- so->so_backlog = backlog;
- /* LINTED - statement has no conseq */
- } else {
- eprintsoline(so, error);
- }
-done:
- so_unlock_single(so, SOLOCKED);
- mutex_exit(&so->so_lock);
-
- return (error);
-}
-
-/*
- * Active open.
- *
- * Only valid for SOCK_STREAM sockets. Fails with EISCONN, EALREADY or
- * EOPNOTSUPP if the socket is connected, connecting or listening; with a
- * pending so_error if one is set; and with EINVAL if the socket is
- * disconnecting/shut down or no address is given. Otherwise the socket
- * is marked connecting, sctp_connect() is called with so_lock dropped and
- * sosctp_waitconnected() is used to wait according to fflag.
- *
- * On 0, EINPROGRESS, EALREADY and EINTR the socket is marked bound; on
- * any other error SS_ISCONNECTING is cleared. With _SOCONNECT_XPG4_2 in
- * flags, EHOSTUNREACH is reported as ENETUNREACH.
- */
-static int
-sosctp_connect(struct sonode *so, const struct sockaddr *name,
- socklen_t namelen, int fflag, int flags)
-{
- int error;
-
- ASSERT(so->so_type == SOCK_STREAM);
-
- mutex_enter(&so->so_lock);
- so_lock_single(so);
-
- /*
- * Can't connect() after listen(), or if the socket is already
- * connected.
- */
- if (so->so_state & (SS_ACCEPTCONN|SS_ISCONNECTED|SS_ISCONNECTING)) {
- if (so->so_state & SS_ISCONNECTED) {
- error = EISCONN;
- } else if (so->so_state & SS_ISCONNECTING) {
- error = EALREADY;
- } else {
- error = EOPNOTSUPP;
- }
- eprintsoline(so, error);
- goto done;
- }
-
- /*
- * Check for failure of an earlier call
- */
- if (so->so_error != 0) {
- error = sogeterr(so);
- eprintsoline(so, error);
- goto done;
- }
-
- /*
- * Connection is closing, or closed, don't allow reconnect.
- * TCP allows this to proceed, but the socket remains unwriteable.
- * BSD returns EINVAL.
- */
- if (so->so_state & (SS_ISDISCONNECTING|SS_CANTRCVMORE|
- SS_CANTSENDMORE)) {
- error = EINVAL;
- eprintsoline(so, error);
- goto done;
- }
- if (name == NULL || namelen == 0) {
- error = EINVAL;
- eprintsoline(so, error);
- goto done;
- }
- soisconnecting(so);
-
- /* Drop so_lock (SOLOCKED stays set) for the call into SCTP. */
- mutex_exit(&so->so_lock);
-
- error = sctp_connect(so->so_priv, name, namelen);
-
- mutex_enter(&so->so_lock);
- if (error == 0) {
- /*
- * Allow other threads to access the socket
- */
- error = sosctp_waitconnected(so, fflag);
- }
- switch (error) {
- case 0:
- case EINPROGRESS:
- case EALREADY:
- case EINTR:
- /* Non-fatal errors */
- so->so_state |= SS_ISBOUND;
- break;
- case EHOSTUNREACH:
- if (flags & _SOCONNECT_XPG4_2) {
- /*
- * X/Open specification contains a requirement that
- * ENETUNREACH be returned but does not require
- * EHOSTUNREACH. In order to keep the test suite
- * happy we mess with the errno here.
- */
- error = ENETUNREACH;
- }
- /* FALLTHRU */
-
- default:
- /* clear SS_ISCONNECTING in case it was set */
- so->so_state &= ~SS_ISCONNECTING;
- break;
- }
-done:
- so_unlock_single(so, SOLOCKED);
- mutex_exit(&so->so_lock);
- return (error);
-}
-
-/*
- * Active open for 1-N sockets, create a new association and
- * call connect on that.
- * If the parent hasn't been bound yet (this is the first association),
- * make it so.
- *
- * Returns EINVAL if no address is given, otherwise the result of
- * sosctp_assoc_createconn(); with _SOCONNECT_XPG4_2 in flags,
- * EHOSTUNREACH is reported as ENETUNREACH. Any association returned by
- * sosctp_assoc_createconn() has its reference released before returning.
- */
-static int
-sosctp_seq_connect(struct sonode *so, const struct sockaddr *name,
- socklen_t namelen, int fflag, int flags)
-{
- struct sctp_soassoc *ssa;
- struct sctp_sonode *ss;
- int error;
-
- ASSERT(so->so_type == SOCK_SEQPACKET);
-
- mutex_enter(&so->so_lock);
- so_lock_single(so);
-
- if (name == NULL || namelen == 0) {
- error = EINVAL;
- eprintsoline(so, error);
- goto done;
- }
-
- ss = SOTOSSO(so);
-
- /*
- * NOTE(review): ssa is read below without being initialized here, so
- * this relies on sosctp_assoc_createconn() always storing through its
- * last argument, even on failure -- confirm.
- */
- error = sosctp_assoc_createconn(ss, name, namelen, NULL, 0, fflag,
- &ssa);
- if (error != 0) {
- if ((error == EHOSTUNREACH) && (flags & _SOCONNECT_XPG4_2)) {
- error = ENETUNREACH;
- }
- }
- if (ssa != NULL) {
- SSA_REFRELE(ss, ssa);
- }
-
-done:
- so_unlock_single(so, SOLOCKED);
- mutex_exit(&so->so_lock);
- return (error);
-}
-
-/*
- * Receive data.
- *
- * Delivers the message at the head of the socket's receive list into
- * uiop, filling in msg_name (kmem_alloc'ed here), msg_control (via
- * sosctp_pack_cmsg()) and msg_flags (MSG_CTRUNC, MSG_NOTIFICATION,
- * MSG_EOR) from the T_unitdata_ind header that precedes the data.
- *
- * Returns ENOTCONN if the socket cannot receive anything, EOPNOTSUPP for
- * MSG_OOB or for a 1-N socket when msg_namelen is 0, EWOULDBLOCK when no
- * data is queued on a non-blocking call, EINTR when interrupted while
- * waiting, or the error from uiomove().
- *
- * When the caller's buffer is smaller than the message, the part that
- * was copied out is trimmed from the queued message so the next read
- * continues where this one stopped. With MSG_PEEK the queued message is
- * never modified, neither on a complete nor on a partial read.
- */
-int
-sosctp_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
-{
-	struct sctp_sonode *ss = SOTOSSO(so);
-	struct sctp_soassoc *ssa = NULL;
-	int flags, error = 0;
-	struct T_unitdata_ind *tind;
-	int len, count, readcnt = 0, rxqueued;
-	boolean_t consumed = B_FALSE;
-	void *opt;
-	mblk_t *mp, *mdata;
-
-	flags = msg->msg_flags;
-	msg->msg_flags = 0;
-
-	if (so->so_type == SOCK_STREAM) {
-		if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|
-		    SS_CANTRCVMORE))) {
-			return (ENOTCONN);
-		}
-	} else {
-		/* For 1-N socket, recv() cannot be used. */
-		if (msg->msg_namelen == 0)
-			return (EOPNOTSUPP);
-		/*
-		 * If there are no associations, and no new connections are
-		 * coming in, there's not going to be new messages coming
-		 * in either.
-		 */
-		if (ss->ss_rxdata == NULL && ss->ss_assoccnt == 0 &&
-		    !(so->so_state & SS_ACCEPTCONN)) {
-			return (ENOTCONN);
-		}
-	}
-
-	/*
-	 * out-of-band data not supported.
-	 */
-	if (flags & MSG_OOB) {
-		return (EOPNOTSUPP);
-	}
-
-	/*
-	 * flag possibilities:
-	 *
-	 * MSG_PEEK	Don't consume data
-	 * MSG_WAITALL	Wait for full quantity of data (ignored if MSG_PEEK)
-	 * MSG_DONTWAIT	Non-blocking (same as FNDELAY | FNONBLOCK)
-	 *
-	 * MSG_WAITALL can return less than the full buffer if either
-	 *
-	 * 1. we would block and we are non-blocking
-	 * 2. a full message cannot be delivered
-	 *
-	 * Given that we always get a full message from proto below,
-	 * MSG_WAITALL is not meaningful.
-	 */
-
-	mutex_enter(&so->so_lock);
-
-	/*
-	 * Allow just one reader at a time.
-	 */
-	error = so_lock_read_intr(so,
-	    uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
-	if (error) {
-		mutex_exit(&so->so_lock);
-		return (error);
-	}
-again:
-	mp = ss->ss_rxdata;
-	if (mp != NULL) {
-		/*
-		 * For 1-N sockets the association pointer is stored at the
-		 * base of the header block's data buffer.
-		 */
-		if (so->so_type == SOCK_SEQPACKET) {
-			ssa = *(struct sctp_soassoc **)DB_BASE(mp);
-		}
-		mutex_exit(&so->so_lock);
-
-		tind = (struct T_unitdata_ind *)mp->b_rptr;
-
-		len = tind->SRC_length;
-
-		if (msg->msg_namelen > 0 && len > 0) {
-
-			opt = sogetoff(mp, tind->SRC_offset, len, 1);
-
-			ASSERT(opt != NULL);
-
-			msg->msg_name = kmem_alloc(len, KM_SLEEP);
-			msg->msg_namelen = len;
-
-			bcopy(opt, msg->msg_name, len);
-		} else {
-			msg->msg_namelen = 0;
-		}
-
-		len = tind->OPT_length;
-		if (msg->msg_controllen == 0) {
-			if (len > 0) {
-				msg->msg_flags |= MSG_CTRUNC;
-			}
-		} else if (len > 0) {
-			opt = sogetoff(mp, tind->OPT_offset, len,
-			    __TPI_ALIGN_SIZE);
-
-			ASSERT(opt != NULL);
-			sosctp_pack_cmsg(opt, msg, len);
-		} else {
-			msg->msg_controllen = 0;
-		}
-
-		if (mp->b_flag & SCTP_NOTIFICATION) {
-			msg->msg_flags |= MSG_NOTIFICATION;
-		}
-
-		mdata = mp->b_cont;
-		while (mdata != NULL) {
-			len = MBLKL(mdata);
-			count = MIN(uiop->uio_resid, len);
-
-			error = uiomove(mdata->b_rptr, count, UIO_READ, uiop);
-			/*
-			 * We will re-read this message the next time.
-			 */
-			if (error != 0) {
-				if (msg->msg_namelen > 0) {
-					kmem_free(msg->msg_name,
-					    msg->msg_namelen);
-				}
-				if (msg->msg_controllen > 0) {
-					kmem_free(msg->msg_control,
-					    msg->msg_controllen);
-				}
-				mutex_enter(&so->so_lock);
-				so_unlock_read(so);
-				mutex_exit(&so->so_lock);
-				return (error);
-			}
-			if (!(flags & MSG_PEEK))
-				readcnt += count;
-			if (uiop->uio_resid == 0) {
-				mblk_t *mp1 = ss->ss_rxdata;
-				mblk_t *mp2 = mp1->b_cont;
-#ifdef DEBUG
-				int rcnt = readcnt;
-#endif
-
-				/* Finished with this message? */
-				if (count == len && mdata->b_cont == NULL)
-					break;
-				/*
-				 * A peek must leave the queued message
-				 * untouched. readcnt is 0 in that case, so
-				 * trimming would also drop data without
-				 * adjusting the rxqueued accounting.
-				 */
-				if (flags & MSG_PEEK) {
-					mutex_enter(&so->so_lock);
-					goto done;
-				}
-				/*
-				 * Remove the bits that have been read, the
-				 * next read will start from where we left
-				 * off.
-				 */
-				while (mp1->b_cont != mdata) {
-#ifdef DEBUG
-					ASSERT(rcnt > MBLKL(mp1->b_cont));
-					rcnt -= MBLKL(mp1->b_cont);
-#endif
-					mp1 = mp1->b_cont;
-				}
-#ifdef DEBUG
-				ASSERT(rcnt == count);
-#endif
-				if (len > count)
-					mp1->b_cont->b_rptr += count;
-				else
-					mp1 = mp1->b_cont;
-				mutex_enter(&so->so_lock);
-				/* Unlink and free the fully-read data blocks. */
-				if (mp2 != mp1->b_cont) {
-					ss->ss_rxdata->b_cont = mp1->b_cont;
-					mp1->b_cont = NULL;
-					freemsg(mp2);
-				}
-				goto done;
-			}
-			mdata = mdata->b_cont;
-		}
-		if (!(mp->b_flag & SCTP_PARTIAL_DATA))
-			msg->msg_flags |= MSG_EOR;
-		/*
-		 * Consume this message
-		 */
-consume:
-		mutex_enter(&so->so_lock);
-		if (!(flags & MSG_PEEK)) {
-			ss->ss_rxdata = mp->b_next;
-			if (ss->ss_rxtail == &mp->b_next) {
-				ss->ss_rxtail = &ss->ss_rxdata;
-			}
-			mp->b_next = NULL;
-			freemsg(mp);
-			consumed = B_TRUE;
-		}
-	} else {
-		/*
-		 * No pending data. Return right away for nonblocking
-		 * socket, otherwise sleep waiting for data.
-		 */
-		if (!(so->so_state & SS_CANTRCVMORE)) {
-			if ((uiop->uio_fmode & (FNDELAY|FNONBLOCK)) ||
-			    (flags & MSG_DONTWAIT)) {
-				error = EWOULDBLOCK;
-			} else {
-				if (!cv_wait_sig(&ss->ss_rxdata_cv,
-				    &so->so_lock)) {
-					error = EINTR;
-				} else {
-					goto again;
-				}
-			}
-		} else {
-			msg->msg_controllen = 0;
-			msg->msg_namelen = 0;
-		}
-	}
-done:
-	/*
-	 * Determine if we need to update SCTP about the buffer
-	 * space. For performance reason, we cannot update SCTP
-	 * every time a message is read. The socket buffer low
-	 * watermark is used as the threshold.
-	 */
-	if (ssa == NULL) {
-		rxqueued = ss->ss_rxqueued;
-
-		ss->ss_rxqueued = rxqueued - readcnt;
-		count = so->so_rcvbuf - ss->ss_rxqueued;
-
-		ASSERT(ss->ss_rxdata != NULL || ss->ss_rxqueued == 0);
-
-		so_unlock_read(so);
-		mutex_exit(&so->so_lock);
-
-		if (readcnt > 0 && (((count > 0) &&
-		    (rxqueued >= so->so_rcvlowat)) ||
-		    (ss->ss_rxqueued == 0))) {
-			/*
-			 * If amount of queued data is higher than watermark,
-			 * update SCTP's idea of available buffer space.
-			 */
-			sctp_recvd(so->so_priv, count);
-		}
-	} else {
-		rxqueued = ssa->ssa_rxqueued;
-
-		ssa->ssa_rxqueued = rxqueued - readcnt;
-		count = so->so_rcvbuf - ssa->ssa_rxqueued;
-
-		so_unlock_read(so);
-
-		if (readcnt > 0 &&
-		    (((count > 0) && (rxqueued >= so->so_rcvlowat)) ||
-		    (ssa->ssa_rxqueued == 0))) {
-			/*
-			 * If amount of queued data is higher than watermark,
-			 * update SCTP's idea of available buffer space.
-			 */
-			mutex_exit(&so->so_lock);
-
-			sctp_recvd(ssa->ssa_conn, count);
-
-			mutex_enter(&so->so_lock);
-		}
-		/*
-		 * The association reference is dropped only once the
-		 * message has been removed from the receive list.
-		 */
-		if (consumed) {
-			SSA_REFRELE(ss, ssa);
-		}
-		mutex_exit(&so->so_lock);
-	}
-
-	return (error);
-}
-
-/*
- * Copy count bytes from uiop into a chain of newly allocated mblks that
- * is linked through b_cont behind hdr_mp.
- *
- * Each mblk carries at most blk_size bytes of data, preceded by wroff
- * bytes of headroom, and is allocated with allocb_cred() using cr.
- * When allocation fails, non-blocking callers (FNDELAY/FNONBLOCK in
- * uio_fmode, or MSG_DONTWAIT in flags) get EAGAIN; others wait in
- * strwaitbuf() and retry.
- *
- * Returns 0 on success or an errno. On failure the mblks already linked
- * behind hdr_mp are left in place for the caller to free.
- */
-int
-sosctp_uiomove(mblk_t *hdr_mp, ssize_t count, ssize_t blk_size, int wroff,
- struct uio *uiop, int flags, cred_t *cr)
-{
- ssize_t size;
- int error;
- mblk_t *mp;
- dblk_t *dp;
-
- /*
- * Loop until we have all data copied into mblk's.
- */
- while (count > 0) {
- size = MIN(count, blk_size);
-
- /*
- * As a message can be split up and sent in different
- * packets, each mblk will have the extra space before
- * data to accommodate what SCTP wants to put in there.
- */
- while ((mp = allocb_cred(size + wroff, cr)) == NULL) {
- if ((uiop->uio_fmode & (FNDELAY|FNONBLOCK)) ||
- (flags & MSG_DONTWAIT)) {
- return (EAGAIN);
- }
- if ((error = strwaitbuf(size + wroff, BPRI_MED))) {
- return (error);
- }
- }
-
- dp = mp->b_datap;
- dp->db_cpid = curproc->p_pid;
- ASSERT(wroff <= dp->db_lim - mp->b_wptr);
- /* Skip the headroom; the data starts wroff bytes into the block. */
- mp->b_rptr += wroff;
- error = uiomove(mp->b_rptr, size, UIO_WRITE, uiop);
- if (error != 0) {
- /* This block is not linked in yet, so free it here. */
- freeb(mp);
- return (error);
- }
- mp->b_wptr = mp->b_rptr + size;
- count -= size;
- /* Append to the chain and make the new block the tail. */
- hdr_mp->b_cont = mp;
- hdr_mp = mp;
- }
- return (0);
-}
-
-/*
- * Send message.
- *
- * 1-1 (SOCK_STREAM) sockets only. MSG_OOB is rejected with EOPNOTSUPP.
- * An SCTP_SNDRCV control message is validated first; if it carries
- * MSG_EOF (and no data) a graceful shutdown is started and its result
- * returned.
- *
- * Otherwise the caller waits until ss_txqueued drops below so_sndbuf
- * (EAGAIN when non-blocking, EINTR on a signal, EPIPE plus SIGPIPE once
- * SS_CANTSENDMORE is set), the message length is reserved in ss_txqueued,
- * the socket is connected implicitly from msg_name if it is neither
- * connecting nor connected, and the data is copied in and handed to
- * sctp_sendmsg(). Messages larger than so_sndbuf fail with EMSGSIZE.
- *
- * Every error path taken after the reservation gives the reserved space
- * back and wakes up other senders.
- */
-static int
-sosctp_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
-{
- struct sctp_sonode *ss = SOTOSSO(so);
- mblk_t *mctl;
- struct cmsghdr *cmsg;
- struct sctp_sndrcvinfo *sinfo;
- int optlen, flags, fflag;
- ssize_t count, msglen;
- int error;
-
- ASSERT(so->so_type == SOCK_STREAM);
-
- flags = msg->msg_flags;
- if (flags & MSG_OOB) {
- /*
- * No out-of-band data support.
- */
- return (EOPNOTSUPP);
- }
-
- if (msg->msg_controllen != 0) {
- optlen = msg->msg_controllen;
- cmsg = sosctp_find_cmsg(msg->msg_control, optlen, SCTP_SNDRCV);
- if (cmsg != NULL) {
- /* The cmsg must be large enough to hold a sctp_sndrcvinfo. */
- if (cmsg->cmsg_len <
- (sizeof (*sinfo) + sizeof (*cmsg))) {
- eprintsoline(so, EINVAL);
- return (EINVAL);
- }
- sinfo = (struct sctp_sndrcvinfo *)(cmsg + 1);
-
- /* Both flags should not be set together. */
- if ((sinfo->sinfo_flags & MSG_EOF) &&
- (sinfo->sinfo_flags & MSG_ABORT)) {
- eprintsoline(so, EINVAL);
- return (EINVAL);
- }
-
- /* Initiate a graceful shutdown. */
- if (sinfo->sinfo_flags & MSG_EOF) {
- /* Can't include data in MSG_EOF message. */
- if (uiop->uio_resid != 0) {
- eprintsoline(so, EINVAL);
- return (EINVAL);
- }
-
- /*
- * This is the same sequence as done in
- * shutdown(SHUT_WR).
- */
- mutex_enter(&so->so_lock);
- so_lock_single(so);
- socantsendmore(so);
- cv_broadcast(&ss->ss_txdata_cv);
- so->so_state |= SS_ISDISCONNECTING;
- mutex_exit(&so->so_lock);
-
- pollwakeup(&ss->ss_poll_list, POLLOUT);
- sctp_recvd(so->so_priv, so->so_rcvbuf);
- error = sctp_disconnect(so->so_priv);
-
- mutex_enter(&so->so_lock);
- so_unlock_single(so, SOLOCKED);
- mutex_exit(&so->so_lock);
- return (error);
- }
- }
- } else {
- optlen = 0;
- }
-
- mutex_enter(&so->so_lock);
- for (;;) {
- if (so->so_state & SS_CANTSENDMORE) {
- mutex_exit(&so->so_lock);
- tsignal(curthread, SIGPIPE);
- return (EPIPE);
- }
-
- if (so->so_error != 0) {
- error = sogeterr(so);
- mutex_exit(&so->so_lock);
- return (error);
- }
-
- if (ss->ss_txqueued < so->so_sndbuf)
- break;
-
- /*
- * Xmit window full: fail with EAGAIN on a non-blocking
- * socket, otherwise wait for space.
- */
- if ((uiop->uio_fmode & (FNDELAY|FNONBLOCK)) ||
- (flags & MSG_DONTWAIT)) {
- mutex_exit(&so->so_lock);
- return (EAGAIN);
- } else {
- /*
- * Wait for space to become available and try again.
- */
- error = cv_wait_sig(&ss->ss_txdata_cv, &so->so_lock);
- if (!error) { /* signal */
- mutex_exit(&so->so_lock);
- return (EINTR);
- }
- }
- }
- msglen = count = uiop->uio_resid;
-
- /* Don't allow sending a message larger than the send buffer size. */
- if (msglen > so->so_sndbuf) {
- mutex_exit(&so->so_lock);
- return (EMSGSIZE);
- }
-
- /*
- * Update TX buffer usage here so that we can lift the socket lock.
- */
- ss->ss_txqueued += msglen;
-
- /*
- * Allow piggybacking data on handshake messages (SS_ISCONNECTING).
- */
- if (!(so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED))) {
- /*
- * We need to check here for listener so that the
- * same error will be returned as with a TCP socket.
- * In this case, sosctp_connect() returns EOPNOTSUPP
- * while a TCP socket returns ENOTCONN instead. Catch it
- * here to have the same behavior as a TCP socket.
- *
- * We also need to make sure that the peer address is
- * provided before we attempt to do the connect.
- */
- if ((so->so_state & SS_ACCEPTCONN) ||
- msg->msg_name == NULL) {
- mutex_exit(&so->so_lock);
- error = ENOTCONN;
- goto error_nofree;
- }
- mutex_exit(&so->so_lock);
- fflag = uiop->uio_fmode;
- if (flags & MSG_DONTWAIT) {
- fflag |= FNDELAY;
- }
- /* The last argument is _SOCONNECT_XPG4_2 for XPG4.2 sockets, else 0. */
- error = sosctp_connect(so, msg->msg_name, msg->msg_namelen,
- fflag, (so->so_version == SOV_XPG4_2) * _SOCONNECT_XPG4_2);
- if (error) {
- /*
- * Check for non-fatal errors, socket connected
- * while the lock had been lifted.
- */
- if (error != EISCONN && error != EALREADY) {
- goto error_nofree;
- }
- error = 0;
- }
- } else {
- mutex_exit(&so->so_lock);
- }
-
- mctl = sctp_alloc_hdr(msg->msg_name, msg->msg_namelen,
- msg->msg_control, optlen, SCTP_CAN_BLOCK);
- if (mctl == NULL) {
- error = EINTR;
- goto error_nofree;
- }
-
- /* Copy in the message. */
- if ((error = sosctp_uiomove(mctl, count, ss->ss_wrsize, ss->ss_wroff,
- uiop, flags, CRED())) != 0) {
- goto error_ret;
- }
- error = sctp_sendmsg(so->so_priv, mctl, 0);
- if (error == 0)
- return (0);
-
-error_ret:
- freemsg(mctl);
-error_nofree:
- /* Give back the space reserved above and wake up waiting senders. */
- mutex_enter(&so->so_lock);
- ss->ss_txqueued -= msglen;
- cv_broadcast(&ss->ss_txdata_cv);
- if ((error == EPIPE) && (so->so_state & SS_CANTSENDMORE)) {
- /*
- * We received shutdown between the time lock was
- * lifted and call to sctp_sendmsg().
- */
- mutex_exit(&so->so_lock);
- tsignal(curthread, SIGPIPE);
- return (EPIPE);
- }
- mutex_exit(&so->so_lock);
- return (error);
-}
-
-/*
- * Send a message on a 1-N (SOCK_SEQPACKET) socket.
- *
- * The association is selected by the sinfo_assoc_id carried in an optional
- * SCTP_SNDRCV control message. When no id is given (aid == 0) a new
- * association is created and connected to msg_name; otherwise the existing
- * association is looked up. The association reference obtained either way
- * is dropped with SSA_REFRELE before returning.
- *
- * MSG_EOF in sinfo_flags starts a graceful shutdown of the association
- * instead of sending data; it may not be combined with payload or with an
- * implicit connect.
- *
- * Returns 0 on success, otherwise an errno value. When the association can
- * no longer send, EPIPE is returned and SIGPIPE is posted to the current
- * thread.
- */
-static int
-sosctp_seq_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
-{
-	struct sctp_sonode *ss;
-	struct sctp_soassoc *ssa;
-	struct cmsghdr *cmsg;
-	struct sctp_sndrcvinfo *sinfo;
-	int aid = 0;
-	mblk_t *mctl;
-	int namelen, optlen, flags;
-	ssize_t count, msglen;
-	int error;
-	uint16_t s_flags = 0;
-
-	ASSERT(so->so_type == SOCK_SEQPACKET);
-
-	/*
-	 * There shouldn't be problems with alignment, as the memory for
-	 * msg_control was alloced with kmem_alloc.
-	 */
-	cmsg = sosctp_find_cmsg(msg->msg_control, msg->msg_controllen,
-	    SCTP_SNDRCV);
-	if (cmsg != NULL) {
-		/* The cmsg must be large enough to hold a full sndrcvinfo. */
-		if (cmsg->cmsg_len < (sizeof (*sinfo) + sizeof (*cmsg))) {
-			eprintsoline(so, EINVAL);
-			return (EINVAL);
-		}
-		sinfo = (struct sctp_sndrcvinfo *)(cmsg + 1);
-		s_flags = sinfo->sinfo_flags;
-		aid = sinfo->sinfo_assoc_id;
-	}
-
-	ss = SOTOSSO(so);
-	namelen = msg->msg_namelen;
-
-	if (msg->msg_controllen > 0) {
-		optlen = msg->msg_controllen;
-	} else {
-		optlen = 0;
-	}
-
-	mutex_enter(&so->so_lock);
-
-	/*
-	 * If there is no association id, connect to address specified
-	 * in msg_name. Otherwise look up the association using the id.
-	 */
-	if (aid == 0) {
-		/*
-		 * Connect and shutdown cannot be done together, so check for
-		 * MSG_EOF.
-		 */
-		if (msg->msg_name == NULL || namelen == 0 ||
-		    (s_flags & MSG_EOF)) {
-			error = EINVAL;
-			eprintsoline(so, error);
-			goto done;
-		}
-		/* Here flags temporarily holds the file mode for the connect. */
-		flags = uiop->uio_fmode;
-		if (msg->msg_flags & MSG_DONTWAIT) {
-			flags |= FNDELAY;
-		}
-		so_lock_single(so);
-		error = sosctp_assoc_createconn(ss, msg->msg_name, namelen,
-		    msg->msg_control, optlen, flags, &ssa);
-		if (error) {
-			if ((so->so_version == SOV_XPG4_2) &&
-			    (error == EHOSTUNREACH)) {
-				error = ENETUNREACH;
-			}
-			if (ssa == NULL) {
-				/*
-				 * Fatal error during connect(). Bail out.
-				 * If ssa exists, it means that the handshake
-				 * is in progress.
-				 */
-				eprintsoline(so, error);
-				so_unlock_single(so, SOLOCKED);
-				goto done;
-			}
-			/*
-			 * All the errors are non-fatal ones, don't return
-			 * e.g. EINPROGRESS from sendmsg().
-			 */
-			error = 0;
-		}
-		so_unlock_single(so, SOLOCKED);
-	} else {
-		if ((error = sosctp_assoc(ss, aid, &ssa)) != 0) {
-			eprintsoline(so, error);
-			goto done;
-		}
-	}
-
-	/*
-	 * Now we have an association.
-	 */
-	flags = msg->msg_flags;
-
-	/*
-	 * MSG_EOF initiates graceful shutdown.
-	 */
-	if (s_flags & MSG_EOF) {
-		if (uiop->uio_resid) {
-			/*
-			 * Can't include data in MSG_EOF message.
-			 */
-			error = EINVAL;
-		} else {
-			/*
-			 * Update the association state while so_lock is
-			 * still held, like every other ssa_state access in
-			 * this function, and only then drop the lock for
-			 * the calls into SCTP.
-			 */
-			ssa->ssa_state |= SS_ISDISCONNECTING;
-			mutex_exit(&so->so_lock);
-			sctp_recvd(ssa->ssa_conn, so->so_rcvbuf);
-			error = sctp_disconnect(ssa->ssa_conn);
-			mutex_enter(&so->so_lock);
-		}
-		goto refrele;
-	}
-
-	/* Wait until the association has room in its transmit queue. */
-	for (;;) {
-		if (ssa->ssa_state & SS_CANTSENDMORE) {
-			SSA_REFRELE(ss, ssa);
-			mutex_exit(&so->so_lock);
-			tsignal(curthread, SIGPIPE);
-			return (EPIPE);
-		}
-
-		/* Report a pending association error once, then clear it. */
-		if (ssa->ssa_error != 0) {
-			error = ssa->ssa_error;
-			ssa->ssa_error = 0;
-			goto refrele;
-		}
-
-		if (ssa->ssa_txqueued < so->so_sndbuf)
-			break;
-
-		if ((uiop->uio_fmode & (FNDELAY|FNONBLOCK)) ||
-		    (flags & MSG_DONTWAIT)) {
-			error = EAGAIN;
-			goto refrele;
-		} else {
-			/*
-			 * Wait for space to become available and try again.
-			 */
-			error = cv_wait_sig(&ss->ss_txdata_cv, &so->so_lock);
-			if (!error) { /* signal */
-				error = EINTR;
-				goto refrele;
-			}
-		}
-	}
-
-	msglen = count = uiop->uio_resid;
-
-	/* Don't allow sending a message larger than the send buffer size. */
-	if (msglen > so->so_sndbuf) {
-		error = EMSGSIZE;
-		goto refrele;
-	}
-
-	/*
-	 * Update TX buffer usage here so that we can lift the socket lock.
-	 */
-	ssa->ssa_txqueued += msglen;
-
-	mutex_exit(&so->so_lock);
-
-	mctl = sctp_alloc_hdr(msg->msg_name, namelen, msg->msg_control,
-	    optlen, SCTP_CAN_BLOCK);
-	if (mctl == NULL) {
-		error = EINTR;
-		goto lock_rele;
-	}
-
-	/* Copy in the message. */
-	if ((error = sosctp_uiomove(mctl, count, ssa->ssa_wrsize,
-	    ssa->ssa_wroff, uiop, flags, CRED())) != 0) {
-		goto lock_rele;
-	}
-	error = sctp_sendmsg(ssa->ssa_conn, mctl, 0);
-lock_rele:
-	mutex_enter(&so->so_lock);
-	if (error != 0) {
-		/* Undo the accounting done before the lock was dropped. */
-		freemsg(mctl);
-		ssa->ssa_txqueued -= msglen;
-		cv_broadcast(&ss->ss_txdata_cv);
-		if ((error == EPIPE) && (ssa->ssa_state & SS_CANTSENDMORE)) {
-			/*
-			 * We received shutdown between the time lock was
-			 * lifted and call to sctp_sendmsg().
-			 */
-			SSA_REFRELE(ss, ssa);
-			mutex_exit(&so->so_lock);
-			tsignal(curthread, SIGPIPE);
-			return (EPIPE);
-		}
-	}
-
-refrele:
-	SSA_REFRELE(ss, ssa);
-done:
-	mutex_exit(&so->so_lock);
-	return (error);
-}
-
-/*
- * Fetch the peer address of a connected socket into so_faddr_sa and
- * so_faddr_len.
- *
- * Returns EOPNOTSUPP for anything but SOCK_STREAM, ENOTCONN when the
- * socket is not connected, otherwise the result of sctp_getpeername().
- */
-static int
-sosctp_getpeername(struct sonode *so)
-{
-	/* SEQPACKET can have multiple end-points, so there is no one peer. */
-	if (so->so_type != SOCK_STREAM)
-		return (EOPNOTSUPP);
-
-	if ((so->so_state & SS_ISCONNECTED) == 0)
-		return (ENOTCONN);
-
-	return (sctp_getpeername(so->so_priv, so->so_faddr_sa,
-	    &so->so_faddr_len));
-}
-
-/*
- * Fetch the local address of the socket into so_laddr_sa and so_laddr_len.
- *
- * For an unbound socket the address is zeroed except for the address
- * family, and 0 is returned. For a bound socket the result of
- * sctp_getsockname() is returned.
- */
-static int
-sosctp_getsockname(struct sonode *so)
-{
-	mutex_enter(&so->so_lock);
-
-	if (so->so_state & SS_ISBOUND) {
-		/* Bound: ask SCTP, without holding the socket lock. */
-		mutex_exit(&so->so_lock);
-		return (sctp_getsockname(so->so_priv, so->so_laddr_sa,
-		    &so->so_laddr_len));
-	}
-
-	/* Not bound: hand back an all-zero address of the right family. */
-	bzero(so->so_laddr_sa, so->so_laddr_maxlen);
-	if (so->so_family == AF_INET6) {
-		so->so_laddr_len = sizeof (struct sockaddr_in6);
-	} else {
-		so->so_laddr_len = sizeof (struct sockaddr_in);
-	}
-	so->so_laddr_sa->sa_family = so->so_family;
-
-	mutex_exit(&so->so_lock);
-	return (0);
-}
-
-/*
- * Called from shutdown().
- *
- * Only supported on SOCK_STREAM sockets (EOPNOTSUPP otherwise). The socket
- * must be connected (ENOTCONN) and `how' must be one of SHUT_RD, SHUT_WR or
- * SHUT_RDWR (EINVAL). Waiters and pollers are woken for every direction
- * that was newly closed, and closing the send side also disconnects the
- * SCTP connection.
- */
-static int
-sosctp_shutdown(struct sonode *so, int how)
-{
-	struct sctp_sonode *ss = SOTOSSO(so);
-	uint_t before, changed;
-	short events = 0;
-	int error = 0;
-
-	if (so->so_type == SOCK_SEQPACKET)
-		return (EOPNOTSUPP);
-
-	mutex_enter(&so->so_lock);
-	so_lock_single(so);
-
-	/*
-	 * SunOS 4.X has no check for datagram sockets.
-	 * 5.X checks that it is connected (ENOTCONN)
-	 * X/Open requires that we check the connected state.
-	 */
-	if ((so->so_state & SS_ISCONNECTED) == 0) {
-		error = ENOTCONN;
-		goto out;
-	}
-
-	if (how != SHUT_RD && how != SHUT_WR && how != SHUT_RDWR) {
-		error = EINVAL;
-		goto out;
-	}
-
-	/*
-	 * Remember the state before the change; the bits that are newly
-	 * set afterwards tell us which wakeups are needed.
-	 */
-	before = so->so_state;
-	if (how == SHUT_WR || how == SHUT_RDWR)
-		socantsendmore(so);
-	if (how == SHUT_RD || how == SHUT_RDWR)
-		socantrcvmore(so);
-	changed = so->so_state & ~before;
-
-	if (changed & SS_CANTRCVMORE) {
-		if (ss->ss_rxdata == NULL)
-			cv_signal(&ss->ss_rxdata_cv);
-		events = POLLIN|POLLRDNORM;
-
-		sosctp_sendsig(ss, SCTPSIG_READ);
-	}
-	if (changed & SS_CANTSENDMORE) {
-		cv_broadcast(&ss->ss_txdata_cv);
-		events |= POLLOUT;
-
-		so->so_state |= SS_ISDISCONNECTING;
-	}
-	mutex_exit(&so->so_lock);
-
-	/* Poll wakeup and the calls into SCTP run without so_lock held. */
-	pollwakeup(&ss->ss_poll_list, events);
-
-	if (changed & SS_CANTSENDMORE) {
-		sctp_recvd(so->so_priv, so->so_rcvbuf);
-		error = sctp_disconnect(so->so_priv);
-	}
-	mutex_enter(&so->so_lock);
-out:
-	so_unlock_single(so, SOLOCKED);
-	mutex_exit(&so->so_lock);
-
-	/*
-	 * HACK: sctp_disconnect() may return EWOULDBLOCK. But this error is
-	 * not documented in standard socket API. Catch it here.
-	 */
-	return ((error == EWOULDBLOCK) ? 0 : error);
-}
-
-/*
- * Get socket options.
- *
- * On entry *optlenp is the size of the caller's buffer (optval); on
- * success it is updated to the number of bytes copied out, which is never
- * more than the size passed in.
- *
- * SOL_SOCKET options recorded in the sonode are answered directly from it.
- * IPPROTO_SCTP level options are rejected with EINVAL. Everything else is
- * passed to sctp_get_opt(); if that fails and a locally recorded value
- * exists (SO_LINGER), the local value is returned instead of the error.
- *
- * Returns 0 on success or an errno value.
- */
-/*ARGSUSED5*/
-static int
-sosctp_getsockopt(struct sonode *so, int level, int option_name,
- void *optval, socklen_t *optlenp, int flags)
-{
- int error = 0;
- /* Locally known value to copy out, or NULL if there is none */
- void *option = NULL;
- socklen_t maxlen = *optlenp;
- socklen_t len;
- socklen_t optlen;
- uint32_t value;
- /* Small on-stack buffer; larger requests use kmem_alloc() below */
- uint8_t buffer[4];
- void *optbuf = &buffer;
-
- mutex_enter(&so->so_lock);
-
- if (level == SOL_SOCKET) {
- /* First pass: reject unsupported options and short buffers */
- switch (option_name) {
- /* Not supported options */
- case SO_SNDTIMEO:
- case SO_RCVTIMEO:
- case SO_EXCLBIND:
- error = ENOPROTOOPT;
- eprintsoline(so, error);
- goto done;
-
- case SO_TYPE:
- case SO_ERROR:
- case SO_DEBUG:
- case SO_ACCEPTCONN:
- case SO_REUSEADDR:
- case SO_KEEPALIVE:
- case SO_DONTROUTE:
- case SO_BROADCAST:
- case SO_USELOOPBACK:
- case SO_OOBINLINE:
- case SO_SNDBUF:
- case SO_RCVBUF:
- case SO_SNDLOWAT:
- case SO_RCVLOWAT:
- case SO_DGRAM_ERRIND:
- case SO_PROTOTYPE:
- case SO_DOMAIN:
- if (maxlen < (t_uscalar_t)sizeof (int32_t)) {
- error = EINVAL;
- eprintsoline(so, error);
- goto done;
- }
- break;
- case SO_LINGER:
- if (maxlen < (t_uscalar_t)sizeof (struct linger)) {
- error = EINVAL;
- eprintsoline(so, error);
- goto done;
- }
- break;
- }
- len = (t_uscalar_t)sizeof (uint32_t); /* Default */
- option = &value;
-
- /*
- * Most of the SOL_SOCKET level option values are also
- * recorded in sockfs. So we can return the recorded value
- * here without calling into SCTP.
- */
- switch (option_name) {
- case SO_TYPE:
- value = so->so_type;
- goto copyout;
-
- case SO_ERROR:
- value = sogeterr(so);
- goto copyout;
-
- case SO_ACCEPTCONN:
- value = (so->so_state & SS_ACCEPTCONN) ?
- SO_ACCEPTCONN : 0;
- goto copyout;
-
- case SO_DEBUG:
- case SO_REUSEADDR:
- case SO_KEEPALIVE:
- case SO_DONTROUTE:
- case SO_BROADCAST:
- case SO_USELOOPBACK:
- case SO_OOBINLINE:
- case SO_DGRAM_ERRIND:
- value = (so->so_options & option_name);
- goto copyout;
-
- case SO_SNDBUF:
- value = so->so_sndbuf;
- goto copyout;
-
- case SO_RCVBUF:
- value = so->so_rcvbuf;
- goto copyout;
-
- case SO_SNDLOWAT:
- value = so->so_sndlowat;
- goto copyout;
-
- case SO_RCVLOWAT:
- value = so->so_rcvlowat;
- goto copyout;
-
- case SO_PROTOTYPE:
- value = IPPROTO_SCTP;
- goto copyout;
-
- case SO_DOMAIN:
- value = so->so_family;
- goto copyout;
-
- case SO_LINGER:
- /* Kept as the fallback if sctp_get_opt() fails below */
- option = &so->so_linger;
- len = (t_uscalar_t)sizeof (struct linger);
- break;
-
- default:
- option = NULL;
- break;
- }
- }
- if (level == IPPROTO_SCTP) {
- /*
- * Should go through ioctl().
- */
- error = EINVAL;
- goto done;
- }
- if (maxlen > sizeof (buffer)) {
- optbuf = kmem_alloc(maxlen, KM_SLEEP);
- }
- optlen = maxlen;
- mutex_exit(&so->so_lock);
- /*
- * If the resulting optlen is greater than the provided maxlen, then
- * we silently truncate.
- */
- error = sctp_get_opt(so->so_priv, level, option_name, optbuf, &optlen);
- mutex_enter(&so->so_lock);
- if (error != 0) {
- if (option == NULL) {
- /* We have no fallback value */
- eprintsoline(so, error);
- goto free;
- }
- /* Fall back to the value recorded in the sonode */
- error = 0;
- goto copyout;
- }
-
- option = optbuf;
- len = optlen;
-
-copyout:
- /* Never copy more than the caller's buffer can hold */
- len = MIN(len, maxlen);
- bcopy(option, optval, len);
- *optlenp = len;
-
-free:
- /* Only free optbuf if it was allocated, not the on-stack buffer */
- if (optbuf != &buffer) {
- kmem_free(optbuf, maxlen);
- }
-done:
- mutex_exit(&so->so_lock);
- return (error);
-}
-
-/*
- * Set socket options.
- *
- * The option is first handed to SCTP with sctp_set_opt(): on a SOCK_STREAM
- * socket for the socket's own connection; on a SOCK_SEQPACKET socket for
- * either one association (selected by the association id embedded in
- * optval for certain IPPROTO_SCTP options), the socket's own connection,
- * or the socket plus every existing association.
- *
- * For SOL_SOCKET options the value is then also recorded in the sonode.
- * If the option is one sockfs knows about, an ENOPROTOOPT, EPROTO or
- * EINVAL failure from the transport is ignored.
- *
- * Returns 0 on success or an errno value.
- */
-static int
-sosctp_setsockopt(struct sonode *so, int level, int option_name,
-    const void *optval, t_uscalar_t optlen)
-{
-	struct sctp_sonode *ss = SOTOSSO(so);
-	struct sctp_soassoc *ssa = NULL;
-	/*
-	 * Initialized here because the SOCK_STREAM path never assigns it,
-	 * yet it is passed to dprint() below.
-	 */
-	sctp_assoc_t id = 0;
-	int error, rc;
-	void *conn = NULL;
-
-	/* X/Open requires this check */
-	if (so->so_state & SS_CANTSENDMORE) {
-		return (EINVAL);
-	}
-	if ((option_name == SCTP_UC_SWAP) && (level == IPPROTO_SCTP)) {
-		error = EOPNOTSUPP;
-		eprintsoline(so, error);
-		return (error);
-	}
-
-	/* Caller allocates aligned optval, or passes null */
-	ASSERT(((uintptr_t)optval & (sizeof (t_scalar_t) - 1)) == 0);
-
-	/* No SCTP options should be zero-length */
-	if (optlen == 0) {
-		error = EINVAL;
-		eprintsoline(so, error);
-		return (error);
-	}
-
-	mutex_enter(&so->so_lock);
-	so_lock_single(so);
-
-	/*
-	 * For some SCTP level options, one can select the association this
-	 * applies to.
-	 */
-	if (so->so_type == SOCK_STREAM) {
-		conn = so->so_priv;
-	} else {
-		/*
-		 * SOCK_SEQPACKET only
-		 */
-		id = 0;
-		if (level == IPPROTO_SCTP) {
-			switch (option_name) {
-			case SCTP_RTOINFO:
-			case SCTP_ASSOCINFO:
-			case SCTP_SET_PEER_PRIMARY_ADDR:
-			case SCTP_PRIMARY_ADDR:
-			case SCTP_PEER_ADDR_PARAMS:
-				/*
-				 * Association ID is the first element
-				 * params struct
-				 */
-				if (optlen < sizeof (sctp_assoc_t)) {
-					error = EINVAL;
-					eprintsoline(so, error);
-					goto done;
-				}
-				id = *(sctp_assoc_t *)optval;
-				break;
-			case SCTP_DEFAULT_SEND_PARAM:
-				if (optlen != sizeof (struct sctp_sndrcvinfo)) {
-					error = EINVAL;
-					eprintsoline(so, error);
-					goto done;
-				}
-				id = ((struct sctp_sndrcvinfo *)
-				    optval)->sinfo_assoc_id;
-				break;
-			case SCTP_INITMSG:
-				/*
-				 * Only applies to future associations
-				 */
-				conn = so->so_priv;
-				break;
-			default:
-				break;
-			}
-		} else if (level == SOL_SOCKET) {
-			if (option_name == SO_LINGER) {
-				error = EOPNOTSUPP;
-				eprintsoline(so, error);
-				goto done;
-			}
-			/*
-			 * These 2 options are applied to all associations.
-			 * The other socket level options are only applied
-			 * to the socket (not associations).
-			 */
-			if ((option_name != SO_RCVBUF) &&
-			    (option_name != SO_SNDBUF)) {
-				conn = so->so_priv;
-			}
-		} else {
-			conn = NULL;
-		}
-
-		/*
-		 * If association ID was specified, do op on that assoc.
-		 * Otherwise set the default setting of a socket.
-		 */
-		if (id != 0) {
-			if ((error = sosctp_assoc(ss, id, &ssa)) != 0) {
-				eprintsoline(so, error);
-				goto done;
-			}
-			conn = ssa->ssa_conn;
-		}
-	}
-	dprint(2, ("sosctp_setsockopt %p (%d) - conn %p %d %d id:%d\n",
-	    (void *)ss, so->so_type, conn, level, option_name, id));
-
-	ASSERT(ssa == NULL || (ssa != NULL && conn != NULL));
-	if (conn != NULL) {
-		/* Single target; so_lock is dropped around the SCTP call. */
-		mutex_exit(&so->so_lock);
-		error = sctp_set_opt(conn, level, option_name, optval, optlen);
-		mutex_enter(&so->so_lock);
-		if (ssa != NULL)
-			SSA_REFRELE(ss, ssa);
-	} else {
-		/*
-		 * 1-N socket, and we have to apply the operation to ALL
-		 * associations. Like with anything of this sort, the
-		 * problem is what to do if the operation fails.
-		 * Just try to apply the setting to everyone, but store
-		 * error number if someone returns such. And since we are
-		 * looping through all possible aids, some of them can be
-		 * invalid. We just ignore this kind (sosctp_assoc()) of
-		 * errors.
-		 */
-		sctp_assoc_t aid;
-
-		mutex_exit(&so->so_lock);
-		error = sctp_set_opt(so->so_priv, level, option_name, optval,
-		    optlen);
-		mutex_enter(&so->so_lock);
-		for (aid = 1; aid < ss->ss_maxassoc; aid++) {
-			if (sosctp_assoc(ss, aid, &ssa) != 0)
-				continue;
-			mutex_exit(&so->so_lock);
-			rc = sctp_set_opt(ssa->ssa_conn, level, option_name,
-			    optval, optlen);
-			mutex_enter(&so->so_lock);
-			SSA_REFRELE(ss, ssa);
-			/* Keep the first failure that was seen. */
-			if (error == 0) {
-				error = rc;
-			}
-		}
-	}
-	/*
-	 * Check for SOL_SOCKET options and record their values.
-	 * If we know about a SOL_SOCKET parameter and the transport
-	 * failed it with TBADOPT or TOUTSTATE (i.e. ENOPROTOOPT or
-	 * EPROTO) we let the setsockopt succeed.
-	 */
-	if (level == SOL_SOCKET) {
-		boolean_t handled = B_FALSE;
-
-		/* Check parameters */
-		switch (option_name) {
-		case SO_DEBUG:
-		case SO_REUSEADDR:
-		case SO_KEEPALIVE:
-		case SO_DONTROUTE:
-		case SO_BROADCAST:
-		case SO_USELOOPBACK:
-		case SO_OOBINLINE:
-		case SO_SNDBUF:
-		case SO_RCVBUF:
-		case SO_SNDLOWAT:
-		case SO_RCVLOWAT:
-		case SO_DGRAM_ERRIND:
-			if (optlen != (t_uscalar_t)sizeof (int32_t)) {
-				error = EINVAL;
-				eprintsoline(so, error);
-				goto done;
-			}
-			ASSERT(optval);
-			handled = B_TRUE;
-			break;
-		case SO_LINGER:
-			if (optlen != (t_uscalar_t)sizeof (struct linger)) {
-				error = EINVAL;
-				eprintsoline(so, error);
-				goto done;
-			}
-			ASSERT(optval);
-			handled = B_TRUE;
-			break;
-		}
-
-#define	intvalue	(*(int32_t *)optval)
-
-		switch (option_name) {
-		case SO_SNDTIMEO:
-		case SO_RCVTIMEO:
-		case SO_EXCLBIND:
-		case SO_TYPE:
-		case SO_ERROR:
-		case SO_ACCEPTCONN:
-		case SO_PROTOTYPE:
-		case SO_DOMAIN:
-			/* Can't be set */
-			error = ENOPROTOOPT;
-			goto done;
-		case SO_LINGER: {
-			struct linger *l = (struct linger *)optval;
-
-			so->so_linger.l_linger = l->l_linger;
-			if (l->l_onoff) {
-				so->so_linger.l_onoff = SO_LINGER;
-				so->so_options |= SO_LINGER;
-			} else {
-				so->so_linger.l_onoff = 0;
-				so->so_options &= ~SO_LINGER;
-			}
-			break;
-		}
-
-		case SO_DEBUG:
-		case SO_REUSEADDR:
-		case SO_KEEPALIVE:
-		case SO_DONTROUTE:
-		case SO_BROADCAST:
-		case SO_USELOOPBACK:
-		case SO_OOBINLINE:
-		case SO_DGRAM_ERRIND:
-			if (intvalue != 0) {
-				dprintso(so, 1,
-				    ("sosctp_setsockopt: setting 0x%x\n",
-				    option_name));
-				so->so_options |= option_name;
-			} else {
-				dprintso(so, 1,
-				    ("sosctp_setsockopt: clearing 0x%x\n",
-				    option_name));
-				so->so_options &= ~option_name;
-			}
-			break;
-		/*
-		 * The following options are only returned by us when
-		 * the sctp_set_opt fails.
-		 * XXX XPG 4.2 applications retrieve SO_RCVBUF from
-		 * sockfs since the transport might adjust the value
-		 * and not return exactly what was set by the
-		 * application.
-		 */
-		case SO_SNDBUF:
-			so->so_sndbuf = intvalue;
-			if (so->so_sndlowat > so->so_sndbuf) {
-				so->so_sndlowat = so->so_sndbuf;
-			}
-			break;
-		case SO_RCVBUF:
-			so->so_rcvbuf = intvalue;
-			if (so->so_rcvlowat > so->so_rcvbuf) {
-				so->so_rcvlowat = so->so_rcvbuf;
-			}
-			break;
-		case SO_SNDLOWAT:
-			so->so_sndlowat = intvalue;
-			if (so->so_sndlowat > so->so_sndbuf) {
-				so->so_sndlowat = so->so_sndbuf;
-			}
-			break;
-		case SO_RCVLOWAT:
-			so->so_rcvlowat = intvalue;
-			if (so->so_rcvlowat > so->so_rcvbuf) {
-				so->so_rcvlowat = so->so_rcvbuf;
-			}
-			break;
-		}
-#undef	intvalue
-
-		if (error != 0) {
-			if ((error == ENOPROTOOPT || error == EPROTO ||
-			    error == EINVAL) && handled) {
-				dprintso(so, 1,
-				    ("sosctp_setsockopt: ignoring error %d "
-				    "for 0x%x\n", error, option_name));
-				error = 0;
-			}
-		}
-	}
-
-done:
-	so_unlock_single(so, SOLOCKED);
-	mutex_exit(&so->so_lock);
-
-	return (error);
-}
-
-/*
- * Upcalls from SCTP
- */
-
-/*
- * Upcall: incoming connection on a listening SOCK_STREAM socket.
- *
- * Creates a new SCTP sonode that inherits the listener's properties, puts
- * a pointer to it into an M_PROTO message and queues that message on the
- * listener with soqueueconnind(). Pollers are then woken and the read
- * signal is sent.
- *
- * Returns the new sctp_sonode as the handle for the connection, or NULL
- * when the backlog is full or an allocation fails.
- */
-static void *
-sctp_sock_newconn(void *parenthandle, void *connind)
-{
-	struct sctp_sonode *lss = parenthandle;
-	struct sonode *lso = &lss->ss_so;
-	struct sctp_sonode *newss;
-	struct sonode *newso;
-	mblk_t *indmp;
-	int error;
-
-	ASSERT(lso->so_state & SS_ACCEPTCONN);
-	ASSERT(lso->so_priv != NULL);	/* closed conn */
-	ASSERT(lso->so_type == SOCK_STREAM);
-
-	/* Check current # of queued conns against backlog. */
-	if (lss->ss_rxqueued >= lso->so_backlog)
-		return (NULL);
-
-	/* The indication message only has to carry one sonode pointer. */
-	indmp = allocb(sizeof (newso), BPRI_MED);
-	if (indmp == NULL) {
-		eprintsoline(lso, ENOMEM);
-		return (NULL);
-	}
-	DB_TYPE(indmp) = M_PROTO;
-
-	/* Create the socket for the new connection. */
-	VN_HOLD(lso->so_accessvp);
-	newso = sosctp_create(lso->so_accessvp, lso->so_family, lso->so_type,
-	    lso->so_protocol, lso->so_version, lso, &error);
-	if (newso == NULL) {
-		VN_RELE(lso->so_accessvp);
-		freeb(indmp);
-		eprintsoline(lso, error);
-		return (NULL);
-	}
-
-	dprint(2, ("sctp_stream_newconn: new %p\n", (void *)newso));
-
-	newss = SOTOSSO(newso);
-
-	/* Copy pointer to new socket to connind queue message. */
-	*(struct sonode **)indmp->b_wptr = newso;
-	indmp->b_wptr += sizeof (newso);
-
-	/* Inherit socket properties; listener lock first, then the child. */
-	mutex_enter(&lso->so_lock);
-	mutex_enter(&newso->so_lock);
-
-	newso->so_state |= SS_ISBOUND | SS_ISCONNECTED |
-	    (lso->so_state & SS_ASYNC);
-	sosctp_so_inherit(lss, newss);
-	newso->so_priv = connind;
-
-	mutex_exit(&newso->so_lock);
-
-	lss->ss_rxqueued++;
-	mutex_exit(&lso->so_lock);
-
-	/*
-	 * Wake people who're waiting incoming conns. Note that
-	 * soqueueconnind gets so_lock.
-	 */
-	soqueueconnind(lso, indmp);
-	pollwakeup(&lss->ss_poll_list, POLLIN|POLLRDNORM);
-
-	mutex_enter(&lso->so_lock);
-	sosctp_sendsig(lss, SCTPSIG_READ);
-	mutex_exit(&lso->so_lock);
-
-	return (newss);
-}
-
-/*
- * Upcall for a 1-N (SOCK_SEQPACKET) socket when a new association is
- * created. Unlike the other 1-N upcalls, which take a sctp_soassoc * as
- * handle, the handle here is the sctp_sonode * that was turned into a
- * listener for new associations. (The su_properties upcall accepts both
- * kinds of handle.)
- *
- * Allocates an association id and a soassoc, records the association in
- * the socket's table and returns the soassoc as the new handle. Returns
- * NULL if the id table cannot be grown or the soassoc cannot be allocated.
- */
-static void *
-sctp_assoc_newconn(void *parenthandle, void *connind)
-{
-	struct sctp_sonode *lss = (struct sctp_sonode *)parenthandle;
-	struct sonode *lso = &lss->ss_so;
-	struct sctp_soassoc *newssa;
-	sctp_assoc_t aid;
-
-	ASSERT(lss->ss_type == SOSCTP_SOCKET);
-	ASSERT(lso->so_state & SS_ACCEPTCONN);
-	ASSERT(lso->so_priv != NULL);	/* closed conn */
-	ASSERT(lso->so_type == SOCK_SEQPACKET);
-
-	mutex_enter(&lso->so_lock);
-
-	aid = sosctp_aid_get(lss);
-	if (aid == -1) {
-		/* Array not large enough; increase size and retry. */
-		if (sosctp_aid_grow(lss, lss->ss_maxassoc, KM_NOSLEEP) < 0) {
-			mutex_exit(&lso->so_lock);
-			return (NULL);
-		}
-		aid = sosctp_aid_get(lss);
-		ASSERT(aid != -1);
-	}
-
-	/* Create soassoc for this connection. */
-	newssa = sosctp_assoc_create(lss, KM_NOSLEEP);
-	if (newssa == NULL) {
-		mutex_exit(&lso->so_lock);
-		return (NULL);
-	}
-
-	/* Claim the id and enter the association into the socket's table. */
-	sosctp_aid_reserve(lss, aid, 1);
-	lss->ss_assocs[aid].ssi_assoc = newssa;
-	++lss->ss_assoccnt;
-
-	/* The association starts out with the socket's write properties. */
-	newssa->ssa_id = aid;
-	newssa->ssa_conn = connind;
-	newssa->ssa_state = (SS_ISBOUND | SS_ISCONNECTED);
-	newssa->ssa_wroff = lss->ss_wroff;
-	newssa->ssa_wrsize = lss->ss_wrsize;
-
-	mutex_exit(&lso->so_lock);
-
-	return (newssa);
-}
-
-/*
- * Upcall: an outgoing connection on a SOCK_STREAM socket has been
- * established. Marks the socket connected, sends the write signal and
- * wakes pollers waiting for POLLOUT.
- */
-static void
-sctp_sock_connected(void *handle)
-{
-	struct sctp_sonode *ss = handle;
-	struct sonode *so = &ss->ss_so;
-
-	ASSERT(so->so_type == SOCK_STREAM);
-
-	mutex_enter(&so->so_lock);
-
-	ASSERT(so->so_priv);	/* closed conn */
-	ASSERT(!(so->so_state & SS_ACCEPTCONN));
-
-	soisconnected(so);
-	sosctp_sendsig(ss, SCTPSIG_WRITE);
-
-	mutex_exit(&so->so_lock);
-
-	/* Wake ones who're waiting for conn to become established. */
-	pollwakeup(&ss->ss_poll_list, POLLOUT);
-}
-
-/*
- * Upcall: an association on a SOCK_SEQPACKET socket has been established.
- * The state change is made by sosctp_assoc_isconnected() under so_lock.
- */
-static void
-sctp_assoc_connected(void *handle)
-{
-	struct sctp_soassoc *ssa = handle;
-	struct sctp_sonode *ss = ssa->ssa_sonode;
-	struct sonode *so = &ss->ss_so;
-
-	ASSERT(so->so_type == SOCK_SEQPACKET);
-	ASSERT(ssa->ssa_conn);
-
-	mutex_enter(&so->so_lock);
-	sosctp_assoc_isconnected(ssa);
-	mutex_exit(&so->so_lock);
-}
-
-/*
- * Upcall: the connection of a SOCK_STREAM socket is gone, either because
- * of an error or after a normal handshake. Note that there is no
- * half-closed conn, like TCP.
- *
- * Wakes blocked readers and writers, sends signals for the directions that
- * were still open, marks the socket disconnected with `error' and wakes
- * pollers. Always returns 0.
- */
-static int
-sctp_sock_disconnected(void *handle, int error)
-{
-	struct sctp_sonode *ss = handle;
-	struct sonode *so = &ss->ss_so;
-	int sigs;
-
-	ASSERT(so->so_type == SOCK_STREAM);
-
-	mutex_enter(&so->so_lock);
-	ASSERT(so->so_priv != NULL);	/* closed conn */
-
-	/* Connection is gone, wake everybody. */
-	if (ss->ss_rxdata == NULL)
-		cv_signal(&ss->ss_rxdata_cv);
-	cv_broadcast(&ss->ss_txdata_cv);
-
-	/*
-	 * If socket is already disconnected/disconnecting,
-	 * don't (re)send signal.
-	 */
-	sigs = ((so->so_state & SS_CANTRCVMORE) ? 0 : SCTPSIG_READ) |
-	    ((so->so_state & SS_CANTSENDMORE) ? 0 : SCTPSIG_WRITE);
-	if (sigs != 0)
-		sosctp_sendsig(ss, sigs);
-
-	soisdisconnected(so, error);
-	mutex_exit(&so->so_lock);
-
-	pollwakeup(&ss->ss_poll_list, POLLIN|POLLRDNORM|POLLOUT);
-
-	return (0);
-}
-
-/*
- * Upcall: an association on a SOCK_SEQPACKET socket got disconnected.
- *
- * Marks the association disconnected with `error', drops one reference on
- * it and wakes blocked writers. Returns 1 if the reference count was 1
- * when checked (ssa_conn is then cleared before the release), otherwise 0.
- */
-static int
-sctp_assoc_disconnected(void *handle, int error)
-{
-	struct sctp_soassoc *ssa = handle;
-	struct sctp_sonode *ss = ssa->ssa_sonode;
-	struct sonode *so = &ssa->ssa_sonode->ss_so;
-	int last_ref = 0;
-
-	ASSERT(so->so_type == SOCK_SEQPACKET);
-	ASSERT(ssa->ssa_conn != NULL);
-
-	mutex_enter(&so->so_lock);
-
-	sosctp_assoc_isdisconnected(ssa, error);
-
-	/* Detach the connection if ours is the only reference left. */
-	if (ssa->ssa_refcnt == 1) {
-		last_ref = 1;
-		ssa->ssa_conn = NULL;
-	}
-	SSA_REFRELE(SOTOSSO(so), ssa);
-
-	cv_broadcast(&ss->ss_txdata_cv);
-
-	mutex_exit(&so->so_lock);
-
-	return (last_ref);
-}
-
-/*
- * Upcall: the peer sent a shutdown on a SOCK_STREAM socket. After this
- * point writes are not allowed to this socket, but one might still receive
- * notifications (e.g. for data which never got sent).
- */
-static void
-sctp_sock_disconnecting(void *handle)
-{
-	struct sctp_sonode *ss = handle;
-	struct sonode *so = &ss->ss_so;
-
-	ASSERT(so->so_type == SOCK_STREAM);
-
-	mutex_enter(&so->so_lock);
-	ASSERT(so->so_priv != NULL);	/* closed conn */
-
-	/*
-	 * Socket not writeable anymore. Wake writers, and ones
-	 * who're waiting on socket state change
-	 */
-	cv_broadcast(&ss->ss_txdata_cv);
-
-	/* If socket already un-writeable, don't (re)send signal. */
-	if ((so->so_state & SS_CANTSENDMORE) == 0)
-		sosctp_sendsig(ss, SCTPSIG_WRITE);
-
-	so->so_state = (so->so_state & ~(SS_ISCONNECTING)) | SS_CANTSENDMORE;
-	cv_broadcast(&so->so_state_cv);
-
-	mutex_exit(&so->so_lock);
-
-	pollwakeup(&ss->ss_poll_list, POLLOUT);
-}
-
-/*
- * Upcall: the peer sent a shutdown for an association on a SOCK_SEQPACKET
- * socket. The state change is made by sosctp_assoc_isdisconnecting() under
- * so_lock.
- */
-static void
-sctp_assoc_disconnecting(void *handle)
-{
-	struct sctp_soassoc *ssa = handle;
-	struct sctp_sonode *ss = ssa->ssa_sonode;
-	struct sonode *so = &ss->ss_so;
-
-	ASSERT(so->so_type == SOCK_SEQPACKET);
-	ASSERT(ssa->ssa_conn != NULL);
-
-	mutex_enter(&so->so_lock);
-	sosctp_assoc_isdisconnecting(ssa);
-	mutex_exit(&so->so_lock);
-}
-
-/*
- * Upcall: incoming data for a SOCK_STREAM socket.
- *
- * mp is a T_UNITDATA_IND M_PROTO block followed by the data in b_cont. The
- * message is appended to the socket's receive list, `flags' is stored in
- * the mblk's b_flag, and readers and pollers are woken.
- *
- * If the socket cannot receive any more, the message is freed instead.
- *
- * Returns the receive buffer space left (so_rcvbuf minus the queued byte
- * count), or so_rcvbuf when the message was dropped.
- */
-static int
-sctp_sock_recv(void *handle, mblk_t *mp, int flags)
-{
-	struct sctp_sonode *ss = handle;
-	struct sonode *so = &ss->ss_so;
-	int msglen;
-	int space;
-#if defined(DEBUG) && !defined(lint)
-	union T_primitives *tpr;
-#endif
-
-	ASSERT(so->so_type == SOCK_STREAM);
-	ASSERT(mp != NULL);
-	ASSERT(!(so->so_state & SS_ACCEPTCONN));
-
-	/*
-	 * Should be getting T_unitdata_ind's only.
-	 * Must have address as part of packet.
-	 */
-#if defined(DEBUG) && !defined(lint)
-	tpr = (union T_primitives *)mp->b_rptr;
-	ASSERT((DB_TYPE(mp) == M_PROTO) &&
-	    (tpr->type == T_UNITDATA_IND));
-	ASSERT((tpr->unitdata_ind.SRC_length));
-#endif
-
-	/*
-	 * First mblk has only the unitdata_ind; the payload follows it.
-	 */
-	msglen = msgsize(mp->b_cont);
-
-	mutex_enter(&so->so_lock);
-	ASSERT(so->so_priv);	/* closed conn */
-
-	if (so->so_state & SS_CANTRCVMORE) {
-		/* Read the buffer size while so_lock is still held. */
-		space = so->so_rcvbuf;
-		mutex_exit(&so->so_lock);
-		freemsg(mp);
-		return (space);
-	}
-	/* Wake a reader if the receive list was empty until now. */
-	if (ss->ss_rxdata == NULL) {
-		cv_signal(&ss->ss_rxdata_cv);
-	}
-	*ss->ss_rxtail = mp;
-	ss->ss_rxtail = &mp->b_next;
-	ss->ss_rxqueued += msglen;
-
-	sosctp_sendsig(ss, SCTPSIG_READ);
-
-	/*
-	 * Override b_flag for SCTP sockfs internal use
-	 */
-	mp->b_flag = (short)flags;
-
-	/*
-	 * Compute the remaining space under so_lock so that so_rcvbuf and
-	 * ss_rxqueued are read as one consistent snapshot.
-	 */
-	space = so->so_rcvbuf - ss->ss_rxqueued;
-
-	mutex_exit(&so->so_lock);
-
-	pollwakeup(&ss->ss_poll_list, POLLIN|POLLRDNORM);
-
-	return (space);
-}
-
-/*
- * Upcall: incoming data or notification for an association on a
- * SOCK_SEQPACKET socket.
- *
- * mp is a T_UNITDATA_IND M_PROTO block followed by the payload in b_cont.
- * The association id is filled into the notification, or into an
- * SCTP_SNDRCV control message if one is present. A pointer to the
- * association (with a reference held) is stored at the base of the first
- * data block, and the message is appended to the socket's receive list.
- *
- * Returns the receive buffer space left for this association (so_rcvbuf
- * minus ssa_rxqueued).
- */
-static int
-sctp_assoc_recv(void *handle, mblk_t *mp, int flags)
-{
-	struct sctp_soassoc *ssa = handle;
-	struct sctp_sonode *ss = ssa->ssa_sonode;
-	struct sonode *so = &ss->ss_so;
-	struct T_unitdata_ind *tind;
-	int msglen;
-	int space;
-	mblk_t *mp2;
-	union sctp_notification *sn;
-	struct sctp_sndrcvinfo *sinfo;
-
-	ASSERT(ssa->ssa_type == SOSCTP_ASSOC);
-	ASSERT(so->so_type == SOCK_SEQPACKET);
-	ASSERT(ssa->ssa_conn != NULL);	/* closed conn */
-	ASSERT(mp != NULL);
-
-	/*
-	 * Should be getting T_unitdata_ind's only.
-	 * Must have address as part of packet.
-	 */
-	tind = (struct T_unitdata_ind *)mp->b_rptr;
-	ASSERT((DB_TYPE(mp) == M_PROTO) &&
-	    (tind->PRIM_type == T_UNITDATA_IND));
-	ASSERT(tind->SRC_length);
-
-	/*
-	 * First mblk has only the unitdata_ind; the payload follows it.
-	 */
-	msglen = msgsize(mp->b_cont);
-
-	mutex_enter(&so->so_lock);
-
-	/*
-	 * Override b_flag for SCTP sockfs internal use
-	 */
-	mp->b_flag = (short)flags;
-
-	/*
-	 * For notify messages, need to fill in association id.
-	 * For data messages, sndrcvinfo could be in ancillary data.
-	 */
-	if (flags & SCTP_NOTIFICATION) {
-		mp2 = mp->b_cont;
-		sn = (union sctp_notification *)mp2->b_rptr;
-		switch (sn->sn_header.sn_type) {
-		case SCTP_ASSOC_CHANGE:
-			sn->sn_assoc_change.sac_assoc_id = ssa->ssa_id;
-			break;
-		case SCTP_PEER_ADDR_CHANGE:
-			sn->sn_paddr_change.spc_assoc_id = ssa->ssa_id;
-			break;
-		case SCTP_REMOTE_ERROR:
-			sn->sn_remote_error.sre_assoc_id = ssa->ssa_id;
-			break;
-		case SCTP_SEND_FAILED:
-			sn->sn_send_failed.ssf_assoc_id = ssa->ssa_id;
-			break;
-		case SCTP_SHUTDOWN_EVENT:
-			sn->sn_shutdown_event.sse_assoc_id = ssa->ssa_id;
-			break;
-		case SCTP_ADAPTATION_INDICATION:
-			sn->sn_adaptation_event.sai_assoc_id = ssa->ssa_id;
-			break;
-		case SCTP_PARTIAL_DELIVERY_EVENT:
-			sn->sn_pdapi_event.pdapi_assoc_id = ssa->ssa_id;
-			break;
-		default:
-			ASSERT(0);
-			break;
-		}
-	} else {
-		if (tind->OPT_length > 0) {
-			struct cmsghdr *cmsg;
-			char *cend;
-
-			/*
-			 * Walk the control messages in the options area,
-			 * stopping at the first SCTP_SNDRCV entry or as
-			 * soon as an entry would run past the end.
-			 */
-			cmsg = (struct cmsghdr *)
-			    ((uchar_t *)mp->b_rptr + tind->OPT_offset);
-			cend = (char *)cmsg + tind->OPT_length;
-			for (;;) {
-				if ((char *)(cmsg + 1) > cend ||
-				    ((char *)cmsg + cmsg->cmsg_len) > cend) {
-					break;
-				}
-				if ((cmsg->cmsg_level == IPPROTO_SCTP) &&
-				    (cmsg->cmsg_type == SCTP_SNDRCV)) {
-					sinfo = (struct sctp_sndrcvinfo *)
-					    (cmsg + 1);
-					sinfo->sinfo_assoc_id = ssa->ssa_id;
-					break;
-				}
-				/* A zero length would never advance. */
-				if (cmsg->cmsg_len > 0) {
-					cmsg = (struct cmsghdr *)
-					    ((uchar_t *)cmsg + cmsg->cmsg_len);
-				} else {
-					break;
-				}
-			}
-		}
-	}
-
-	/*
-	 * SCTP has reserved space in the header for storing a pointer.
-	 * Put the pointer to association there, and queue the data.
-	 */
-	SSA_REFHOLD(ssa);
-	ASSERT((mp->b_rptr - DB_BASE(mp)) >= sizeof (ssa));
-	*(struct sctp_soassoc **)DB_BASE(mp) = ssa;
-
-	/* Wake a reader if the receive list was empty until now. */
-	if (ss->ss_rxdata == NULL) {
-		cv_signal(&ss->ss_rxdata_cv);
-	}
-	*ss->ss_rxtail = mp;
-	ss->ss_rxtail = &mp->b_next;
-	ssa->ssa_rxqueued += msglen;
-
-	sosctp_sendsig(ss, SCTPSIG_READ);
-
-	/*
-	 * Compute the remaining space under so_lock so that so_rcvbuf and
-	 * ssa_rxqueued are read as one consistent snapshot, and ssa is not
-	 * touched after the lock has been dropped.
-	 */
-	space = so->so_rcvbuf - ssa->ssa_rxqueued;
-
-	mutex_exit(&so->so_lock);
-
-	pollwakeup(&ss->ss_poll_list, POLLIN|POLLRDNORM);
-
-	return (space);
-}
-
-/*
- * Upcall: TX queued data got acknowledged on a SOCK_STREAM socket, which
- * frees up space in the TX queue. `txqueued' becomes the new queued byte
- * count.
- *
- * Blocked writers are always woken. The write signal and POLLOUT wakeup
- * are issued only when the queued amount drops below so_sndlowat from a
- * value that was not below it.
- */
-static void
-sctp_sock_xmitted(void *handle, int txqueued)
-{
-	struct sctp_sonode *ss = handle;
-	struct sonode *so = &ss->ss_so;
-	boolean_t was_writeable;
-	boolean_t notify;
-
-	mutex_enter(&so->so_lock);
-	ASSERT(so->so_priv != NULL);	/* closed conn */
-
-	/* Judge writeability from the old count before replacing it. */
-	was_writeable = (ss->ss_txqueued < so->so_sndlowat) ? B_TRUE : B_FALSE;
-	ss->ss_txqueued = txqueued;
-
-	/* Wake blocked writers. */
-	cv_broadcast(&ss->ss_txdata_cv);
-
-	/*
-	 * Only do pollwakeup if the amount of queued data is less than
-	 * watermark, and the socket wasn't writeable before.
-	 */
-	notify = (!was_writeable && (ss->ss_txqueued < so->so_sndlowat)) ?
-	    B_TRUE : B_FALSE;
-	if (notify)
-		sosctp_sendsig(ss, SCTPSIG_WRITE);
-
-	mutex_exit(&so->so_lock);
-
-	if (notify)
-		pollwakeup(&ss->ss_poll_list, POLLOUT);
-}
-
-/*
- * Upcall: TX queued data got acknowledged for an association on a
- * SOCK_SEQPACKET socket. Records `txqueued' as the association's new
- * queued byte count and wakes blocked writers.
- */
-static void
-sctp_assoc_xmitted(void *handle, int txqueued)
-{
-	struct sctp_soassoc *ssa = handle;
-	struct sctp_sonode *ss = ssa->ssa_sonode;
-	struct sonode *so = &ss->ss_so;
-
-	ASSERT(ssa->ssa_type == SOSCTP_ASSOC);
-	ASSERT(ss->ss_so.so_type == SOCK_SEQPACKET);
-	ASSERT(ssa->ssa_conn != NULL);
-
-	mutex_enter(&so->so_lock);
-
-	ssa->ssa_txqueued = txqueued;
-
-	/* Wake blocked writers. */
-	cv_broadcast(&ss->ss_txdata_cv);
-
-	mutex_exit(&so->so_lock);
-}
-
-/*
- * Upcall: SCTP notifies a SOCK_STREAM socket about write offset and amount
- * of TX data per mblk. A value of zero leaves the corresponding setting
- * unchanged.
- */
-static void
-sctp_sock_properties(void *handle, int wroff, size_t maxblk)
-{
-	struct sctp_sonode *ss = handle;
-	struct sonode *so = &ss->ss_so;
-
-	ASSERT(ss->ss_so.so_type == SOCK_STREAM);
-
-	mutex_enter(&so->so_lock);
-
-	ASSERT(ss->ss_so.so_priv != NULL);	/* closed conn */
-
-	/* Only change them if they're set. */
-	if (wroff != 0)
-		ss->ss_wroff = wroff;
-	if (maxblk != 0)
-		ss->ss_wrsize = maxblk;
-
-	mutex_exit(&so->so_lock);
-}
-
-/*
- * Upcall: SCTP notifies a 1-N socket about write offset and amount of TX
- * data per mblk. The handle is either a sctp_soassoc (ssa_type is
- * SOSCTP_ASSOC), in which case the association's values are updated, or a
- * sctp_sonode, in which case the socket's values are updated. A value of
- * zero leaves the corresponding setting unchanged.
- */
-static void
-sctp_assoc_properties(void *handle, int wroff, size_t maxblk)
-{
-	struct sctp_soassoc *ssa = handle;
-	struct sctp_sonode *ss;
-	boolean_t is_assoc;
-
-	/* The type tag tells which kind of handle was passed in. */
-	is_assoc = (ssa->ssa_type == SOSCTP_ASSOC) ? B_TRUE : B_FALSE;
-	ss = is_assoc ? ssa->ssa_sonode : (struct sctp_sonode *)handle;
-
-	mutex_enter(&ss->ss_so.so_lock);
-
-	/* Only change them if they're set. */
-	if (is_assoc) {
-		if (wroff != 0)
-			ssa->ssa_wroff = wroff;
-		if (maxblk != 0)
-			ssa->ssa_wrsize = maxblk;
-	} else {
-		if (wroff != 0)
-			ss->ss_wroff = wroff;
-		if (maxblk != 0)
-			ss->ss_wrsize = maxblk;
-	}
-
-	mutex_exit(&ss->ss_so.so_lock);
-}
diff --git a/usr/src/uts/common/fs/sockfs/socksctpvnops.c b/usr/src/uts/common/fs/sockfs/socksctpvnops.c
deleted file mode 100644
index b59bb8d163..0000000000
--- a/usr/src/uts/common/fs/sockfs/socksctpvnops.c
+++ /dev/null
@@ -1,875 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#include <sys/types.h>
-#include <sys/t_lock.h>
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/buf.h>
-#include <sys/debug.h>
-#include <sys/errno.h>
-#include <sys/uio.h>
-#include <sys/vfs.h>
-#include <sys/vfs_opreg.h>
-#include <sys/vnode.h>
-#include <sys/stropts.h>
-#include <sys/cmn_err.h>
-#include <sys/sysmacros.h>
-#include <sys/stream.h>
-#include <sys/strsun.h>
-
-#include <sys/socket.h>
-#include <sys/socketvar.h>
-
-#include <sys/project.h>
-#include <sys/strsubr.h>
-
-#include <fs/fs_subr.h>
-
-#include <sys/esunddi.h>
-#include <sys/ddi.h>
-
-#include <sys/filio.h>
-#include <sys/sockio.h>
-
-#include <netinet/sctp.h>
-#include <inet/sctp_itf.h>
-#include "socksctp.h"
-
-/*
- * SCTP sockfs vnode operations
- *
- * Forward declarations for the vnode entry points implemented in this
- * file; each one is wired into socksctp_vnodeops_template[] below.
- */
-static int socksctpv_open(struct vnode **, int, struct cred *,
- caller_context_t *);
-static int socksctpv_close(struct vnode *, int, int, offset_t,
- struct cred *, caller_context_t *);
-static int socksctpv_read(struct vnode *, struct uio *, int, struct cred *,
- caller_context_t *);
-static int socksctpv_write(struct vnode *, struct uio *, int, struct cred *,
- caller_context_t *);
-static int socksctpv_ioctl(struct vnode *, int, intptr_t, int,
- struct cred *, int32_t *, caller_context_t *);
-static int socksctp_setfl(vnode_t *, int, int, cred_t *, caller_context_t *);
-static void socksctpv_inactive(struct vnode *, struct cred *,
- caller_context_t *);
-static int socksctpv_poll(struct vnode *, short, int, short *,
- struct pollhead **, caller_context_t *);
-
-/*
- * Operation table for SCTP socket vnodes.  open, close, read, write,
- * ioctl, setfl, inactive and poll use the SCTP-specific routines declared
- * above; getattr, setattr, access, fsync, fid and seek reuse the
- * socktpi_* implementations, and dispose is routed to fs_error.  The
- * NULL pair terminates the table.
- */
-const fs_operation_def_t socksctp_vnodeops_template[] = {
- VOPNAME_OPEN, { .vop_open = socksctpv_open },
- VOPNAME_CLOSE, { .vop_close = socksctpv_close },
- VOPNAME_READ, { .vop_read = socksctpv_read },
- VOPNAME_WRITE, { .vop_write = socksctpv_write },
- VOPNAME_IOCTL, { .vop_ioctl = socksctpv_ioctl },
- VOPNAME_SETFL, { .vop_setfl = socksctp_setfl },
- VOPNAME_GETATTR, { .vop_getattr = socktpi_getattr },
- VOPNAME_SETATTR, { .vop_setattr = socktpi_setattr },
- VOPNAME_ACCESS, { .vop_access = socktpi_access },
- VOPNAME_FSYNC, { .vop_fsync = socktpi_fsync },
- VOPNAME_INACTIVE, { .vop_inactive = socksctpv_inactive },
- VOPNAME_FID, { .vop_fid = socktpi_fid },
- VOPNAME_SEEK, { .vop_seek = socktpi_seek },
- VOPNAME_POLL, { .vop_poll = socksctpv_poll },
- VOPNAME_DISPOSE, { .error = fs_error },
- NULL, NULL
-};
-/* NOTE(review): presumably built from the template above at init time. */
-struct vnodeops *socksctp_vnodeops;
-
-/*
- * Open entry point for SCTP socket vnodes.
- *
- * Takes one open reference on the socket (so_count).  For an acceptor
- * open (SO_ACCEPTOR in 'flag') nothing else is done.  Otherwise a new
- * SCTP endpoint is created with sctp_create(), using the stream upcalls
- * for SOCK_STREAM and the association upcalls for SOCK_SEQPACKET, and the
- * buffer limits reported back through 'sbl' are copied into the socket.
- *
- * Returns 0 on success, or ENOMEM when sctp_create() fails; in that case
- * the open reference is dropped again and the socket's buffer limits are
- * left untouched.
- */
-/*ARGSUSED3*/
-static int
-socksctpv_open(struct vnode **vpp, int flag, struct cred *cr,
-    caller_context_t *ct)
-{
-	struct sonode *so;
-	struct sctp_sonode *ss;
-	struct vnode *vp = *vpp;
-	sctp_sockbuf_limits_t sbl;
-	sctp_upcalls_t *upcalls;
-
-	flag &= ~FCREAT;		/* paranoia */
-
-	so = VTOSO(vp);
-	ss = SOTOSSO(so);
-
-	mutex_enter(&so->so_lock);
-	so->so_count++;			/* one more open reference */
-	ASSERT(so->so_count != 0);	/* wraparound */
-	mutex_exit(&so->so_lock);
-
-	ASSERT(vp->v_type == VSOCK);
-
-	if (flag & SO_ACCEPTOR) {
-		ASSERT(so->so_type == SOCK_STREAM);
-		/*
-		 * Protocol control block already created
-		 */
-		return (0);
-	}
-
-	/*
-	 * Active open.
-	 */
-	if (so->so_type == SOCK_STREAM) {
-		upcalls = &sosctp_sock_upcalls;
-	} else {
-		ASSERT(so->so_type == SOCK_SEQPACKET);
-		upcalls = &sosctp_assoc_upcalls;
-	}
-	so->so_priv = sctp_create(ss, NULL, so->so_family, SCTP_CAN_BLOCK,
-	    upcalls, &sbl, cr);
-	if (so->so_priv == NULL) {
-		mutex_enter(&so->so_lock);
-		ASSERT(so->so_count > 0);
-		so->so_count--;		/* one less open reference */
-		mutex_exit(&so->so_lock);
-		/*
-		 * Nothing here guarantees that sbl was filled in when the
-		 * create failed, so return before reading it rather than
-		 * copying stack garbage into the socket.
-		 */
-		return (ENOMEM);
-	}
-
-	so->so_rcvbuf = sbl.sbl_rxbuf;
-	so->so_rcvlowat = sbl.sbl_rxlowat;
-	so->so_sndbuf = sbl.sbl_txbuf;
-	so->so_sndlowat = sbl.sbl_txlowat;
-
-	return (0);
-}
-
-/*
- * Close entry point for SCTP socket vnodes.
- *
- * Releases the calling process's file locks and shares on the vnode.  If
- * 'count' is greater than 1 nothing else happens.  Otherwise one open
- * reference (so_count) is dropped, and when it reaches zero the socket
- * and each of its associations are marked disconnected and
- * sctp_disconnect() is called on their connections; finally signals and
- * a poll wakeup are issued.  Always returns 0.
- */
-/*ARGSUSED*/
-static int
-socksctpv_close(struct vnode *vp, int flag, int count, offset_t offset,
- struct cred *cr, caller_context_t *ct)
-{
- struct sonode *so;
- struct sctp_sonode *ss;
- struct sctp_sa_id *ssi;
- struct sctp_soassoc *ssa;
- int sendsig = 0;
- int32_t i;
-
- so = VTOSO(vp);
- ss = SOTOSSO(so);
-
- cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
- cleanshares(vp, ttoproc(curthread)->p_pid);
-
- ASSERT(vp->v_stream == NULL);
- if (count > 1) {
- dprint(2, ("socksctpv_close: count %d\n", count));
- return (0);
- }
-
- mutex_enter(&so->so_lock);
- so_lock_single(so); /* Set SOLOCKED */
- ASSERT(so->so_count > 0);
- so->so_count--; /* one fewer open reference */
-
- dprint(2, ("socksctpv_close: %p so_count %d\n", (void *)so,
- so->so_count));
-
- if (so->so_count == 0) {
- /*
- * Need to set flags as there might be ops in progress on
- * this socket.
- *
- * If socket already disconnected/disconnecting,
- * don't send signal (again).
- */
- if (!(so->so_state & SS_CANTRCVMORE))
- sendsig |= SCTPSIG_READ;
- if (!(so->so_state & SS_CANTSENDMORE))
- sendsig |= SCTPSIG_WRITE;
- soisdisconnected(so, 0);
- mutex_exit(&so->so_lock); /* so_lock is not held across the SCTP calls */
-
- /*
- * Initiate connection shutdown. Update SCTP's receive
- * window.
- */
- sctp_recvd(so->so_priv, so->so_rcvbuf - ss->ss_rxqueued);
- (void) sctp_disconnect(so->so_priv);
-
- /*
- * New associations can't come in, but old ones might get
- * closed in upcall. Protect against that by taking a reference
- * on the association.
- */
- mutex_enter(&so->so_lock);
- ssi = ss->ss_assocs;
- for (i = 0; i < ss->ss_maxassoc; i++, ssi++) {
- if ((ssa = ssi->ssi_assoc) != NULL) {
- SSA_REFHOLD(ssa);
- sosctp_assoc_isdisconnected(ssa, 0);
- mutex_exit(&so->so_lock); /* dropped again for the per-association SCTP calls */
-
- sctp_recvd(ssa->ssa_conn, so->so_rcvbuf -
- ssa->ssa_rxqueued);
- (void) sctp_disconnect(ssa->ssa_conn);
-
- mutex_enter(&so->so_lock);
- SSA_REFRELE(ss, ssa);
- }
- }
- if (sendsig != 0) {
- sosctp_sendsig(ss, sendsig);
- }
- mutex_exit(&so->so_lock); /* pollwakeup() is called without so_lock */
- pollwakeup(&ss->ss_poll_list, POLLIN|POLLRDNORM|POLLOUT);
- mutex_enter(&so->so_lock);
- }
- so_unlock_single(so, SOLOCKED);
- mutex_exit(&so->so_lock);
-
- return (0);
-}
-
-/*
- * Read entry point for SCTP socket vnodes.  Only SOCK_STREAM sockets can
- * be read this way (anything else gets EOPNOTSUPP); the request is
- * forwarded to sosctp_recvmsg() with an nmsghdr that asks for neither a
- * source address nor control data.
- */
-/*ARGSUSED2*/
-static int
-socksctpv_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr,
-    caller_context_t *ct)
-{
-	struct nmsghdr msg;
-	struct sonode *so;
-
-	so = VTOSO(vp);
-	if (so->so_type != SOCK_STREAM)
-		return (EOPNOTSUPP);
-
-	ASSERT(vp->v_type == VSOCK);
-	so_update_attrs(so, SOACC);
-
-	/* No name, no control buffer, no flags: plain data only. */
-	msg.msg_namelen = 0;
-	msg.msg_controllen = 0;
-	msg.msg_flags = 0;
-
-	return (sosctp_recvmsg(so, &msg, uiop));
-}
-
-/*
- * Send data, see sosctp_sendmsg()
- *
- * Write entry point for SCTP socket vnodes; SOCK_STREAM only (others get
- * EOPNOTSUPP).  The whole uio is sent as one message.
- *
- * Returns 0 on success (including a zero-length write), EPIPE (after
- * posting SIGPIPE to the current thread) when the socket can't send any
- * more, a pending socket error from sogeterr(), EAGAIN when the transmit
- * queue is full and the uio is non-blocking, EINTR when the wait is
- * interrupted or the header allocation fails, ENOTCONN when the socket is
- * neither connecting nor connected, EMSGSIZE when the message exceeds
- * so_sndbuf, or the error from sosctp_uiomove()/sctp_sendmsg().
- */
-/*ARGSUSED2*/
-static int
-socksctpv_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr,
- caller_context_t *ct)
-{
- struct sctp_sonode *ss;
- struct sonode *so;
- mblk_t *head;
- ssize_t count, msglen;
- int error;
-
- so = VTOSO(vp);
- ss = SOTOSSO(so);
-
- if (so->so_type != SOCK_STREAM) {
- return (EOPNOTSUPP);
- }
-
- mutex_enter(&so->so_lock);
-
- /* Loop until there is room in the transmit queue or we must give up. */
- for (;;) {
- if (so->so_state & SS_CANTSENDMORE) {
- mutex_exit(&so->so_lock);
- tsignal(curthread, SIGPIPE);
- return (EPIPE);
- }
-
- if (so->so_error != 0) {
- error = sogeterr(so);
- if (error != 0) {
- mutex_exit(&so->so_lock);
- return (error);
- }
- }
-
- if (ss->ss_txqueued < so->so_sndbuf)
- break;
-
- if (uiop->uio_fmode & (FNDELAY|FNONBLOCK)) {
- mutex_exit(&so->so_lock);
- return (EAGAIN);
- } else {
- /*
- * Xmit window full in a blocking socket.
- * Wait for space to become available and try again.
- */
- error = cv_wait_sig(&ss->ss_txdata_cv, &so->so_lock);
- if (error == 0) { /* signal */
- mutex_exit(&so->so_lock);
- return (EINTR);
- }
- }
- }
-
- if (!(so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED))) {
- mutex_exit(&so->so_lock);
- return (ENOTCONN);
- }
-
- msglen = count = uiop->uio_resid;
- /* Don't allow sending a message larger than the send buffer size. */
- if (msglen > so->so_sndbuf) {
- mutex_exit(&so->so_lock);
- return (EMSGSIZE);
- }
- ss->ss_txqueued += msglen; /* charged up front; undone at error_ret on failure */
-
- mutex_exit(&so->so_lock);
-
- if (count == 0) {
- return (0);
- }
-
- head = sctp_alloc_hdr(NULL, 0, NULL, 0, SCTP_CAN_BLOCK);
- if (head == NULL) {
- error = EINTR; /* NOTE(review): presumably an interrupted blocking allocation */
- goto error_ret;
- }
-
- /* Copy in the message. */
- if ((error = sosctp_uiomove(head, count, ss->ss_wrsize, ss->ss_wroff,
- uiop, 0, cr)) != 0) {
- goto error_ret;
- }
- so_update_attrs(so, SOMOD);
-
- error = sctp_sendmsg(so->so_priv, head, 0);
- if (error == 0)
- return (0);
-
-error_ret:
- mutex_enter(&so->so_lock);
- ss->ss_txqueued -= msglen; /* give back the space charged above */
- cv_broadcast(&ss->ss_txdata_cv); /* other writers may now fit */
- mutex_exit(&so->so_lock);
- freemsg(head);
- return (error);
-}
-
-/*
- * Ioctl entry point for SCTP socket vnodes.
- *
- * Commands handled:
- *   FIONBIO, FIOASYNC		set/clear SS_NDELAY / SS_ASYNC in so_state
- *   SIOCSPGRP, FIOSETOWN	change the SIGIO target via sosctp_chgpgrp()
- *   SIOCGPGRP, FIOGETOWN	copy out so_pgrp
- *   SIOCATMARK			always copies out 0 (no urgent data)
- *   FIONREAD			copy out queued receive bytes (0 if listening)
- *   SIOCSCTPGOPT		get an IPPROTO_SCTP option, optionally for one
- *				association of a SOCK_SEQPACKET socket
- *   SIOCSCTPSOPT		set an IPPROTO_SCTP option, optionally for one
- *				association
- *   SIOCSCTPPEELOFF		move one association of a SOCK_SEQPACKET
- *				socket onto a new SOCK_STREAM socket and
- *				return its file descriptor
- *
- * 'arg' is the user (or, with FKIOCTL in 'mode', kernel) address of the
- * command's argument.  Returns 0 on success, EFAULT on a failed copy,
- * EINVAL for an unknown command or bad argument, or the error from the
- * routine the command is forwarded to.
- */
-/*ARGSUSED4*/
-static int
-socksctpv_ioctl(struct vnode *vp, int cmd, intptr_t arg, int mode,
-    struct cred *cr, int32_t *rvalp, caller_context_t *ct)
-{
-	struct sonode *so;
-	struct sctp_sonode *ss;
-	int32_t value;
-	int error;
-	int intval;
-	pid_t pid;
-	struct sctp_soassoc *ssa;
-	void *conn;
-	void *buf;
-	STRUCT_DECL(sctpopt, opt);
-	uint32_t optlen;
-	int buflen;
-
-	so = VTOSO(vp);
-	ss = SOTOSSO(so);
-
-	/* handle socket specific ioctls */
-	switch (cmd) {
-	case FIONBIO:
-		if (so_copyin((void *)arg, &value, sizeof (int32_t),
-		    (mode & (int)FKIOCTL))) {
-			return (EFAULT);
-		}
-		mutex_enter(&so->so_lock);
-		if (value) {
-			so->so_state |= SS_NDELAY;
-		} else {
-			so->so_state &= ~SS_NDELAY;
-		}
-		mutex_exit(&so->so_lock);
-		return (0);
-
-	case FIOASYNC:
-		if (so_copyin((void *)arg, &value, sizeof (int32_t),
-		    (mode & (int)FKIOCTL))) {
-			return (EFAULT);
-		}
-		mutex_enter(&so->so_lock);
-
-		if (value) {
-			/* Turn on SIGIO */
-			so->so_state |= SS_ASYNC;
-		} else {
-			/* Turn off SIGIO */
-			so->so_state &= ~SS_ASYNC;
-		}
-		mutex_exit(&so->so_lock);
-		return (0);
-
-	case SIOCSPGRP:
-	case FIOSETOWN:
-		if (so_copyin((void *)arg, &pid, sizeof (pid_t),
-		    (mode & (int)FKIOCTL))) {
-			return (EFAULT);
-		}
-		mutex_enter(&so->so_lock);
-
-		error = (pid != so->so_pgrp) ? sosctp_chgpgrp(ss, pid) : 0;
-		mutex_exit(&so->so_lock);
-		return (error);
-
-	case SIOCGPGRP:
-	case FIOGETOWN:
-		if (so_copyout(&so->so_pgrp, (void *)arg,
-		    sizeof (pid_t), (mode & (int)FKIOCTL)))
-			return (EFAULT);
-		return (0);
-
-	case SIOCATMARK:
-		/*
-		 * No support for urgent data.
-		 */
-		intval = 0;
-
-		if (so_copyout(&intval, (void *)arg, sizeof (int),
-		    (mode & (int)FKIOCTL)))
-			return (EFAULT);
-		return (0);
-
-	/* from strioctl */
-	case FIONREAD:
-		/*
-		 * Return number of bytes of data in all data messages
-		 * in queue in "arg".
-		 * For stream socket, amount of available data.
-		 * For sock_dgram, # of available bytes + addresses.
-		 */
-		intval = (so->so_state & SS_ACCEPTCONN) ? 0 :
-		    MIN(ss->ss_rxqueued, INT_MAX);
-		if (so_copyout(&intval, (void *)arg, sizeof (intval),
-		    (mode & (int)FKIOCTL)))
-			return (EFAULT);
-		return (0);
-
-	case SIOCSCTPGOPT:
-		STRUCT_INIT(opt, mode);
-
-		if (so_copyin((void *)arg, STRUCT_BUF(opt), STRUCT_SIZE(opt),
-		    (mode & (int)FKIOCTL))) {
-			return (EFAULT);
-		}
-		if ((optlen = STRUCT_FGET(opt, sopt_len)) > SO_MAXARGSIZE)
-			return (EINVAL);
-
-		/*
-		 * Find the correct sctp_t based on whether it is 1-N socket
-		 * or not.
-		 */
-		intval = STRUCT_FGET(opt, sopt_aid);
-		mutex_enter(&so->so_lock);
-		if ((so->so_type == SOCK_SEQPACKET) && intval) {
-			if ((error = sosctp_assoc(ss, intval, &ssa)) != 0) {
-				mutex_exit(&so->so_lock);
-				return (error);
-			}
-			conn = ssa->ssa_conn;
-			ASSERT(conn != NULL);
-		} else {
-			conn = so->so_priv;
-			ssa = NULL;
-		}
-		mutex_exit(&so->so_lock);
-
-		/* Copyin the option buffer and then call sctp_get_opt(). */
-		buflen = optlen;
-		/* Let's allocate a buffer enough to hold an int */
-		if (buflen < sizeof (uint32_t))
-			buflen = sizeof (uint32_t);
-		buf = kmem_alloc(buflen, KM_SLEEP);
-		if (so_copyin(STRUCT_FGETP(opt, sopt_val), buf, optlen,
-		    (mode & (int)FKIOCTL))) {
-			if (ssa != NULL) {
-				mutex_enter(&so->so_lock);
-				SSA_REFRELE(ss, ssa);
-				mutex_exit(&so->so_lock);
-			}
-			kmem_free(buf, buflen);
-			return (EFAULT);
-		}
-		/* The option level has to be IPPROTO_SCTP */
-		error = sctp_get_opt(conn, IPPROTO_SCTP,
-		    STRUCT_FGET(opt, sopt_name), buf, &optlen);
-		if (ssa != NULL) {
-			mutex_enter(&so->so_lock);
-			SSA_REFRELE(ss, ssa);
-			mutex_exit(&so->so_lock);
-		}
-		/* Never copy out more than the buffer actually holds. */
-		optlen = MIN(buflen, optlen);
-		/* No error, copyout the result with the correct buf len. */
-		if (error == 0) {
-			STRUCT_FSET(opt, sopt_len, optlen);
-			if (so_copyout(STRUCT_BUF(opt), (void *)arg,
-			    STRUCT_SIZE(opt), (mode & (int)FKIOCTL))) {
-				error = EFAULT;
-			} else if (so_copyout(buf, STRUCT_FGETP(opt, sopt_val),
-			    optlen, (mode & (int)FKIOCTL))) {
-				error = EFAULT;
-			}
-		}
-		kmem_free(buf, buflen);
-		return (error);
-
-	case SIOCSCTPSOPT:
-		STRUCT_INIT(opt, mode);
-
-		if (so_copyin((void *)arg, STRUCT_BUF(opt), STRUCT_SIZE(opt),
-		    (mode & (int)FKIOCTL))) {
-			return (EFAULT);
-		}
-		if ((optlen = STRUCT_FGET(opt, sopt_len)) > SO_MAXARGSIZE)
-			return (EINVAL);
-
-		/*
-		 * Find the correct sctp_t based on whether it is 1-N socket
-		 * or not.
-		 */
-		intval = STRUCT_FGET(opt, sopt_aid);
-		mutex_enter(&so->so_lock);
-		if (intval != 0) {
-			if ((error = sosctp_assoc(ss, intval, &ssa)) != 0) {
-				mutex_exit(&so->so_lock);
-				return (error);
-			}
-			conn = ssa->ssa_conn;
-			ASSERT(conn != NULL);
-		} else {
-			conn = so->so_priv;
-			ssa = NULL;
-		}
-		mutex_exit(&so->so_lock);
-
-		/* Copyin the option buffer and then call sctp_set_opt(). */
-		buf = kmem_alloc(optlen, KM_SLEEP);
-		if (so_copyin(STRUCT_FGETP(opt, sopt_val), buf, optlen,
-		    (mode & (int)FKIOCTL))) {
-			if (ssa != NULL) {
-				mutex_enter(&so->so_lock);
-				SSA_REFRELE(ss, ssa);
-				mutex_exit(&so->so_lock);
-			}
-			/*
-			 * Free with the size that was allocated.  This path
-			 * used to pass intval (the association id), which is
-			 * unrelated to the size of the buffer.
-			 */
-			kmem_free(buf, optlen);
-			return (EFAULT);
-		}
-		/* The option level has to be IPPROTO_SCTP */
-		error = sctp_set_opt(conn, IPPROTO_SCTP,
-		    STRUCT_FGET(opt, sopt_name), buf, optlen);
-		if (ssa) {
-			mutex_enter(&so->so_lock);
-			SSA_REFRELE(ss, ssa);
-			mutex_exit(&so->so_lock);
-		}
-		kmem_free(buf, optlen);
-		return (error);
-
-	case SIOCSCTPPEELOFF: {
-		struct sonode *nso;
-		struct sctp_uc_swap us;
-		int nfd;
-		struct file *nfp;
-		struct vnode *nvp = NULL, *accessvp;
-
-		dprint(2, ("sctppeeloff %p\n", (void *)ss));
-
-		if (so->so_type != SOCK_SEQPACKET) {
-			return (EOPNOTSUPP);
-		}
-		if (so_copyin((void *)arg, &intval, sizeof (intval),
-		    (mode & (int)FKIOCTL))) {
-			return (EFAULT);
-		}
-		if (intval == 0) {
-			return (EINVAL);
-		}
-
-		/*
-		 * Find accessvp. This is different from parent's vp,
-		 * as the socket type is different.
-		 */
-		accessvp = solookup(so->so_family, SOCK_STREAM,
-		    so->so_protocol, NULL, &error);
-		if (accessvp == NULL) {
-			return (error);
-		}
-
-		/*
-		 * Allocate the user fd.
-		 */
-		if ((nfd = ufalloc(0)) == -1) {
-			eprintsoline(so, EMFILE);
-			return (EMFILE);
-		}
-
-		/*
-		 * Copy the fd out.
-		 */
-		if (so_copyout(&nfd, (void *)arg, sizeof (nfd),
-		    (mode & (int)FKIOCTL))) {
-			error = EFAULT;
-			goto err;
-		}
-		mutex_enter(&so->so_lock);
-
-		/*
-		 * Don't use sosctp_assoc() in order to peel off disconnected
-		 * associations.
-		 */
-		ssa = ((uint32_t)intval >= ss->ss_maxassoc) ? NULL :
-		    ss->ss_assocs[intval].ssi_assoc;
-		if (ssa == NULL) {
-			mutex_exit(&so->so_lock);
-			error = EINVAL;
-			goto err;
-		}
-		SSA_REFHOLD(ssa);
-
-		nso = sosctp_create(accessvp, so->so_family, SOCK_STREAM,
-		    so->so_protocol, so->so_version, so, &error);
-		if (nso == NULL) {
-			SSA_REFRELE(ss, ssa);
-			mutex_exit(&so->so_lock);
-			goto err;
-		}
-		nvp = SOTOV(nso);
-		so_lock_single(so);
-		mutex_exit(&so->so_lock);
-		us.sus_handle = SOTOSSO(nso);
-		us.sus_upcalls = &sosctp_sock_upcalls;
-
-		/*
-		 * Upcalls to new socket are blocked for the duration of
-		 * downcall.
-		 */
-		mutex_enter(&nso->so_lock);
-
-		error = sctp_set_opt(ssa->ssa_conn, IPPROTO_SCTP, SCTP_UC_SWAP,
-		    &us, sizeof (us));
-		if (error) {
-			goto peelerr;
-		}
-		error = falloc(nvp, FWRITE|FREAD, &nfp, NULL);
-		if (error) {
-			goto peelerr;
-		}
-
-		/*
-		 * fill in the entries that falloc reserved
-		 */
-		nfp->f_vnode = nvp;
-		mutex_exit(&nfp->f_tlock);
-		setf(nfd, nfp);
-
-		mutex_enter(&so->so_lock);
-
-		sosctp_assoc_move(ss, SOTOSSO(nso), ssa);
-
-		mutex_exit(&nso->so_lock);
-
-		/* The connection now belongs to the new socket. */
-		ssa->ssa_conn = NULL;
-		sosctp_assoc_free(ss, ssa);
-
-		so_unlock_single(so, SOLOCKED);
-		mutex_exit(&so->so_lock);
-
-		return (0);
-
-err:
-		/* Only the fd has been set up: give it back. */
-		setf(nfd, NULL);
-		eprintsoline(so, error);
-		return (error);
-
-peelerr:
-		/* Undo the new socket, the association hold and the fd. */
-		mutex_exit(&nso->so_lock);
-		mutex_enter(&so->so_lock);
-		ASSERT(nso->so_count == 1);
-		nso->so_count = 0;
-		so_unlock_single(so, SOLOCKED);
-		SSA_REFRELE(ss, ssa);
-		mutex_exit(&so->so_lock);
-		/* held in VOP_OPEN() */
-		ddi_rele_driver(getmajor(nso->so_dev));
-		setf(nfd, NULL);
-		ASSERT(nvp->v_count == 1);
-		VN_RELE(nvp);
-		eprintsoline(so, error);
-		return (error);
-	}
-	default:
-		return (EINVAL);
-	}
-}
-
-/*
- * Accept any file flags.  The only effect is to mirror FNDELAY and
- * FNONBLOCK from the new flag word into SS_NDELAY and SS_NONBLOCK in
- * so_state, so that an acceptor can inherit them from its listener.
- * Always returns 0.
- */
-/* ARGSUSED */
-static int
-socksctp_setfl(vnode_t *vp, int oflags, int nflags, cred_t *cr,
-    caller_context_t *ct)
-{
-	struct sonode *sock = VTOSO(vp);
-
-	mutex_enter(&sock->so_lock);
-
-	if ((nflags & FNDELAY) == 0) {
-		sock->so_state &= ~SS_NDELAY;
-	} else {
-		sock->so_state |= SS_NDELAY;
-	}
-
-	if ((nflags & FNONBLOCK) == 0) {
-		sock->so_state &= ~SS_NONBLOCK;
-	} else {
-		sock->so_state |= SS_NONBLOCK;
-	}
-
-	mutex_exit(&sock->so_lock);
-
-	return (0);
-}
-
-/*
- * Inactive entry point for SCTP socket vnodes.
- *
- * Drops one hold on the vnode (panicking if v_count is already below 1).
- * If that was not the last hold the function returns; otherwise it
- * closes the SCTP connection of every association still attached to the
- * socket, closes the socket's own connection (so_priv) if there is one,
- * and frees the socket with sosctp_free().
- */
-/*ARGSUSED*/
-static void
-socksctpv_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ct)
-{
- struct sonode *so;
- struct sctp_sonode *ss;
- struct sctp_sa_id *ssi;
- struct sctp_soassoc *ssa;
- int32_t i;
-
- so = VTOSO(vp);
- ss = SOTOSSO(so);
-
- mutex_enter(&vp->v_lock);
- /*
- * If no one has reclaimed the vnode, remove from the
- * cache now.
- */
- if (vp->v_count < 1)
- cmn_err(CE_PANIC, "socksctpv_inactive: Bad v_count");
-
- /*
- * Drop the temporary hold by vn_rele now
- */
- if (--vp->v_count != 0) {
- mutex_exit(&vp->v_lock);
- return;
- }
- mutex_exit(&vp->v_lock);
-
- /* We are the sole owner of so now */
-
- /*
- * New associations can't come in, but old ones might get
- * closed in upcall. Protect against that by taking a reference
- * on the association.
- */
- mutex_enter(&so->so_lock);
-
- ssi = ss->ss_assocs;
- for (i = 0; i < ss->ss_maxassoc; i++, ssi++) {
- if ((ssa = ssi->ssi_assoc) != NULL) {
- SSA_REFHOLD(ssa);
- mutex_exit(&so->so_lock); /* sctp_close() is called without so_lock */
-
- sctp_close(ssa->ssa_conn);
-
- mutex_enter(&so->so_lock);
- ssa->ssa_conn = NULL;
- /* NOTE(review): no SSA_REFRELE here; presumably the free consumes the hold */
- sosctp_assoc_free(ss, ssa);
- }
- }
- mutex_exit(&so->so_lock);
-
- ASSERT(!vn_has_cached_data(vp));
- if (so->so_priv) {
- sctp_close(so->so_priv);
- }
- so->so_priv = NULL;
- sosctp_free(so);
-}
-
-/*
- * Poll entry point for SCTP socket vnodes.
- *
- * Check socktpi_poll() on why so_lock is not held in this function.
- *
- * Reports, through *reventsp, which of the requested events are ready.
- * When nothing is ready and 'anyyet' is zero, *phpp is pointed at the
- * socket's poll list.  Always returns 0.
- */
-/*ARGSUSED5*/
-static int
-socksctpv_poll(struct vnode *vp, short events, int anyyet, short *reventsp,
-    struct pollhead **phpp, caller_context_t *ct)
-{
-	struct sonode *so = VTOSO(vp);
-	struct sctp_sonode *ss = SOTOSSO(so);
-	int state = so->so_state;	/* one unlocked snapshot */
-	short errevents = (POLLIN|POLLRDNORM|POLLOUT) & events;
-	short ready = 0;
-
-	ASSERT(vp->v_type == VSOCK);
-	ASSERT(vp->v_stream == NULL);
-	ASSERT(so->so_version != SOV_STREAM);
-
-	/*
-	 * Not connected yet - turn off write side events
-	 */
-	if ((so->so_type == SOCK_STREAM) && !(state & SS_ISCONNECTED))
-		events &= ~(POLLOUT|POLLWRBAND);
-
-	/*
-	 * A pending error is reported against whichever read/write events
-	 * the caller originally asked for, before the masking above.
-	 */
-	if (so->so_error != 0 && errevents != 0) {
-		*reventsp = errevents;
-		return (0);
-	}
-
-	/*
-	 * A stream socket is writable only once the queued TX data has
-	 * dropped below the low watermark; other types are always writable.
-	 */
-	if (so->so_type != SOCK_STREAM || ss->ss_txqueued < so->so_sndlowat)
-		ready |= POLLOUT & events;
-
-	/* Readable: queued data, a pending connection, or end of input. */
-	if (ss->ss_rxdata)
-		ready |= (POLLIN|POLLRDNORM) & events;
-	if ((state & (SS_HASCONNIND|SS_CANTRCVMORE)) != 0)
-		ready |= (POLLIN|POLLRDNORM) & events;
-
-	*reventsp = ready;
-	if (ready == 0 && !anyyet)
-		*phpp = &ss->ss_poll_list;
-
-	return (0);
-}
diff --git a/usr/src/uts/common/fs/sockfs/socksdp.h b/usr/src/uts/common/fs/sockfs/socksdp.h
deleted file mode 100755
index 68231bb0e5..0000000000
--- a/usr/src/uts/common/fs/sockfs/socksdp.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef _SOCKSDP_H_
-#define _SOCKSDP_H_
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * SDP socket structure.
- *
- * The opaque pointer passed in upcalls is a pointer to sdp_sonode.
- *
- * The generic sonode is embedded as ss_so; SOTOSDO() in this header maps
- * a pointer to that member back to the enclosing sdp_sonode.
- *
- * NOTE(review): the ss_type comment mentions "soassoc", but the only
- * type value defined in this header is SOSDP_SOCKET -- confirm.
- */
-struct sdp_sonode {
- int ss_type; /* sonode or soassoc */
- struct sonode ss_so;
- struct sockaddr_in6 ss_laddr; /* can fit both v4 & v6 */
- struct sockaddr_in6 ss_faddr;
- int ss_rxqueued; /* queued # of conn */
- struct pollhead ss_poll_list;
-};
-
-extern sdp_upcalls_t sosdp_sock_upcalls;
-extern struct vnodeops *socksdp_vnodeops;
-extern const fs_operation_def_t socksdp_vnodeops_template[];
-
-extern void sosdp_free(struct sonode *so);
-extern int sosdp_chgpgrp(struct sdp_sonode *ss, pid_t pid);
-extern void sosdp_sendsig(struct sdp_sonode *ss, int event);
-
-extern int sosdp_bind(struct sonode *so, struct sockaddr *name,
- socklen_t namelen, int flags);
-extern int sosdp_recvmsg(struct sonode *, struct nmsghdr *, struct uio *);
-
-extern int sosdp_waitconnected(struct sonode *so, int fmode);
-
-extern void sosdp_so_inherit(struct sdp_sonode *lss, struct sdp_sonode *nss);
-
-/*
- * Data structure types.
- */
-#define	SOSDP_SOCKET	0x1
-
-/*
- * Map a pointer to the embedded sonode (the ss_so member) back to the
- * sdp_sonode that contains it.  The argument is parenthesised so that an
- * expression such as "p + 1" is cast as a whole rather than having the
- * cast bind to "p" alone.
- */
-#define	SOTOSDO(so)	((struct sdp_sonode *)(((char *)(so)) - \
-			offsetof(struct sdp_sonode, ss_so)))
-
-/*
- * Event flags to sosdp_sendsig().
- */
-#define	SDPSIG_WRITE	0x1
-#define	SDPSIG_READ	0x2
-#define	SDPSIG_URG	0x4
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SOCKSDP_H_ */
diff --git a/usr/src/uts/common/fs/sockfs/socksdpsubr.c b/usr/src/uts/common/fs/sockfs/socksdpsubr.c
deleted file mode 100755
index 357c61db3d..0000000000
--- a/usr/src/uts/common/fs/sockfs/socksdpsubr.c
+++ /dev/null
@@ -1,214 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/types.h>
-#include <sys/t_lock.h>
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/debug.h>
-#include <sys/errno.h>
-#include <sys/strsubr.h>
-#include <sys/cmn_err.h>
-#include <sys/sysmacros.h>
-
-#include <sys/vfs.h>
-#include <sys/vfs_opreg.h>
-
-#include <sys/socket.h>
-#include <sys/socketvar.h>
-#include <sys/strsun.h>
-#include <sys/signal.h>
-
-#include <inet/sdp_itf.h>
-#include "socksdp.h"
-
-
-/*
- * Wait until the socket is connected or there is an error.
- * fmode should contain any nonblocking flags.
- *
- * The caller must hold so_lock.
- *
- * Returns 0 once the socket is connected, EINPROGRESS when the connect
- * is still pending and fmode is non-blocking, the pending socket error
- * if one is set, EINTR when the wait is interrupted by a signal, and
- * ECONNREFUSED when the wait ends with the socket neither connected nor
- * carrying an error.
- */
-int
-sosdp_waitconnected(struct sonode *so, int fmode)
-{
-	/*
-	 * Result for the case where none of the branches below assigns
-	 * one: the socket left the connecting state without becoming
-	 * connected and without a pending error (e.g. another thread has
-	 * already consumed so_error).  Previously this path returned an
-	 * uninitialized value.
-	 */
-	int error = ECONNREFUSED;
-
-	ASSERT(MUTEX_HELD(&so->so_lock));
-	ASSERT((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) ||
-	    so->so_error != 0);
-
-	while ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) ==
-	    SS_ISCONNECTING && so->so_error == 0) {
-
-		dprint(3, ("waiting for SS_ISCONNECTED on %p\n", (void *)so));
-		if (fmode & (FNDELAY|FNONBLOCK))
-			return (EINPROGRESS);
-
-		if (!cv_wait_sig_swap(&so->so_state_cv, &so->so_lock)) {
-			/*
-			 * Return EINTR and let the application use
-			 * nonblocking techniques for detecting when
-			 * the connection has been established.
-			 */
-			error = EINTR;
-			break;
-		}
-		dprint(3, ("awoken on %p\n", (void *)so));
-	}
-
-	/* A pending error or a completed connect overrides EINTR. */
-	if (so->so_error != 0) {
-		error = sogeterr(so);
-		ASSERT(error != 0);
-		dprint(3, ("sosdp_waitconnected: error %d\n", error));
-	} else if (so->so_state & SS_ISCONNECTED) {
-		error = 0;
-	}
-	return (error);
-}
-
-
-/*
- * Set the process or process group that receives SIGIO for this socket.
- * The caller must hold so_lock.  A non-zero pid is first checked by
- * sending it signal 0; if that fails the error is returned and so_pgrp
- * is left unchanged.  Returns 0 on success.
- */
-int
-sosdp_chgpgrp(struct sdp_sonode *ss, pid_t pid)
-{
-	struct sonode *so = &ss->ss_so;
-	int rv;
-
-	ASSERT(MUTEX_HELD(&so->so_lock));
-
-	/*
-	 * Permissions check by sending signal 0.
-	 * Note that when kill fails it does a
-	 * set_errno causing the system call to fail.
-	 */
-	if (pid != 0 && (rv = kill(pid, 0)) != 0)
-		return (rv);
-
-	so->so_pgrp = pid;
-	return (0);
-}
-
-
-/*
- * Post the signals selected by 'event' to one process.  A write event
- * queues SIGPOLL with a siginfo whose code is POLL_OUT; a read event
- * sends a plain SIGPOLL; an urgent event sends SIGURG.
- */
-/*ARGSUSED*/
-static void
-sosdp_sigproc(proc_t *proc, int event)
-{
-	if (event & SDPSIG_WRITE) {
-		k_siginfo_t si;
-
-		si.si_signo = SIGPOLL;
-		si.si_code = POLL_OUT;
-		si.si_errno = 0;
-		si.si_fd = 0;
-		si.si_band = 0;
-		sigaddq(proc, NULL, &si, KM_NOSLEEP);
-	}
-
-	if (event & SDPSIG_READ)
-		sigtoproc(proc, NULL, SIGPOLL);
-
-	if (event & SDPSIG_URG)
-		sigtoproc(proc, NULL, SIGURG);
-}
-
-/*
- * Send the signals selected by 'event' to the owner recorded in so_pgrp.
- * The caller must hold so_lock.  Nothing is sent when so_pgrp is 0, or
- * when SS_ASYNC is clear and the event is anything other than
- * SDPSIG_URG.  A positive so_pgrp names a single process; a negative one
- * names a process group, whose members are each signalled.
- */
-void
-sosdp_sendsig(struct sdp_sonode *ss, int event)
-{
- proc_t *proc;
- struct sonode *so = &ss->ss_so;
-
- ASSERT(MUTEX_HELD(&ss->ss_so.so_lock));
-
- if (so->so_pgrp == 0 || (!(so->so_state & SS_ASYNC) &&
- event != SDPSIG_URG)) {
- return;
- }
-
- dprint(3, ("sending sig %d to %d\n", event, so->so_pgrp));
-
- if (so->so_pgrp > 0) {
- /*
- * XXX This unfortunately still generates
- * a signal when a fd is closed but
- * the proc is active.
- */
- mutex_enter(&pidlock);
- proc = prfind(so->so_pgrp);
- if (proc == NULL) {
- mutex_exit(&pidlock);
- return;
- }
- mutex_enter(&proc->p_lock); /* take p_lock before letting go of pidlock */
- mutex_exit(&pidlock);
- sosdp_sigproc(proc, event);
- mutex_exit(&proc->p_lock);
- } else {
- /*
- * Send to process group. Hold pidlock across
- * calls to sosdp_sigproc().
- */
- pid_t pgrp = -so->so_pgrp;
-
- mutex_enter(&pidlock);
- proc = pgfind(pgrp);
- while (proc != NULL) {
- mutex_enter(&proc->p_lock);
- sosdp_sigproc(proc, event);
- mutex_exit(&proc->p_lock);
- proc = proc->p_pglink; /* next member of the group */
- }
- mutex_exit(&pidlock);
- }
-}
-
-
-/*
- * Copy the inheritable socket properties from one SDP socket (lss) to
- * another (nss): a fixed subset of the option bits, the send/receive
- * buffer sizes and low watermarks, and the SIGIO owner.
- */
-void
-sosdp_so_inherit(struct sdp_sonode *lss, struct sdp_sonode *nss)
-{
-	struct sonode *from = &lss->ss_so;
-	struct sonode *to = &nss->ss_so;
-
-	/* Only these option bits carry over; everything else is cleared. */
-	to->so_options = from->so_options & (SO_DEBUG|SO_REUSEADDR|
-	    SO_KEEPALIVE|SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
-	    SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
-
-	to->so_sndbuf = from->so_sndbuf;
-	to->so_sndlowat = from->so_sndlowat;
-	to->so_rcvbuf = from->so_rcvbuf;
-	to->so_rcvlowat = from->so_rcvlowat;
-
-	to->so_pgrp = from->so_pgrp;
-}
diff --git a/usr/src/uts/common/fs/sockfs/socksdpvnops.c b/usr/src/uts/common/fs/sockfs/socksdpvnops.c
deleted file mode 100644
index 395599daab..0000000000
--- a/usr/src/uts/common/fs/sockfs/socksdpvnops.c
+++ /dev/null
@@ -1,535 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#include <sys/types.h>
-#include <sys/t_lock.h>
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/buf.h>
-#include <sys/debug.h>
-#include <sys/errno.h>
-#include <sys/uio.h>
-#include <sys/vfs.h>
-#include <sys/vfs_opreg.h>
-#include <sys/vnode.h>
-#include <sys/stropts.h>
-#include <sys/cmn_err.h>
-#include <sys/sysmacros.h>
-#include <sys/stream.h>
-#include <sys/strsun.h>
-
-#include <sys/socket.h>
-#include <sys/socketvar.h>
-
-#include <sys/project.h>
-#include <sys/strsubr.h>
-
-#include <fs/fs_subr.h>
-
-#include <sys/esunddi.h>
-#include <sys/ddi.h>
-
-#include <sys/filio.h>
-#include <sys/sockio.h>
-
-#include <inet/sdp_itf.h>
-#include "socksdp.h"
-
-/*
- * SDP sockfs vnode operations
- */
-static int socksdpv_open(struct vnode **, int, struct cred *,
- caller_context_t *);
-static int socksdpv_close(struct vnode *, int, int, offset_t,
- struct cred *, caller_context_t *);
-static int socksdpv_read(struct vnode *, struct uio *, int, struct cred *,
- caller_context_t *);
-static int socksdpv_write(struct vnode *, struct uio *, int, struct cred *,
- caller_context_t *);
-static int socksdpv_ioctl(struct vnode *, int, intptr_t, int,
- struct cred *, int32_t *, caller_context_t *);
-static int socksdp_setfl(vnode_t *, int, int, cred_t *, caller_context_t *);
-static void socksdpv_inactive(struct vnode *, struct cred *,
- caller_context_t *);
-static int socksdpv_poll(struct vnode *, short, int, short *,
- struct pollhead **, caller_context_t *);
-
-const fs_operation_def_t socksdp_vnodeops_template[] = {
- VOPNAME_OPEN, { .vop_open = socksdpv_open },
- VOPNAME_CLOSE, { .vop_close = socksdpv_close },
- VOPNAME_READ, { .vop_read = socksdpv_read },
- VOPNAME_WRITE, { .vop_write = socksdpv_write },
- VOPNAME_IOCTL, { .vop_ioctl = socksdpv_ioctl },
- VOPNAME_SETFL, { .vop_setfl = socksdp_setfl },
- VOPNAME_GETATTR, { .vop_getattr = socktpi_getattr },
- VOPNAME_SETATTR, { .vop_setattr = socktpi_setattr },
- VOPNAME_ACCESS, { .vop_access = socktpi_access },
- VOPNAME_FSYNC, { .vop_fsync = socktpi_fsync },
- VOPNAME_INACTIVE, { .vop_inactive = socksdpv_inactive },
- VOPNAME_FID, { .vop_fid = socktpi_fid },
- VOPNAME_SEEK, { .vop_seek = socktpi_seek },
- VOPNAME_POLL, { .vop_poll = socksdpv_poll },
- VOPNAME_DISPOSE, { .error = fs_error },
- NULL, NULL
-};
-struct vnodeops *socksdp_vnodeops;
-
-/*ARGSUSED3*/
-static int
-socksdpv_open(struct vnode **vpp, int flag, struct cred *cr,
- caller_context_t *ct)
-{
- struct sonode *so;
- struct sdp_sonode *ss;
- struct vnode *vp = *vpp;
- int error = EPROTONOSUPPORT; /* in case sdpib fails to load */
- sdp_sockbuf_limits_t sbl;
- sdp_upcalls_t *upcalls;
-
- flag &= ~FCREAT; /* paranoia */
-
- so = VTOSO(vp);
- ss = SOTOSDO(so);
-
- mutex_enter(&so->so_lock);
- so->so_count++; /* one more open reference */
- ASSERT(so->so_count != 0); /* wraparound */
- mutex_exit(&so->so_lock);
-
- ASSERT(vp->v_type == VSOCK);
-
- if (flag & SO_ACCEPTOR) {
- ASSERT(so->so_type == SOCK_STREAM);
- return (0);
- }
-
- /*
- * Active open.
- */
- upcalls = &sosdp_sock_upcalls;
-
- /*
- * When the necessary hardware is not available, the sdp_create stub
- * will evaluate to nomod_zero, which leaves 'error' untouched. Hence
- * the EPROTONOSUPPORT above. A successful call to sdp_create clears
- * the error.
- */
- so->so_priv = sdp_create(ss, NULL, so->so_family, SDP_CAN_BLOCK,
- upcalls, &sbl, cr, &error);
- if (so->so_priv == NULL) {
- ASSERT(error != 0);
- mutex_enter(&so->so_lock);
- ASSERT(so->so_count > 0);
- so->so_count--; /* one less open reference */
- mutex_exit(&so->so_lock);
- return (error);
- }
- so->so_rcvbuf = sbl.sbl_rxbuf;
- so->so_rcvlowat = sbl.sbl_rxlowat;
- so->so_sndbuf = sbl.sbl_txbuf;
- so->so_sndlowat = sbl.sbl_txlowat;
-
- return (error);
-}
-
-/*ARGSUSED*/
-static int
-socksdpv_close(struct vnode *vp, int flag, int count, offset_t offset,
- struct cred *cr, caller_context_t *ct)
-{
- int sendsig = 0;
- int error = 0;
- struct sonode *so;
- struct sdp_sonode *ss;
-
- so = VTOSO(vp);
- ss = SOTOSDO(so);
-
- cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
- cleanshares(vp, ttoproc(curthread)->p_pid);
-
- ASSERT(vp->v_stream == NULL);
- if (count > 1) {
- dprint(2, ("socksdpv_close: count %d\n", count));
- return (0);
- }
-
- mutex_enter(&so->so_lock);
- so_lock_single(so); /* Set SOLOCKED */
- ASSERT(so->so_count > 0);
- so->so_count--; /* one fewer open reference */
-
- dprint(2, ("socksdpv_close: %p so_count %d\n", (void *)so,
- so->so_count));
-
- if (so->so_count == 0) {
- /*
- * Need to set flags as there might be ops in progress on
- * this socket.
- *
- * If socket already disconnected/disconnecting,
- * don't send signal (again).
- */
- if (!(so->so_state & SS_CANTRCVMORE))
- sendsig |= SDPSIG_READ;
- if (!(so->so_state & SS_CANTSENDMORE))
- sendsig |= SDPSIG_WRITE;
- soisdisconnected(so, 0);
- mutex_exit(&so->so_lock);
-
- /*
- * Initiate connection shutdown.
- */
- error = sdp_disconnect(so->so_priv, flag);
-
- mutex_enter(&so->so_lock);
- if (sendsig != 0)
- sosdp_sendsig(ss, sendsig);
- mutex_exit(&so->so_lock);
-
- pollwakeup(&ss->ss_poll_list, POLLIN|POLLRDNORM|POLLOUT);
- }
- mutex_enter(&so->so_lock);
- so_unlock_single(so, SOLOCKED);
- mutex_exit(&so->so_lock);
-
- return (error);
-}
-
-/*ARGSUSED2*/
-static int
-socksdpv_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr,
- caller_context_t *ct)
-{
- struct sonode *so = VTOSO(vp);
- struct nmsghdr lmsg;
-
- if (so->so_type != SOCK_STREAM) {
- return (EOPNOTSUPP);
- }
-
- ASSERT(vp->v_type == VSOCK);
- so_update_attrs(so, SOACC);
- lmsg.msg_namelen = 0;
- lmsg.msg_controllen = 0;
- lmsg.msg_flags = 0;
- return (sosdp_recvmsg(so, &lmsg, uiop));
-}
-
-/*
- * Send data, see sosdp_sendmsg()
- */
-/*ARGSUSED2*/
-static int
-socksdpv_write(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr,
- caller_context_t *ct)
-{
- struct sonode *so;
- ssize_t count;
- int error;
- int flags = 0;
-
- so = VTOSO(vp);
-
- mutex_enter(&so->so_lock);
- if (so->so_state & SS_CANTSENDMORE) {
- mutex_exit(&so->so_lock);
- tsignal(curthread, SIGPIPE);
- return (EPIPE);
- }
-
- if (so->so_error != 0) {
- error = sogeterr(so);
- if (error != 0) {
- mutex_exit(&so->so_lock);
- return (error);
- }
- }
-
- if (uiop->uio_fmode & (FNDELAY|FNONBLOCK))
- flags |= MSG_DONTWAIT;
-
- if (!(so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED))) {
- mutex_exit(&so->so_lock);
- return (ENOTCONN);
- }
- count = uiop->uio_resid;
- mutex_exit(&so->so_lock);
-
- if (count == 0) {
- return (0);
- }
- so_update_attrs(so, SOMOD);
-
- error = sdp_send(so->so_priv, NULL, count, flags, uiop);
- return (error);
-}
-
-/*ARGSUSED4*/
-static int
-socksdpv_ioctl(struct vnode *vp, int cmd, intptr_t arg, int mode,
- struct cred *cr, int32_t *rvalp, caller_context_t *ct)
-{
- struct sonode *so;
- struct sdp_sonode *ss;
- int32_t value;
- int error, intval;
- pid_t pid;
-
- so = VTOSO(vp);
- ss = SOTOSDO(so);
-
- /* handle socket specific ioctls */
- switch (cmd) {
- case FIONBIO:
- if (so_copyin((void *)arg, &value, sizeof (int32_t),
- (mode & (int)FKIOCTL))) {
- return (EFAULT);
- }
- mutex_enter(&so->so_lock);
- if (value != 0) {
- so->so_state |= SS_NDELAY;
- } else {
- so->so_state &= ~SS_NDELAY;
- }
- mutex_exit(&so->so_lock);
- return (0);
-
- case FIOASYNC:
- if (so_copyin((void *)arg, &value, sizeof (int32_t),
- (mode & (int)FKIOCTL))) {
- return (EFAULT);
- }
- mutex_enter(&so->so_lock);
-
- if (value) {
- /* Turn on SIGIO */
- so->so_state |= SS_ASYNC;
- } else {
- /* Turn off SIGIO */
- so->so_state &= ~SS_ASYNC;
- }
- mutex_exit(&so->so_lock);
- return (0);
-
- case SIOCSPGRP:
- case FIOSETOWN:
- if (so_copyin((void *)arg, &pid, sizeof (pid_t),
- (mode & (int)FKIOCTL))) {
- return (EFAULT);
- }
- mutex_enter(&so->so_lock);
-
- error = (pid != so->so_pgrp) ? sosdp_chgpgrp(ss, pid) : 0;
- mutex_exit(&so->so_lock);
- return (error);
-
- case SIOCGPGRP:
- case FIOGETOWN:
- if (so_copyout(&so->so_pgrp, (void *)arg,
- sizeof (pid_t), (mode & (int)FKIOCTL)))
- return (EFAULT);
- return (0);
-
- case SIOCATMARK:
- intval = 0;
- error = sdp_ioctl(so->so_priv, cmd, &intval, cr);
- if (so_copyout(&intval, (void *)arg, sizeof (int),
- (mode & (int)FKIOCTL)))
- return (EFAULT);
- return (0);
-
-
- case SIOCSENABLESDP: {
- int32_t enable;
-
- /*
- * System wide enable SDP
- */
-
- if (so_copyin((void *)arg, &enable, sizeof (int32_t),
- mode & (int)FKIOCTL))
- return (EFAULT);
-
- error = sdp_ioctl(so->so_priv, cmd, &enable, cr);
- if (so_copyout(&enable, (void *)arg,
- sizeof (int32_t), (mode & (int)FKIOCTL)))
- return (EFAULT);
- return (0);
- }
- /* from strioctl */
- case FIONREAD:
- /*
- * Return number of bytes of data in all data messages
- * in queue in "arg".
- * For stream socket, amount of available data.
- */
- if (so->so_state & SS_ACCEPTCONN) {
- intval = 0;
- } else {
- mutex_enter(&so->so_lock);
- intval = sdp_polldata(so->so_priv, SDP_READ);
- mutex_exit(&so->so_lock);
- }
- if (so_copyout(&intval, (void *)arg, sizeof (intval),
- (mode & (int)FKIOCTL)))
- return (EFAULT);
- return (0);
- default:
- return (EINVAL);
- }
-
-}
-
-/*
- * Allow any flags. Record FNDELAY and FNONBLOCK so that they can be inherited
- * from listener to acceptor.
- */
-/* ARGSUSED */
-static int
-socksdp_setfl(vnode_t *vp, int oflags, int nflags, cred_t *cr,
- caller_context_t *ct)
-{
- struct sonode *so;
-
- so = VTOSO(vp);
-
- mutex_enter(&so->so_lock);
- if (nflags & FNDELAY)
- so->so_state |= SS_NDELAY;
- else
- so->so_state &= ~SS_NDELAY;
- if (nflags & FNONBLOCK)
- so->so_state |= SS_NONBLOCK;
- else
- so->so_state &= ~SS_NONBLOCK;
- mutex_exit(&so->so_lock);
- return (0);
-}
-
-/*ARGSUSED*/
-static void
-socksdpv_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ct)
-{
- struct sonode *so;
-
- so = VTOSO(vp);
-
- mutex_enter(&vp->v_lock);
- /*
- * If no one has reclaimed the vnode, remove from the
- * cache now.
- */
- if (vp->v_count < 1)
- cmn_err(CE_PANIC, "socksdpv_inactive: Bad v_count");
-
- /*
- * Drop the temporary hold by vn_rele now
- */
- if (--vp->v_count != 0) {
- mutex_exit(&vp->v_lock);
- return;
- }
- mutex_exit(&vp->v_lock);
-
- /* We are the sole owner of so now */
-
- ASSERT(!vn_has_cached_data(vp));
- if (so->so_priv) {
- sdp_close(so->so_priv);
- }
- so->so_priv = NULL;
- sosdp_free(so);
-}
-
-/*
- * Check socktpi_poll() on why so_lock is not held in this function.
- */
-/*ARGSUSED5*/
-static int
-socksdpv_poll(struct vnode *vp, short events, int anyyet, short *reventsp,
- struct pollhead **phpp, caller_context_t *ct)
-{
- struct sonode *so;
- struct sdp_sonode *ss;
- short origevents = events;
- int so_state;
-
- so = VTOSO(vp);
- ss = SOTOSDO(so);
- so_state = so->so_state;
-
-
- ASSERT(vp->v_type == VSOCK);
- ASSERT(vp->v_stream == NULL);
- ASSERT(so->so_version != SOV_STREAM);
-
- if (!(so_state & SS_ISCONNECTED) && (so->so_type == SOCK_STREAM)) {
- /*
- * Not connected yet - turn off write side events
- */
- events &= ~(POLLOUT|POLLWRBAND);
- }
-
- /*
- * Check for errors
- */
- if (so->so_error != 0 &&
- ((POLLIN|POLLRDNORM|POLLOUT) & origevents) != 0) {
- *reventsp = (POLLIN|POLLRDNORM|POLLOUT) & origevents;
- return (0);
- }
-
- *reventsp = 0;
-
- /*
- * Don't mark socket as writable until TX queued data is
- * below watermark.
- */
- if (so->so_type == SOCK_STREAM) {
- if (sdp_polldata(so->so_priv, SDP_XMIT)) {
- *reventsp |= POLLOUT & events;
- }
- } else {
- *reventsp = 0;
- goto done;
- }
-
- if (sdp_polldata(so->so_priv, SDP_READ)) {
- *reventsp |= (POLLIN|POLLRDNORM) & events;
- }
-
- if ((so_state & (SS_HASCONNIND|SS_CANTRCVMORE)) != 0) {
- *reventsp |= (POLLIN|POLLRDNORM) & events;
- }
-
-done:
- if (!*reventsp && !anyyet) {
- *phpp = &ss->ss_poll_list;
- }
-
- return (0);
-}
diff --git a/usr/src/uts/common/fs/sockfs/sockssl.c b/usr/src/uts/common/fs/sockfs/sockssl.c
index 037805e6da..8df1d3fe58 100644
--- a/usr/src/uts/common/fs/sockfs/sockssl.c
+++ b/usr/src/uts/common/fs/sockfs/sockssl.c
@@ -23,8 +23,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
@@ -43,8 +41,9 @@
#include <sys/sockio.h>
#include <sys/socketvar.h>
-#include <inet/kssl/ksslapi.h>
+#include <fs/sockfs/socktpi.h>
+#include <inet/kssl/ksslapi.h>
/*
* This routine is registered with the stream head to be called by kstrgetmsg()
@@ -61,7 +60,7 @@ strsock_kssl_input(vnode_t *vp, mblk_t *mp,
strsigset_t *allmsgsigs, strpollset_t *pollwakeups)
{
struct sonode *so = VTOSO(vp);
- kssl_ctx_t kssl_ctx = so->so_kssl_ctx;
+ kssl_ctx_t kssl_ctx = SOTOTPI(so)->sti_kssl_ctx;
kssl_cmd_t kssl_cmd;
mblk_t *out;
@@ -101,7 +100,7 @@ strsock_kssl_output(vnode_t *vp, mblk_t *mp,
strsigset_t *allmsgsigs, strpollset_t *pollwakeups)
{
struct sonode *so = VTOSO(vp);
- kssl_ctx_t kssl_ctx = so->so_kssl_ctx;
+ kssl_ctx_t kssl_ctx = SOTOTPI(so)->sti_kssl_ctx;
mblk_t *recmp;
dprintso(so, 1, ("strsock_kssl_output(%p, %p)\n",
diff --git a/usr/src/uts/common/fs/sockfs/sockstr.c b/usr/src/uts/common/fs/sockfs/sockstr.c
index b783a27251..71c8d4c49c 100644
--- a/usr/src/uts/common/fs/sockfs/sockstr.c
+++ b/usr/src/uts/common/fs/sockfs/sockstr.c
@@ -51,13 +51,15 @@
#include <sys/cmn_err.h>
#include <sys/proc.h>
#include <sys/ddi.h>
-#include <sys/kmem_impl.h>
#include <sys/suntpi.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/socketvar.h>
+#include <sys/sodirect.h>
#include <netinet/in.h>
+#include <inet/common.h>
+#include <inet/proto_set.h>
#include <sys/tiuser.h>
#define _SUN_TPI_VERSION 2
@@ -67,6 +69,8 @@
#include <c2/audit.h>
+#include <fs/sockfs/socktpi.h>
+#include <fs/sockfs/socktpi_impl.h>
#include <sys/dcopy.h>
int so_default_version = SOV_SOCKSTREAM;
@@ -115,13 +119,9 @@ static mblk_t *strsock_proto(vnode_t *vp, mblk_t *mp,
static mblk_t *strsock_misc(vnode_t *vp, mblk_t *mp,
strwakeup_t *wakeups, strsigset_t *firstmsgsigs,
strsigset_t *allmsgsigs, strpollset_t *pollwakeups);
-
-static int tlitosyserr(int terr);
-
/*
- * Sodirect kmem_cache and put/wakeup functions.
+ * STREAMS based sodirect put/wakeup functions.
*/
-struct kmem_cache *socktpi_sod_cache;
static int sodput(sodirect_t *, mblk_t *);
static void sodwakeup(sodirect_t *);
@@ -131,10 +131,7 @@ static void sodwakeup(sodirect_t *);
int
sostr_init()
{
- /* Allocate sodirect_t kmem_cache */
- socktpi_sod_cache = kmem_cache_create("socktpi_sod_cache",
- sizeof (sodirect_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
-
+ sod_init();
return (0);
}
@@ -151,15 +148,16 @@ so_sock2stream(struct sonode *so)
queue_t *rq;
mblk_t *mp;
int error = 0;
+ sotpi_info_t *sti = SOTOTPI(so);
- ASSERT(MUTEX_HELD(&so->so_plumb_lock));
+ ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));
mutex_enter(&so->so_lock);
so_lock_single(so);
ASSERT(so->so_version != SOV_STREAM);
- if (so->so_state & SS_DIRECT) {
+ if (sti->sti_direct) {
mblk_t **mpp;
int rval;
@@ -175,9 +173,9 @@ so_sock2stream(struct sonode *so)
"_SIOCSOCKFALLBACK failed\n", (void *)so));
goto exit;
}
- so->so_state &= ~SS_DIRECT;
+ sti->sti_direct = 0;
- for (mpp = &so->so_conn_ind_head; (mp = *mpp) != NULL;
+ for (mpp = &sti->sti_conn_ind_head; (mp = *mpp) != NULL;
mpp = &mp->b_next) {
struct T_conn_ind *conn_ind;
@@ -236,7 +234,7 @@ so_sock2stream(struct sonode *so)
}
so->so_version = SOV_STREAM;
- so->so_priv = NULL;
+ so->so_proto_handle = NULL;
/*
* Remove the hooks in the stream head to avoid queuing more
@@ -251,20 +249,20 @@ so_sock2stream(struct sonode *so)
* on the queue - the behavior of urgent data after a switch is
* left undefined.
*/
- so->so_error = so->so_delayed_error = 0;
+ so->so_error = sti->sti_delayed_error = 0;
freemsg(so->so_oobmsg);
so->so_oobmsg = NULL;
- so->so_oobsigcnt = so->so_oobcnt = 0;
+ sti->sti_oobsigcnt = sti->sti_oobcnt = 0;
so->so_state &= ~(SS_RCVATMARK|SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA|
- SS_HASCONNIND|SS_SAVEDEOR);
+ SS_SAVEDEOR);
ASSERT(so_verify_oobstate(so));
- freemsg(so->so_ack_mp);
- so->so_ack_mp = NULL;
+ freemsg(sti->sti_ack_mp);
+ sti->sti_ack_mp = NULL;
/*
- * Flush the T_DISCON_IND on so_discon_ind_mp.
+ * Flush the T_DISCON_IND on sti_discon_ind_mp.
*/
so_flush_discon_ind(so);
@@ -272,16 +270,15 @@ so_sock2stream(struct sonode *so)
* Move any queued T_CONN_IND messages to stream head queue.
*/
rq = RD(strvp2wq(vp));
- while ((mp = so->so_conn_ind_head) != NULL) {
- so->so_conn_ind_head = mp->b_next;
+ while ((mp = sti->sti_conn_ind_head) != NULL) {
+ sti->sti_conn_ind_head = mp->b_next;
mp->b_next = NULL;
- if (so->so_conn_ind_head == NULL) {
- ASSERT(so->so_conn_ind_tail == mp);
- so->so_conn_ind_tail = NULL;
+ if (sti->sti_conn_ind_head == NULL) {
+ ASSERT(sti->sti_conn_ind_tail == mp);
+ sti->sti_conn_ind_tail = NULL;
}
dprintso(so, 0,
- ("so_sock2stream(%p): moving T_CONN_IND\n",
- (void *)so));
+ ("so_sock2stream(%p): moving T_CONN_IND\n", (void *)so));
/* Drop lock across put() */
mutex_exit(&so->so_lock);
@@ -311,14 +308,15 @@ void
so_stream2sock(struct sonode *so)
{
struct vnode *vp = SOTOV(so);
+ sotpi_info_t *sti = SOTOTPI(so);
- ASSERT(MUTEX_HELD(&so->so_plumb_lock));
+ ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));
mutex_enter(&so->so_lock);
so_lock_single(so);
ASSERT(so->so_version == SOV_STREAM);
so->so_version = SOV_SOCKSTREAM;
- so->so_pushcnt = 0;
+ sti->sti_pushcnt = 0;
mutex_exit(&so->so_lock);
/*
@@ -350,7 +348,7 @@ so_stream2sock(struct sonode *so)
mutex_enter(&so->so_lock);
/*
- * Flush the T_DISCON_IND on so_discon_ind_mp.
+ * Flush the T_DISCON_IND on sti_discon_ind_mp.
*/
so_flush_discon_ind(so);
so_unlock_read(so); /* Clear SOREADLOCKED */
@@ -388,25 +386,18 @@ so_removehooks(struct sonode *so)
*/
}
-/*
- * Initialize the streams side of a socket including
- * T_info_req/ack processing. If tso is not NULL its values are used thereby
- * avoiding the T_INFO_REQ.
- */
-int
-so_strinit(struct sonode *so, struct sonode *tso)
+void
+so_basic_strinit(struct sonode *so)
{
struct vnode *vp = SOTOV(so);
struct stdata *stp;
mblk_t *mp;
- int error;
-
- dprintso(so, 1, ("so_strinit(%p)\n", (void *)so));
+ sotpi_info_t *sti = SOTOTPI(so);
/* Preallocate an unbind_req message */
mp = soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP);
mutex_enter(&so->so_lock);
- so->so_unbind_mp = mp;
+ sti->sti_unbind_mp = mp;
#ifdef DEBUG
so->so_options = so_default_options;
#endif /* DEBUG */
@@ -414,6 +405,40 @@ so_strinit(struct sonode *so, struct sonode *tso)
so_installhooks(so);
+ stp = vp->v_stream;
+ /*
+ * Have to keep minpsz at zero in order to allow write/send of zero
+ * bytes.
+ */
+ mutex_enter(&stp->sd_lock);
+ if (stp->sd_qn_minpsz == 1)
+ stp->sd_qn_minpsz = 0;
+ mutex_exit(&stp->sd_lock);
+
+ /*
+ * If sodirect capable allocate and initialize sodirect_t.
+ * Note, SS_SODIRECT is set in socktpi_open().
+ */
+ if ((so->so_state & SS_SODIRECT) &&
+ !(so->so_state & SS_FALLBACK_PENDING)) {
+ sod_sock_init(so, stp, sodput, sodwakeup, &stp->sd_lock);
+ }
+}
+
+/*
+ * Initialize the streams side of a socket including
+ * T_info_req/ack processing. If tso is not NULL its values are used thereby
+ * avoiding the T_INFO_REQ.
+ */
+int
+so_strinit(struct sonode *so, struct sonode *tso)
+{
+ sotpi_info_t *sti = SOTOTPI(so);
+ sotpi_info_t *tsti;
+ int error;
+
+ so_basic_strinit(so);
+
/*
* The T_CAPABILITY_REQ should be the first message sent down because
* at least TCP has a fast-path for this which avoids timeouts while
@@ -424,19 +449,21 @@ so_strinit(struct sonode *so, struct sonode *tso)
if (error)
return (error);
} else {
+ tsti = SOTOTPI(tso);
+
mutex_enter(&so->so_lock);
- so->so_tsdu_size = tso->so_tsdu_size;
- so->so_etsdu_size = tso->so_etsdu_size;
- so->so_addr_size = tso->so_addr_size;
- so->so_opt_size = tso->so_opt_size;
- so->so_tidu_size = tso->so_tidu_size;
- so->so_serv_type = tso->so_serv_type;
+ sti->sti_tsdu_size = tsti->sti_tsdu_size;
+ sti->sti_etsdu_size = tsti->sti_etsdu_size;
+ sti->sti_addr_size = tsti->sti_addr_size;
+ sti->sti_opt_size = tsti->sti_opt_size;
+ sti->sti_tidu_size = tsti->sti_tidu_size;
+ sti->sti_serv_type = tsti->sti_serv_type;
so->so_mode = tso->so_mode & ~SM_ACCEPTOR_ID;
mutex_exit(&so->so_lock);
/* the following do_tcapability may update so->so_mode */
- if ((tso->so_serv_type != T_CLTS) &&
- !(tso->so_state & SS_DIRECT)) {
+ if ((tsti->sti_serv_type != T_CLTS) &&
+ (sti->sti_direct == 0)) {
error = do_tcapability(so, TC1_ACCEPTOR_ID);
if (error)
return (error);
@@ -448,73 +475,19 @@ so_strinit(struct sonode *so, struct sonode *tso)
* We set the addr_size to something to allocate a the address
* structures.
*/
- if (so->so_addr_size == 0) {
+ if (sti->sti_addr_size == 0) {
so->so_state |= SS_ISBOUND | SS_ISCONNECTED;
/* Address size can vary with address families. */
if (so->so_family == AF_INET6)
- so->so_addr_size =
+ sti->sti_addr_size =
(t_scalar_t)sizeof (struct sockaddr_in6);
else
- so->so_addr_size =
+ sti->sti_addr_size =
(t_scalar_t)sizeof (struct sockaddr_in);
- ASSERT(so->so_unbind_mp);
+ ASSERT(sti->sti_unbind_mp);
}
- /*
- * Allocate the addresses.
- */
- ASSERT(so->so_laddr_sa == NULL && so->so_faddr_sa == NULL);
- ASSERT(so->so_laddr_len == 0 && so->so_faddr_len == 0);
- so->so_laddr_maxlen = so->so_faddr_maxlen =
- P2ROUNDUP(so->so_addr_size, KMEM_ALIGN);
- so->so_laddr_sa = kmem_alloc(so->so_laddr_maxlen * 2, KM_SLEEP);
- so->so_faddr_sa = (struct sockaddr *)((caddr_t)so->so_laddr_sa
- + so->so_laddr_maxlen);
-
- if (so->so_family == AF_UNIX) {
- /*
- * Initialize AF_UNIX related fields.
- */
- bzero(&so->so_ux_laddr, sizeof (so->so_ux_laddr));
- bzero(&so->so_ux_faddr, sizeof (so->so_ux_faddr));
- }
-
- stp = vp->v_stream;
- /*
- * Have to keep minpsz at zero in order to allow write/send of zero
- * bytes.
- */
- mutex_enter(&stp->sd_lock);
- if (stp->sd_qn_minpsz == 1)
- stp->sd_qn_minpsz = 0;
- mutex_exit(&stp->sd_lock);
- /*
- * If sodirect capable allocate and initialize sodirect_t.
- * Note, SS_SODIRECT is set in socktpi_open().
- */
- if (so->so_state & SS_SODIRECT) {
- sodirect_t *sodp;
-
- ASSERT(so->so_direct == NULL);
-
- sodp = kmem_cache_alloc(socktpi_sod_cache, KM_SLEEP);
- sodp->sod_state = SOD_ENABLED | SOD_WAKE_NOT;
- sodp->sod_want = 0;
- sodp->sod_q = RD(stp->sd_wrq);
- sodp->sod_enqueue = sodput;
- sodp->sod_wakeup = sodwakeup;
- sodp->sod_uioafh = NULL;
- sodp->sod_uioaft = NULL;
- sodp->sod_lockp = &stp->sd_lock;
- /*
- * Remainder of the sod_uioa members are left uninitialized
- * but will be initialized later by uioainit() before uioa
- * is enabled.
- */
- sodp->sod_uioa.uioa_state = UIOA_ALLOC;
- so->so_direct = sodp;
- stp->sd_sodirect = sodp;
- }
+ so_alloc_addr(so, sti->sti_addr_size);
return (0);
}
@@ -522,25 +495,28 @@ so_strinit(struct sonode *so, struct sonode *tso)
static void
copy_tinfo(struct sonode *so, struct T_info_ack *tia)
{
- so->so_tsdu_size = tia->TSDU_size;
- so->so_etsdu_size = tia->ETSDU_size;
- so->so_addr_size = tia->ADDR_size;
- so->so_opt_size = tia->OPT_size;
- so->so_tidu_size = tia->TIDU_size;
- so->so_serv_type = tia->SERV_type;
+ sotpi_info_t *sti = SOTOTPI(so);
+
+ sti->sti_tsdu_size = tia->TSDU_size;
+ sti->sti_etsdu_size = tia->ETSDU_size;
+ sti->sti_addr_size = tia->ADDR_size;
+ sti->sti_opt_size = tia->OPT_size;
+ sti->sti_tidu_size = tia->TIDU_size;
+ sti->sti_serv_type = tia->SERV_type;
switch (tia->CURRENT_state) {
case TS_UNBND:
break;
case TS_IDLE:
so->so_state |= SS_ISBOUND;
- so->so_laddr_len = 0;
- so->so_state &= ~SS_LADDR_VALID;
+ sti->sti_laddr_len = 0;
+ sti->sti_laddr_valid = 0;
break;
case TS_DATA_XFER:
so->so_state |= SS_ISBOUND|SS_ISCONNECTED;
- so->so_laddr_len = 0;
- so->so_faddr_len = 0;
- so->so_state &= ~(SS_LADDR_VALID | SS_FADDR_VALID);
+ sti->sti_laddr_len = 0;
+ sti->sti_faddr_len = 0;
+ sti->sti_laddr_valid = 0;
+ sti->sti_faddr_valid = 0;
break;
}
@@ -550,11 +526,11 @@ copy_tinfo(struct sonode *so, struct T_info_ack *tia)
* and SM_EXDATA, SM_OPTDATA, and SM_BYTESTREAM)
* from the info ack.
*/
- if (so->so_serv_type == T_CLTS) {
+ if (sti->sti_serv_type == T_CLTS) {
so->so_mode |= SM_ATOMIC | SM_ADDR;
} else {
so->so_mode |= SM_CONNREQUIRED;
- if (so->so_etsdu_size != 0 && so->so_etsdu_size != -2)
+ if (sti->sti_etsdu_size != 0 && sti->sti_etsdu_size != -2)
so->so_mode |= SM_EXDATA;
}
if (so->so_type == SOCK_SEQPACKET || so->so_type == SOCK_RAW) {
@@ -563,9 +539,9 @@ copy_tinfo(struct sonode *so, struct T_info_ack *tia)
}
if (so->so_family == AF_UNIX) {
so->so_mode |= SM_FDPASSING | SM_OPTDATA;
- if (so->so_addr_size == -1) {
+ if (sti->sti_addr_size == -1) {
/* MAXPATHLEN + soun_family + nul termination */
- so->so_addr_size = (t_scalar_t)(MAXPATHLEN +
+ sti->sti_addr_size = (t_scalar_t)(MAXPATHLEN +
sizeof (short) + 1);
}
if (so->so_type == SOCK_STREAM) {
@@ -573,60 +549,62 @@ copy_tinfo(struct sonode *so, struct T_info_ack *tia)
* Make it into a byte-stream transport.
* SOCK_SEQPACKET sockets are unchanged.
*/
- so->so_tsdu_size = 0;
+ sti->sti_tsdu_size = 0;
}
- } else if (so->so_addr_size == -1) {
+ } else if (sti->sti_addr_size == -1) {
/*
* Logic extracted from sockmod - have to pick some max address
* length in order to preallocate the addresses.
*/
- so->so_addr_size = SOA_DEFSIZE;
+ sti->sti_addr_size = SOA_DEFSIZE;
}
- if (so->so_tsdu_size == 0)
+ if (sti->sti_tsdu_size == 0)
so->so_mode |= SM_BYTESTREAM;
}
static int
check_tinfo(struct sonode *so)
{
+ sotpi_info_t *sti = SOTOTPI(so);
+
/* Consistency checks */
- if (so->so_type == SOCK_DGRAM && so->so_serv_type != T_CLTS) {
+ if (so->so_type == SOCK_DGRAM && sti->sti_serv_type != T_CLTS) {
eprintso(so, ("service type and socket type mismatch\n"));
eprintsoline(so, EPROTO);
return (EPROTO);
}
- if (so->so_type == SOCK_STREAM && so->so_serv_type == T_CLTS) {
+ if (so->so_type == SOCK_STREAM && sti->sti_serv_type == T_CLTS) {
eprintso(so, ("service type and socket type mismatch\n"));
eprintsoline(so, EPROTO);
return (EPROTO);
}
- if (so->so_type == SOCK_SEQPACKET && so->so_serv_type == T_CLTS) {
+ if (so->so_type == SOCK_SEQPACKET && sti->sti_serv_type == T_CLTS) {
eprintso(so, ("service type and socket type mismatch\n"));
eprintsoline(so, EPROTO);
return (EPROTO);
}
if (so->so_family == AF_INET &&
- so->so_addr_size != (t_scalar_t)sizeof (struct sockaddr_in)) {
+ sti->sti_addr_size != (t_scalar_t)sizeof (struct sockaddr_in)) {
eprintso(so,
("AF_INET must have sockaddr_in address length. Got %d\n",
- so->so_addr_size));
+ sti->sti_addr_size));
eprintsoline(so, EMSGSIZE);
return (EMSGSIZE);
}
if (so->so_family == AF_INET6 &&
- so->so_addr_size != (t_scalar_t)sizeof (struct sockaddr_in6)) {
+ sti->sti_addr_size != (t_scalar_t)sizeof (struct sockaddr_in6)) {
eprintso(so,
("AF_INET6 must have sockaddr_in6 address length. Got %d\n",
- so->so_addr_size));
+ sti->sti_addr_size));
eprintsoline(so, EMSGSIZE);
return (EMSGSIZE);
}
dprintso(so, 1, (
"tinfo: serv %d tsdu %d, etsdu %d, addr %d, opt %d, tidu %d\n",
- so->so_serv_type, so->so_tsdu_size, so->so_etsdu_size,
- so->so_addr_size, so->so_opt_size,
- so->so_tidu_size));
+ sti->sti_serv_type, sti->sti_tsdu_size, sti->sti_etsdu_size,
+ sti->sti_addr_size, sti->sti_opt_size,
+ sti->sti_tidu_size));
dprintso(so, 1, ("tinfo: so_state %s\n",
pr_state(so->so_state, so->so_mode)));
return (0);
@@ -646,7 +624,7 @@ do_tinfo(struct sonode *so)
ASSERT(MUTEX_NOT_HELD(&so->so_lock));
if (so_no_tinfo) {
- so->so_addr_size = 0;
+ SOTOTPI(so)->sti_addr_size = 0;
return (0);
}
@@ -697,16 +675,17 @@ do_tcapability(struct sonode *so, t_uscalar_t cap_bits1)
struct T_capability_ack *tca;
mblk_t *mp;
int error;
+ sotpi_info_t *sti = SOTOTPI(so);
ASSERT(cap_bits1 != 0);
ASSERT((cap_bits1 & ~(TC1_ACCEPTOR_ID | TC1_INFO)) == 0);
ASSERT(MUTEX_NOT_HELD(&so->so_lock));
- if (so->so_provinfo->tpi_capability == PI_NO)
+ if (sti->sti_provinfo->tpi_capability == PI_NO)
return (do_tinfo(so));
if (so_no_tinfo) {
- so->so_addr_size = 0;
+ sti->sti_addr_size = 0;
if ((cap_bits1 &= ~TC1_INFO) == 0)
return (0);
}
@@ -737,10 +716,10 @@ do_tcapability(struct sonode *so, t_uscalar_t cap_bits1)
if ((error = sowaitprim(so, T_CAPABILITY_REQ, T_CAPABILITY_ACK,
(t_uscalar_t)sizeof (*tca), &mp, sock_capability_timeout * hz))) {
mutex_exit(&so->so_lock);
- PI_PROVLOCK(so->so_provinfo);
- if (so->so_provinfo->tpi_capability == PI_DONTKNOW)
- so->so_provinfo->tpi_capability = PI_NO;
- PI_PROVUNLOCK(so->so_provinfo);
+ PI_PROVLOCK(sti->sti_provinfo);
+ if (sti->sti_provinfo->tpi_capability == PI_DONTKNOW)
+ sti->sti_provinfo->tpi_capability = PI_NO;
+ PI_PROVUNLOCK(sti->sti_provinfo);
ASSERT((so->so_mode & SM_ACCEPTOR_ID) == 0);
if (cap_bits1 & TC1_INFO) {
/*
@@ -758,27 +737,14 @@ do_tcapability(struct sonode *so, t_uscalar_t cap_bits1)
return (0);
}
- if (so->so_provinfo->tpi_capability == PI_DONTKNOW) {
- PI_PROVLOCK(so->so_provinfo);
- so->so_provinfo->tpi_capability = PI_YES;
- PI_PROVUNLOCK(so->so_provinfo);
- }
-
ASSERT(mp);
tca = (struct T_capability_ack *)mp->b_rptr;
ASSERT((cap_bits1 & TC1_INFO) == (tca->CAP_bits1 & TC1_INFO));
+ so_proc_tcapability_ack(so, tca);
cap_bits1 = tca->CAP_bits1;
- if (cap_bits1 & TC1_ACCEPTOR_ID) {
- so->so_acceptor_id = tca->ACCEPTOR_id;
- so->so_mode |= SM_ACCEPTOR_ID;
- }
-
- if (cap_bits1 & TC1_INFO)
- copy_tinfo(so, &tca->INFO_ack);
-
mutex_exit(&so->so_lock);
freemsg(mp);
@@ -789,17 +755,41 @@ do_tcapability(struct sonode *so, t_uscalar_t cap_bits1)
}
/*
- * Retrieve and clear the socket error.
+ * Process a T_CAPABILITY_ACK
+ */
+void
+so_proc_tcapability_ack(struct sonode *so, struct T_capability_ack *tca)
+{
+ sotpi_info_t *sti = SOTOTPI(so);
+
+ if (sti->sti_provinfo->tpi_capability == PI_DONTKNOW) {
+ PI_PROVLOCK(sti->sti_provinfo);
+ sti->sti_provinfo->tpi_capability = PI_YES;
+ PI_PROVUNLOCK(sti->sti_provinfo);
+ }
+
+ if (tca->CAP_bits1 & TC1_ACCEPTOR_ID) {
+ sti->sti_acceptor_id = tca->ACCEPTOR_id;
+ so->so_mode |= SM_ACCEPTOR_ID;
+ }
+
+ if (tca->CAP_bits1 & TC1_INFO)
+ copy_tinfo(so, &tca->INFO_ack);
+}
+
+/*
+ * Retrieve socket error, clear error if not peek.
*/
int
-sogeterr(struct sonode *so)
+sogeterr(struct sonode *so, boolean_t clear_err)
{
int error;
ASSERT(MUTEX_HELD(&so->so_lock));
error = so->so_error;
- so->so_error = 0;
+ if (clear_err)
+ so->so_error = 0;
return (error);
}
@@ -898,8 +888,7 @@ void
soisdisconnected(struct sonode *so, int error)
{
ASSERT(MUTEX_HELD(&so->so_lock));
- so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING|
- SS_LADDR_VALID|SS_FADDR_VALID);
+ so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE);
so->so_error = (ushort_t)error;
if (so->so_peercred != NULL) {
@@ -935,7 +924,7 @@ void
socantsendmore(struct sonode *so)
{
ASSERT(MUTEX_HELD(&so->so_lock));
- so->so_state = so->so_state & ~SS_FADDR_VALID | SS_CANTSENDMORE;
+ so->so_state |= SS_CANTSENDMORE;
cv_broadcast(&so->so_state_cv);
}
@@ -1013,13 +1002,11 @@ sowaitprim(struct sonode *so, t_scalar_t request_prim, t_scalar_t ack_prim,
if (tpr->error_ack.TLI_error == TSYSERR) {
error = tpr->error_ack.UNIX_error;
} else {
- error = tlitosyserr(tpr->error_ack.TLI_error);
+ error = proto_tlitosyserr(tpr->error_ack.TLI_error);
}
dprintso(so, 0, ("error_ack for %d: %d/%d ->%d\n",
- tpr->error_ack.ERROR_prim,
- tpr->error_ack.TLI_error,
- tpr->error_ack.UNIX_error,
- error));
+ tpr->error_ack.ERROR_prim, tpr->error_ack.TLI_error,
+ tpr->error_ack.UNIX_error, error));
freemsg(mp);
return (error);
}
@@ -1029,13 +1016,11 @@ sowaitprim(struct sonode *so, t_scalar_t request_prim, t_scalar_t ack_prim,
#ifdef DEBUG
if (tpr->type == T_ERROR_ACK) {
dprintso(so, 0, ("error_ack for %d: %d/%d\n",
- tpr->error_ack.ERROR_prim,
- tpr->error_ack.TLI_error,
+ tpr->error_ack.ERROR_prim, tpr->error_ack.TLI_error,
tpr->error_ack.UNIX_error));
} else if (tpr->type == T_OK_ACK) {
dprintso(so, 0, ("ok_ack for %d, expected %d for %d\n",
- tpr->ok_ack.CORRECT_prim,
- ack_prim, request_prim));
+ tpr->ok_ack.CORRECT_prim, ack_prim, request_prim));
} else {
dprintso(so, 0,
("unexpected primitive %d, expected %d for %d\n",
@@ -1066,11 +1051,13 @@ sowaitokack(struct sonode *so, t_scalar_t request_prim)
}
/*
- * Queue a received TPI ack message on so_ack_mp.
+ * Queue a received TPI ack message on sti_ack_mp.
*/
void
soqueueack(struct sonode *so, mblk_t *mp)
{
+ sotpi_info_t *sti = SOTOTPI(so);
+
if (DB_TYPE(mp) != M_PCPROTO) {
zcmn_err(getzoneid(), CE_WARN,
"sockfs: received unexpected M_PROTO TPI ack. Prim %d\n",
@@ -1080,13 +1067,13 @@ soqueueack(struct sonode *so, mblk_t *mp)
}
mutex_enter(&so->so_lock);
- if (so->so_ack_mp != NULL) {
- dprintso(so, 1, ("so_ack_mp already set\n"));
- freemsg(so->so_ack_mp);
- so->so_ack_mp = NULL;
+ if (sti->sti_ack_mp != NULL) {
+ dprintso(so, 1, ("sti_ack_mp already set\n"));
+ freemsg(sti->sti_ack_mp);
+ sti->sti_ack_mp = NULL;
}
- so->so_ack_mp = mp;
- cv_broadcast(&so->so_ack_cv);
+ sti->sti_ack_mp = mp;
+ cv_broadcast(&sti->sti_ack_cv);
mutex_exit(&so->so_lock);
}
@@ -1096,9 +1083,11 @@ soqueueack(struct sonode *so, mblk_t *mp)
int
sowaitack(struct sonode *so, mblk_t **mpp, clock_t wait)
{
+ sotpi_info_t *sti = SOTOTPI(so);
+
ASSERT(MUTEX_HELD(&so->so_lock));
- while (so->so_ack_mp == NULL) {
+ while (sti->sti_ack_mp == NULL) {
#ifdef SOCK_TEST
if (wait == 0 && sock_test_timelimit != 0)
wait = sock_test_timelimit;
@@ -1110,16 +1099,16 @@ sowaitack(struct sonode *so, mblk_t **mpp, clock_t wait)
clock_t now;
time_to_wait(&now, wait);
- if (cv_timedwait(&so->so_ack_cv, &so->so_lock,
+ if (cv_timedwait(&sti->sti_ack_cv, &so->so_lock,
now) == -1) {
eprintsoline(so, ETIME);
return (ETIME);
}
}
else
- cv_wait(&so->so_ack_cv, &so->so_lock);
+ cv_wait(&sti->sti_ack_cv, &so->so_lock);
}
- *mpp = so->so_ack_mp;
+ *mpp = sti->sti_ack_mp;
#ifdef DEBUG
{
union T_primitives *tpr;
@@ -1135,16 +1124,18 @@ sowaitack(struct sonode *so, mblk_t **mpp, clock_t wait)
tpr->type == T_OPTMGMT_ACK);
}
#endif /* DEBUG */
- so->so_ack_mp = NULL;
+ sti->sti_ack_mp = NULL;
return (0);
}
/*
- * Queue a received T_CONN_IND message on so_conn_ind_head/tail.
+ * Queue a received T_CONN_IND message on sti_conn_ind_head/tail.
*/
void
soqueueconnind(struct sonode *so, mblk_t *mp)
{
+ sotpi_info_t *sti = SOTOTPI(so);
+
if (DB_TYPE(mp) != M_PROTO) {
zcmn_err(getzoneid(), CE_WARN,
"sockfs: received unexpected M_PCPROTO T_CONN_IND\n");
@@ -1154,17 +1145,15 @@ soqueueconnind(struct sonode *so, mblk_t *mp)
mutex_enter(&so->so_lock);
ASSERT(mp->b_next == NULL);
- if (so->so_conn_ind_head == NULL) {
- so->so_conn_ind_head = mp;
- so->so_state |= SS_HASCONNIND;
+ if (sti->sti_conn_ind_head == NULL) {
+ sti->sti_conn_ind_head = mp;
} else {
- ASSERT(so->so_state & SS_HASCONNIND);
- ASSERT(so->so_conn_ind_tail->b_next == NULL);
- so->so_conn_ind_tail->b_next = mp;
+ ASSERT(sti->sti_conn_ind_tail->b_next == NULL);
+ sti->sti_conn_ind_tail->b_next = mp;
}
- so->so_conn_ind_tail = mp;
+ sti->sti_conn_ind_tail = mp;
/* Wakeup a single consumer of the T_CONN_IND */
- cv_signal(&so->so_connind_cv);
+ cv_signal(&so->so_acceptq_cv);
mutex_exit(&so->so_lock);
}
@@ -1177,37 +1166,43 @@ int
sowaitconnind(struct sonode *so, int fmode, mblk_t **mpp)
{
mblk_t *mp;
+ sotpi_info_t *sti = SOTOTPI(so);
int error = 0;
ASSERT(MUTEX_NOT_HELD(&so->so_lock));
mutex_enter(&so->so_lock);
check_error:
if (so->so_error) {
- error = sogeterr(so);
+ error = sogeterr(so, B_TRUE);
if (error) {
mutex_exit(&so->so_lock);
return (error);
}
}
- if (so->so_conn_ind_head == NULL) {
+ if (sti->sti_conn_ind_head == NULL) {
if (fmode & (FNDELAY|FNONBLOCK)) {
error = EWOULDBLOCK;
goto done;
}
- if (!cv_wait_sig_swap(&so->so_connind_cv, &so->so_lock)) {
+
+ if (so->so_state & SS_CLOSING) {
+ error = EINTR;
+ goto done;
+ }
+
+ if (!cv_wait_sig_swap(&so->so_acceptq_cv, &so->so_lock)) {
error = EINTR;
goto done;
}
goto check_error;
}
- mp = so->so_conn_ind_head;
- so->so_conn_ind_head = mp->b_next;
+ mp = sti->sti_conn_ind_head;
+ sti->sti_conn_ind_head = mp->b_next;
mp->b_next = NULL;
- if (so->so_conn_ind_head == NULL) {
- ASSERT(so->so_conn_ind_tail == mp);
- so->so_conn_ind_tail = NULL;
- so->so_state &= ~SS_HASCONNIND;
+ if (sti->sti_conn_ind_head == NULL) {
+ ASSERT(sti->sti_conn_ind_tail == mp);
+ sti->sti_conn_ind_tail = NULL;
}
*mpp = mp;
done:
@@ -1225,31 +1220,32 @@ soflushconnind(struct sonode *so, t_scalar_t seqno)
{
mblk_t *prevmp, *mp;
struct T_conn_ind *tci;
+ sotpi_info_t *sti = SOTOTPI(so);
mutex_enter(&so->so_lock);
- for (prevmp = NULL, mp = so->so_conn_ind_head; mp != NULL;
+ for (prevmp = NULL, mp = sti->sti_conn_ind_head; mp != NULL;
prevmp = mp, mp = mp->b_next) {
tci = (struct T_conn_ind *)mp->b_rptr;
if (tci->SEQ_number == seqno) {
dprintso(so, 1,
("t_discon_ind: found T_CONN_IND %d\n", seqno));
/* Deleting last? */
- if (so->so_conn_ind_tail == mp) {
- so->so_conn_ind_tail = prevmp;
+ if (sti->sti_conn_ind_tail == mp) {
+ sti->sti_conn_ind_tail = prevmp;
}
if (prevmp == NULL) {
/* Deleting first */
- so->so_conn_ind_head = mp->b_next;
+ sti->sti_conn_ind_head = mp->b_next;
} else {
prevmp->b_next = mp->b_next;
}
mp->b_next = NULL;
- if (so->so_conn_ind_head == NULL) {
- ASSERT(so->so_conn_ind_tail == NULL);
- so->so_state &= ~SS_HASCONNIND;
- } else {
- ASSERT(so->so_conn_ind_tail != NULL);
- }
+
+ ASSERT((sti->sti_conn_ind_head == NULL &&
+ sti->sti_conn_ind_tail == NULL) ||
+ (sti->sti_conn_ind_head != NULL &&
+ sti->sti_conn_ind_tail != NULL));
+
so->so_error = ECONNABORTED;
mutex_exit(&so->so_lock);
@@ -1295,6 +1291,9 @@ sowaitconnected(struct sonode *so, int fmode, int nosig)
if (fmode & (FNDELAY|FNONBLOCK))
return (EINPROGRESS);
+ if (so->so_state & SS_CLOSING)
+ return (EINTR);
+
if (nosig)
cv_wait(&so->so_state_cv, &so->so_lock);
else if (!cv_wait_sig_swap(&so->so_state_cv, &so->so_lock)) {
@@ -1309,7 +1308,7 @@ sowaitconnected(struct sonode *so, int fmode, int nosig)
}
if (so->so_error != 0) {
- error = sogeterr(so);
+ error = sogeterr(so, B_TRUE);
ASSERT(error != 0);
dprintso(so, 1, ("sowaitconnected: error %d\n", error));
return (error);
@@ -1335,11 +1334,13 @@ static void
so_oob_sig(struct sonode *so, int extrasig,
strsigset_t *signals, strpollset_t *pollwakeups)
{
+ sotpi_info_t *sti = SOTOTPI(so);
+
ASSERT(MUTEX_HELD(&so->so_lock));
ASSERT(so_verify_oobstate(so));
- ASSERT(so->so_oobsigcnt >= so->so_oobcnt);
- if (so->so_oobsigcnt > so->so_oobcnt) {
+ ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
+ if (sti->sti_oobsigcnt > sti->sti_oobcnt) {
/*
* Signal has already been generated once for this
* urgent "event". However, since TCP can receive updated
@@ -1353,9 +1354,9 @@ so_oob_sig(struct sonode *so, int extrasig,
return;
}
- so->so_oobsigcnt++;
- ASSERT(so->so_oobsigcnt > 0); /* Wraparound */
- ASSERT(so->so_oobsigcnt > so->so_oobcnt);
+ sti->sti_oobsigcnt++;
+ ASSERT(sti->sti_oobsigcnt > 0); /* Wraparound */
+ ASSERT(sti->sti_oobsigcnt > sti->sti_oobcnt);
/*
* Record (for select/poll) that urgent data is pending.
@@ -1385,15 +1386,17 @@ static mblk_t *
so_oob_exdata(struct sonode *so, mblk_t *mp,
strsigset_t *signals, strpollset_t *pollwakeups)
{
+ sotpi_info_t *sti = SOTOTPI(so);
+
ASSERT(MUTEX_HELD(&so->so_lock));
ASSERT(so_verify_oobstate(so));
- ASSERT(so->so_oobsigcnt > so->so_oobcnt);
+ ASSERT(sti->sti_oobsigcnt > sti->sti_oobcnt);
- so->so_oobcnt++;
- ASSERT(so->so_oobcnt > 0); /* wraparound? */
- ASSERT(so->so_oobsigcnt >= so->so_oobcnt);
+ sti->sti_oobcnt++;
+ ASSERT(sti->sti_oobcnt > 0); /* wraparound? */
+ ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
/*
* Set MSGMARK for SIOCATMARK.
@@ -1412,11 +1415,13 @@ static mblk_t *
so_oob_data(struct sonode *so, mblk_t *mp,
strsigset_t *signals, strpollset_t *pollwakeups)
{
+ sotpi_info_t *sti = SOTOTPI(so);
+
ASSERT(MUTEX_HELD(&so->so_lock));
ASSERT(so_verify_oobstate(so));
- ASSERT(so->so_oobsigcnt >= so->so_oobcnt);
+ ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
ASSERT(mp != NULL);
/*
* For OOBINLINE we keep the data in the T_EXDATA_IND.
@@ -1439,7 +1444,7 @@ so_oob_data(struct sonode *so, mblk_t *mp,
/*
* Caller must hold the mutex.
* For delayed processing, save the T_DISCON_IND received
- * from below on so_discon_ind_mp.
+ * from below on sti_discon_ind_mp.
* When the message is processed the framework will call:
* (*func)(so, mp);
*/
@@ -1448,14 +1453,16 @@ so_save_discon_ind(struct sonode *so,
mblk_t *mp,
void (*func)(struct sonode *so, mblk_t *))
{
+ sotpi_info_t *sti = SOTOTPI(so);
+
ASSERT(MUTEX_HELD(&so->so_lock));
/*
* Discard new T_DISCON_IND if we have already received another.
- * Currently the earlier message can either be on so_discon_ind_mp
+ * Currently the earlier message can either be on sti_discon_ind_mp
* or being processed.
*/
- if (so->so_discon_ind_mp != NULL || (so->so_flag & SOASYNC_UNBIND)) {
+ if (sti->sti_discon_ind_mp != NULL || (so->so_flag & SOASYNC_UNBIND)) {
zcmn_err(getzoneid(), CE_WARN,
"sockfs: received unexpected additional T_DISCON_IND\n");
freemsg(mp);
@@ -1463,13 +1470,13 @@ so_save_discon_ind(struct sonode *so,
}
mp->b_prev = (mblk_t *)func;
mp->b_next = NULL;
- so->so_discon_ind_mp = mp;
+ sti->sti_discon_ind_mp = mp;
}
/*
* Caller must hold the mutex and make sure that either SOLOCKED
* or SOASYNC_UNBIND is set. Called from so_unlock_single().
- * Perform delayed processing of T_DISCON_IND message on so_discon_ind_mp.
+ * Perform delayed processing of T_DISCON_IND message on sti_discon_ind_mp.
* Need to ensure that strsock_proto() will not end up sleeping for
* SOASYNC_UNBIND, while executing this function.
*/
@@ -1478,13 +1485,14 @@ so_drain_discon_ind(struct sonode *so)
{
mblk_t *bp;
void (*func)(struct sonode *so, mblk_t *);
+ sotpi_info_t *sti = SOTOTPI(so);
ASSERT(MUTEX_HELD(&so->so_lock));
ASSERT(so->so_flag & (SOLOCKED|SOASYNC_UNBIND));
- /* Process T_DISCON_IND on so_discon_ind_mp */
- if ((bp = so->so_discon_ind_mp) != NULL) {
- so->so_discon_ind_mp = NULL;
+ /* Process T_DISCON_IND on sti_discon_ind_mp */
+ if ((bp = sti->sti_discon_ind_mp) != NULL) {
+ sti->sti_discon_ind_mp = NULL;
func = (void (*)())bp->b_prev;
bp->b_prev = NULL;
@@ -1502,20 +1510,21 @@ so_drain_discon_ind(struct sonode *so)
/*
* Caller must hold the mutex.
- * Remove the T_DISCON_IND on so_discon_ind_mp.
+ * Remove the T_DISCON_IND on sti_discon_ind_mp.
*/
void
so_flush_discon_ind(struct sonode *so)
{
mblk_t *bp;
+ sotpi_info_t *sti = SOTOTPI(so);
ASSERT(MUTEX_HELD(&so->so_lock));
/*
- * Remove T_DISCON_IND mblk at so_discon_ind_mp.
+ * Remove T_DISCON_IND mblk at sti_discon_ind_mp.
*/
- if ((bp = so->so_discon_ind_mp) != NULL) {
- so->so_discon_ind_mp = NULL;
+ if ((bp = sti->sti_discon_ind_mp) != NULL) {
+ sti->sti_discon_ind_mp = NULL;
bp->b_prev = NULL;
freemsg(bp);
}
@@ -1526,9 +1535,9 @@ so_flush_discon_ind(struct sonode *so)
*
* This function is used to process the T_DISCON_IND message. It does
* immediate processing when called from strsock_proto and delayed
- * processing of discon_ind saved on so_discon_ind_mp when called from
+ * processing of discon_ind saved on sti_discon_ind_mp when called from
* so_drain_discon_ind. When a T_DISCON_IND message is saved in
- * so_discon_ind_mp for delayed processing, this function is registered
+ * sti_discon_ind_mp for delayed processing, this function is registered
* as the callback function to process the message.
*
* SOASYNC_UNBIND should be held in this function, during the non-blocking
@@ -1549,6 +1558,7 @@ strsock_discon_ind(struct sonode *so, mblk_t *discon_mp)
struct T_unbind_req *ubr;
mblk_t *mp;
int error;
+ sotpi_info_t *sti = SOTOTPI(so);
ASSERT(MUTEX_HELD(&so->so_lock));
ASSERT(discon_mp);
@@ -1571,6 +1581,8 @@ strsock_discon_ind(struct sonode *so, mblk_t *discon_mp)
* is the errno name space.
*/
soisdisconnected(so, tpr->discon_ind.DISCON_reason);
+ sti->sti_laddr_valid = 0;
+ sti->sti_faddr_valid = 0;
/*
* Unbind with the transport without blocking.
@@ -1581,14 +1593,14 @@ strsock_discon_ind(struct sonode *so, mblk_t *discon_mp)
*
* If the socket is not bound, no need to unbind.
*/
- mp = so->so_unbind_mp;
+ mp = sti->sti_unbind_mp;
if (mp == NULL) {
ASSERT(!(so->so_state & SS_ISBOUND));
mutex_exit(&so->so_lock);
} else if (!(so->so_state & SS_ISBOUND)) {
mutex_exit(&so->so_lock);
} else {
- so->so_unbind_mp = NULL;
+ sti->sti_unbind_mp = NULL;
/*
* Is another T_DISCON_IND being processed.
@@ -1602,7 +1614,8 @@ strsock_discon_ind(struct sonode *so, mblk_t *discon_mp)
*/
so->so_flag |= SOASYNC_UNBIND;
ASSERT(!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)));
- so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN|SS_LADDR_VALID);
+ so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN);
+ sti->sti_laddr_valid = 0;
mutex_exit(&so->so_lock);
/*
@@ -1686,8 +1699,10 @@ strsock_proto(vnode_t *vp, mblk_t *mp,
{
union T_primitives *tpr;
struct sonode *so;
+ sotpi_info_t *sti;
so = VTOSO(vp);
+ sti = SOTOTPI(so);
dprintso(so, 1, ("strsock_proto(%p, %p)\n", (void *)vp, (void *)mp));
@@ -1849,11 +1864,11 @@ strsock_proto(vnode_t *vp, mblk_t *mp,
*/
struct sockaddr_in *faddr, *sin;
- /* Prevent so_faddr_sa from changing while accessed */
+ /* Prevent sti_faddr_sa from changing while accessed */
mutex_enter(&so->so_lock);
- ASSERT(so->so_faddr_len ==
+ ASSERT(sti->sti_faddr_len ==
(socklen_t)sizeof (struct sockaddr_in));
- faddr = (struct sockaddr_in *)so->so_faddr_sa;
+ faddr = (struct sockaddr_in *)sti->sti_faddr_sa;
sin = (struct sockaddr_in *)addr;
if (addrlen !=
(t_uscalar_t)sizeof (struct sockaddr_in) ||
@@ -1866,11 +1881,10 @@ strsock_proto(vnode_t *vp, mblk_t *mp,
dprintso(so, 0,
("sockfs: T_UNITDATA_IND mismatch: %s",
pr_addr(so->so_family,
- (struct sockaddr *)addr,
- addrlen)));
+ (struct sockaddr *)addr, addrlen)));
dprintso(so, 0, (" - %s\n",
- pr_addr(so->so_family, so->so_faddr_sa,
- (t_uscalar_t)so->so_faddr_len)));
+ pr_addr(so->so_family, sti->sti_faddr_sa,
+ (t_uscalar_t)sti->sti_faddr_len)));
#endif /* DEBUG */
mutex_exit(&so->so_lock);
freemsg(mp);
@@ -1885,11 +1899,11 @@ strsock_proto(vnode_t *vp, mblk_t *mp,
struct sockaddr_in6 *faddr6, *sin6;
static struct in6_addr zeroes; /* inits to all zeros */
- /* Prevent so_faddr_sa from changing while accessed */
+ /* Prevent sti_faddr_sa from changing while accessed */
mutex_enter(&so->so_lock);
- ASSERT(so->so_faddr_len ==
+ ASSERT(sti->sti_faddr_len ==
(socklen_t)sizeof (struct sockaddr_in6));
- faddr6 = (struct sockaddr_in6 *)so->so_faddr_sa;
+ faddr6 = (struct sockaddr_in6 *)sti->sti_faddr_sa;
sin6 = (struct sockaddr_in6 *)addr;
/* XXX could we get a mapped address ::ffff:0.0.0.0 ? */
if (addrlen !=
@@ -1904,11 +1918,10 @@ strsock_proto(vnode_t *vp, mblk_t *mp,
dprintso(so, 0,
("sockfs: T_UNITDATA_IND mismatch: %s",
pr_addr(so->so_family,
- (struct sockaddr *)addr,
- addrlen)));
+ (struct sockaddr *)addr, addrlen)));
dprintso(so, 0, (" - %s\n",
- pr_addr(so->so_family, so->so_faddr_sa,
- (t_uscalar_t)so->so_faddr_len)));
+ pr_addr(so->so_family, sti->sti_faddr_sa,
+ (t_uscalar_t)sti->sti_faddr_len)));
#endif /* DEBUG */
mutex_exit(&so->so_lock);
freemsg(mp);
@@ -2008,6 +2021,7 @@ strsock_proto(vnode_t *vp, mblk_t *mp,
if (so_getopt_unix_close(opt, optlen)) {
mutex_enter(&so->so_lock);
socantsendmore(so);
+ sti->sti_faddr_valid = 0;
mutex_exit(&so->so_lock);
strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
freemsg(mp);
@@ -2045,7 +2059,7 @@ strsock_proto(vnode_t *vp, mblk_t *mp,
*/
dprintso(so, 1,
("T_EXDATA_IND(%p): counts %d/%d state %s\n",
- (void *)vp, so->so_oobsigcnt, so->so_oobcnt,
+ (void *)vp, sti->sti_oobsigcnt, sti->sti_oobcnt,
pr_state(so->so_state, so->so_mode)));
if (msgdsize(mp->b_cont) == 0) {
@@ -2113,8 +2127,8 @@ strsock_proto(vnode_t *vp, mblk_t *mp,
* adjust the OOB count and OOB signal count
* just incremented for the new OOB data.
*/
- so->so_oobcnt--;
- so->so_oobsigcnt--;
+ sti->sti_oobcnt--;
+ sti->sti_oobsigcnt--;
mutex_exit(QLOCK(qp));
mutex_exit(&so->so_lock);
return (NULL);
@@ -2141,15 +2155,15 @@ strsock_proto(vnode_t *vp, mblk_t *mp,
dprintso(so, 1,
("after outofline T_EXDATA_IND(%p): "
"counts %d/%d poll 0x%x sig 0x%x state %s\n",
- (void *)vp, so->so_oobsigcnt,
- so->so_oobcnt, *pollwakeups, *allmsgsigs,
+ (void *)vp, sti->sti_oobsigcnt,
+ sti->sti_oobcnt, *pollwakeups, *allmsgsigs,
pr_state(so->so_state, so->so_mode)));
} else {
dprintso(so, 1,
("after inline T_EXDATA_IND(%p): "
"counts %d/%d poll 0x%x sig 0x%x state %s\n",
- (void *)vp, so->so_oobsigcnt,
- so->so_oobcnt, *pollwakeups, *allmsgsigs,
+ (void *)vp, sti->sti_oobsigcnt,
+ sti->sti_oobcnt, *pollwakeups, *allmsgsigs,
pr_state(so->so_state, so->so_mode)));
}
#endif /* DEBUG */
@@ -2194,13 +2208,15 @@ strsock_proto(vnode_t *vp, mblk_t *mp,
* For AF_UNIX require the identical length.
*/
if (so->so_family == AF_UNIX ?
- addrlen != (t_uscalar_t)sizeof (so->so_ux_laddr) :
- addrlen > (t_uscalar_t)so->so_faddr_maxlen) {
+ addrlen != (t_uscalar_t)sizeof (sti->sti_ux_laddr) :
+ addrlen > (t_uscalar_t)sti->sti_faddr_maxlen) {
zcmn_err(getzoneid(), CE_WARN,
"sockfs: T_conn_con with different "
"length %u/%d\n",
addrlen, conn_con->RES_length);
soisdisconnected(so, EPROTO);
+ sti->sti_laddr_valid = 0;
+ sti->sti_faddr_valid = 0;
mutex_exit(&so->so_lock);
strsetrerror(SOTOV(so), 0, 0, sogetrderr);
strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
@@ -2240,10 +2256,10 @@ strsock_proto(vnode_t *vp, mblk_t *mp,
* Save for getpeername.
*/
if (so->so_family != AF_UNIX) {
- so->so_faddr_len = (socklen_t)addrlen;
- ASSERT(so->so_faddr_len <= so->so_faddr_maxlen);
- bcopy(addr, so->so_faddr_sa, addrlen);
- so->so_state |= SS_FADDR_VALID;
+ sti->sti_faddr_len = (socklen_t)addrlen;
+ ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
+ bcopy(addr, sti->sti_faddr_sa, addrlen);
+ sti->sti_faddr_valid = 1;
}
if (so->so_peercred != NULL)
@@ -2275,7 +2291,7 @@ strsock_proto(vnode_t *vp, mblk_t *mp,
case T_CONN_IND:
/*
* Verify the min size and queue the message on
- * the so_conn_ind_head/tail list.
+ * the sti_conn_ind_head/tail list.
*/
if (MBLKL(mp) < sizeof (struct T_conn_ind)) {
zcmn_err(getzoneid(), CE_WARN,
@@ -2301,7 +2317,7 @@ strsock_proto(vnode_t *vp, mblk_t *mp,
tpr->type = T_CONN_IND;
- fbso = kssl_find_fallback(so->so_kssl_ent);
+ fbso = kssl_find_fallback(sti->sti_kssl_ent);
/*
* No fallback: the remote will timeout and
@@ -2391,6 +2407,7 @@ strsock_proto(vnode_t *vp, mblk_t *mp,
if ((so->so_state & SS_CANTRCVMORE) &&
(so->so_family == AF_UNIX)) {
socantsendmore(so);
+ sti->sti_faddr_valid = 0;
mutex_exit(&so->so_lock);
strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
dprintso(so, 1,
@@ -2468,7 +2485,7 @@ strsock_proto(vnode_t *vp, mblk_t *mp,
/* Compare just IP address and port */
struct sockaddr_in *sin1, *sin2;
- sin1 = (struct sockaddr_in *)so->so_faddr_sa;
+ sin1 = (struct sockaddr_in *)sti->sti_faddr_sa;
sin2 = (struct sockaddr_in *)addr;
if (addrlen == sizeof (struct sockaddr_in) &&
sin1->sin_port == sin2->sin_port &&
@@ -2481,7 +2498,7 @@ strsock_proto(vnode_t *vp, mblk_t *mp,
/* Compare just IP address and port. Not flow */
struct sockaddr_in6 *sin1, *sin2;
- sin1 = (struct sockaddr_in6 *)so->so_faddr_sa;
+ sin1 = (struct sockaddr_in6 *)sti->sti_faddr_sa;
sin2 = (struct sockaddr_in6 *)addr;
if (addrlen == sizeof (struct sockaddr_in6) &&
sin1->sin6_port == sin2->sin6_port &&
@@ -2491,16 +2508,16 @@ strsock_proto(vnode_t *vp, mblk_t *mp,
break;
}
case AF_UNIX:
- faddr = &so->so_ux_faddr;
+ faddr = &sti->sti_ux_faddr;
faddr_len =
- (t_uscalar_t)sizeof (so->so_ux_faddr);
+ (t_uscalar_t)sizeof (sti->sti_ux_faddr);
if (faddr_len == addrlen &&
bcmp(addr, faddr, addrlen) == 0)
match = B_TRUE;
break;
default:
- faddr = so->so_faddr_sa;
- faddr_len = (t_uscalar_t)so->so_faddr_len;
+ faddr = sti->sti_faddr_sa;
+ faddr_len = (t_uscalar_t)sti->sti_faddr_len;
if (faddr_len == addrlen &&
bcmp(addr, faddr, addrlen) == 0)
match = B_TRUE;
@@ -2512,11 +2529,10 @@ strsock_proto(vnode_t *vp, mblk_t *mp,
dprintso(so, 0,
("sockfs: T_UDERR_IND mismatch: %s - ",
pr_addr(so->so_family,
- (struct sockaddr *)addr,
- addrlen)));
+ (struct sockaddr *)addr, addrlen)));
dprintso(so, 0, ("%s\n",
- pr_addr(so->so_family, so->so_faddr_sa,
- so->so_faddr_len)));
+ pr_addr(so->so_family, sti->sti_faddr_sa,
+ sti->sti_faddr_len)));
#endif /* DEBUG */
mutex_exit(&so->so_lock);
freemsg(mp);
@@ -2545,8 +2561,8 @@ strsock_proto(vnode_t *vp, mblk_t *mp,
}
/*
* If the application asked for delayed errors
- * record the T_UDERROR_IND so_eaddr_mp and the reason in
- * so_delayed_error for delayed error posting. If the reason
+ * record the T_UDERROR_IND sti_eaddr_mp and the reason in
+ * sti_delayed_error for delayed error posting. If the reason
* is zero use ECONNRESET.
* Note that delayed error indications do not make sense for
* AF_UNIX sockets since sendto checks that the destination
@@ -2557,15 +2573,15 @@ strsock_proto(vnode_t *vp, mblk_t *mp,
freemsg(mp);
return (NULL);
}
- if (so->so_eaddr_mp != NULL)
- freemsg(so->so_eaddr_mp);
+ if (sti->sti_eaddr_mp != NULL)
+ freemsg(sti->sti_eaddr_mp);
- so->so_eaddr_mp = mp;
+ sti->sti_eaddr_mp = mp;
if (tudi->ERROR_type != 0)
error = tudi->ERROR_type;
else
error = ECONNRESET;
- so->so_delayed_error = (ushort_t)error;
+ sti->sti_delayed_error = (ushort_t)error;
mutex_exit(&so->so_lock);
return (NULL);
}
@@ -2700,8 +2716,10 @@ strsock_misc(vnode_t *vp, mblk_t *mp,
strsigset_t *allmsgsigs, strpollset_t *pollwakeups)
{
struct sonode *so;
+ sotpi_info_t *sti;
so = VTOSO(vp);
+ sti = SOTOTPI(so);
dprintso(so, 1, ("strsock_misc(%p, %p, 0x%x)\n",
(void *)vp, (void *)mp, DB_TYPE(mp)));
@@ -2724,15 +2742,14 @@ strsock_misc(vnode_t *vp, mblk_t *mp,
mutex_enter(&so->so_lock);
dprintso(so, 1,
("SIGURG(%p): counts %d/%d state %s\n",
- (void *)vp, so->so_oobsigcnt,
- so->so_oobcnt,
+ (void *)vp, sti->sti_oobsigcnt, sti->sti_oobcnt,
pr_state(so->so_state, so->so_mode)));
so_oob_sig(so, 1, allmsgsigs, pollwakeups);
dprintso(so, 1,
("after SIGURG(%p): counts %d/%d "
" poll 0x%x sig 0x%x state %s\n",
- (void *)vp, so->so_oobsigcnt,
- so->so_oobcnt, *pollwakeups, *allmsgsigs,
+ (void *)vp, sti->sti_oobsigcnt, sti->sti_oobcnt,
+ *pollwakeups, *allmsgsigs,
pr_state(so->so_state, so->so_mode)));
mutex_exit(&so->so_lock);
}
@@ -2873,53 +2890,118 @@ bad:
return (error);
}
+/*
+ * Wrapper for getmsg. If the socket has been converted to a stream
+ * pass the request to the stream head.
+ *
+ * Returns ENOSTR when the vnode has no stream attached, or when the
+ * socket's so_version is not SOV_STREAM; otherwise returns whatever
+ * strgetmsg() returns for the real socket vnode.
+ */
+int
+sock_getmsg(
+ struct vnode *vp,
+ struct strbuf *mctl,
+ struct strbuf *mdata,
+ uchar_t *prip,
+ int *flagsp,
+ int fmode,
+ rval_t *rvp
+)
+{
+ struct sonode *so;
+
+ ASSERT(vp->v_type == VSOCK);
+ /*
+ * Use the stream head to find the real socket vnode.
+ * This is needed when namefs sits above sockfs. Some
+ * sockets (like SCTP) are not streams.
+ */
+ if (!vp->v_stream) {
+ return (ENOSTR);
+ }
+ ASSERT(vp->v_stream->sd_vnode);
+ vp = vp->v_stream->sd_vnode;
+ ASSERT(vn_matchops(vp, socket_vnodeops));
+ so = VTOSO(vp);
+ dprintso(so, 1, ("sock_getmsg(%p) %s\n",
+ (void *)so, pr_state(so->so_state, so->so_mode)));
+
+ if (so->so_version == SOV_STREAM) {
+ /* The imaginary "sockmod" has been popped - act as a stream */
+ return (strgetmsg(vp, mctl, mdata, prip, flagsp, fmode, rvp));
+ }
+ eprintsoline(so, ENOSTR);
+ return (ENOSTR);
+}
/*
- * Translate a TLI(/XTI) error into a system error as best we can.
+ * Wrapper for putmsg. If the socket has been converted to a stream
+ * pass the request to the stream head.
+ *
+ * Note that while a regular socket (SOV_SOCKSTREAM) does support the
+ * streams ioctl set it does not support putmsg and getmsg.
+ * Allowing putmsg would prevent sockfs from tracking the state of
+ * the socket/transport and would also invalidate the locking in sockfs.
+ *
+ * Returns ENOSTR when the vnode has no stream attached, or when the
+ * socket's so_version is not SOV_STREAM; otherwise returns whatever
+ * strputmsg() returns for the real socket vnode.
*/
-static const int tli_errs[] = {
- 0, /* no error */
- EADDRNOTAVAIL, /* TBADADDR */
- ENOPROTOOPT, /* TBADOPT */
- EACCES, /* TACCES */
- EBADF, /* TBADF */
- EADDRNOTAVAIL, /* TNOADDR */
- EPROTO, /* TOUTSTATE */
- ECONNABORTED, /* TBADSEQ */
- 0, /* TSYSERR - will never get */
- EPROTO, /* TLOOK - should never be sent by transport */
- EMSGSIZE, /* TBADDATA */
- EMSGSIZE, /* TBUFOVFLW */
- EPROTO, /* TFLOW */
- EWOULDBLOCK, /* TNODATA */
- EPROTO, /* TNODIS */
- EPROTO, /* TNOUDERR */
- EINVAL, /* TBADFLAG */
- EPROTO, /* TNOREL */
- EOPNOTSUPP, /* TNOTSUPPORT */
- EPROTO, /* TSTATECHNG */
- /* following represent error namespace expansion with XTI */
- EPROTO, /* TNOSTRUCTYPE - never sent by transport */
- EPROTO, /* TBADNAME - never sent by transport */
- EPROTO, /* TBADQLEN - never sent by transport */
- EADDRINUSE, /* TADDRBUSY */
- EBADF, /* TINDOUT */
- EBADF, /* TPROVMISMATCH */
- EBADF, /* TRESQLEN */
- EBADF, /* TRESADDR */
- EPROTO, /* TQFULL - never sent by transport */
- EPROTO, /* TPROTO */
-};
+int
+sock_putmsg(
+ struct vnode *vp,
+ struct strbuf *mctl,
+ struct strbuf *mdata,
+ uchar_t pri,
+ int flag,
+ int fmode
+)
+{
+ struct sonode *so;
-static int
-tlitosyserr(int terr)
+ ASSERT(vp->v_type == VSOCK);
+ /*
+ * Use the stream head to find the real socket vnode.
+ * This is needed when namefs sits above sockfs.
+ */
+ if (!vp->v_stream) {
+ return (ENOSTR);
+ }
+ ASSERT(vp->v_stream->sd_vnode);
+ vp = vp->v_stream->sd_vnode;
+ ASSERT(vn_matchops(vp, socket_vnodeops));
+ so = VTOSO(vp);
+
+ dprintso(so, 1, ("sock_putmsg(%p) %s\n",
+ (void *)so, pr_state(so->so_state, so->so_mode)));
+
+ if (so->so_version == SOV_STREAM) {
+ /* The imaginary "sockmod" has been popped - act as a stream */
+ return (strputmsg(vp, mctl, mdata, pri, flag, fmode));
+ }
+ eprintsoline(so, ENOSTR);
+ return (ENOSTR);
+}
+
+/*
+ * Special function called only from f_getfl().
+ * Returns FASYNC if the SS_ASYNC flag is set on a socket, else 0.
+ * A socket that has been converted to a stream (so_version ==
+ * SOV_STREAM) always yields 0, whether or not SS_ASYNC is set.
+ * No locks are acquired here, so it is safe to use while uf_lock is held.
+ * This exists solely for BSD fcntl() FASYNC compatibility.
+ */
+int
+sock_getfasync(vnode_t *vp)
{
- ASSERT(terr != TSYSERR);
- if (terr >= (sizeof (tli_errs) / sizeof (tli_errs[0])))
- return (EPROTO);
+ struct sonode *so;
+
+ ASSERT(vp->v_type == VSOCK);
+ /*
+ * For stream model, v_stream is used; For non-stream, v_stream always
+ * equals NULL
+ */
+ if (vp->v_stream != NULL)
+ so = VTOSO(vp->v_stream->sd_vnode);
else
- return (tli_errs[terr]);
+ so = VTOSO(vp);
+
+ if (so->so_version == SOV_STREAM || !(so->so_state & SS_ASYNC))
+ return (0);
+
+ return (FASYNC);
}
/*
diff --git a/usr/src/uts/common/fs/sockfs/socksubr.c b/usr/src/uts/common/fs/sockfs/socksubr.c
index 33a6841f16..b82adb1789 100644
--- a/usr/src/uts/common/fs/sockfs/socksubr.c
+++ b/usr/src/uts/common/fs/sockfs/socksubr.c
@@ -73,6 +73,9 @@
#include <c2/audit.h>
#include <fs/sockfs/nl7c.h>
+#include <fs/sockfs/sockcommon.h>
+#include <fs/sockfs/socktpi.h>
+#include <fs/sockfs/socktpi_impl.h>
/*
* Macros that operate on struct cmsghdr.
@@ -88,18 +91,16 @@
((uintptr_t)(cmsg) + (cmsg)->cmsg_len <= (uintptr_t)(end)))
#define SO_LOCK_WAKEUP_TIME 3000 /* Wakeup time in milliseconds */
-static struct kmem_cache *socktpi_cache, *socktpi_unix_cache;
-struct kmem_cache *socktpi_sod_cache;
-
dev_t sockdev; /* For fsid in getattr */
int sockfs_defer_nl7c_init = 0;
-struct sockparams *sphead;
-krwlock_t splist_lock;
struct socklist socklist;
+struct kmem_cache *socket_cache;
+
static int sockfs_update(kstat_t *, int);
static int sockfs_snapshot(kstat_t *, void *, int);
+extern smod_info_t *sotpi_smod_create(void);
extern void sendfile_init();
@@ -124,7 +125,7 @@ struct k_sockinfo {
* Translate from a device pathname (e.g. "/dev/tcp") to a vnode.
* Returns with the vnode held.
*/
-static int
+int
sogetvp(char *devpath, vnode_t **vpp, int uioflag)
{
struct snode *csp;
@@ -133,6 +134,7 @@ sogetvp(char *devpath, vnode_t **vpp, int uioflag)
int error;
ASSERT(uioflag == UIO_SYSSPACE || uioflag == UIO_USERSPACE);
+
/*
* Lookup the underlying filesystem vnode.
*/
@@ -179,382 +181,6 @@ sogetvp(char *devpath, vnode_t **vpp, int uioflag)
}
/*
- * Add or delete (latter if devpath is NULL) an enter to the sockparams
- * table. If devpathlen is zero the devpath with not be kmem_freed. Otherwise
- * this routine assumes that the caller has kmem_alloced devpath/devpathlen
- * for this routine to consume.
- * The zero devpathlen could be used if the kernel wants to create entries
- * itself by calling sockconfig(1,2,3, "/dev/tcp", 0);
- */
-int
-soconfig(int domain, int type, int protocol,
- char *devpath, int devpathlen)
-{
- struct sockparams **spp;
- struct sockparams *sp;
- int error = 0;
-
- dprint(0, ("soconfig(%d,%d,%d,%s,%d)\n",
- domain, type, protocol, devpath, devpathlen));
-
- if (sockfs_defer_nl7c_init) {
- nl7c_init();
- sockfs_defer_nl7c_init = 0;
- }
-
- /*
- * Look for an existing match.
- */
- rw_enter(&splist_lock, RW_WRITER);
- for (spp = &sphead; (sp = *spp) != NULL; spp = &sp->sp_next) {
- if (sp->sp_domain == domain &&
- sp->sp_type == type &&
- sp->sp_protocol == protocol) {
- break;
- }
- }
- if (devpath == NULL) {
- ASSERT(devpathlen == 0);
-
- /* Delete existing entry */
- if (sp == NULL) {
- error = ENXIO;
- goto done;
- }
- /* Unlink and free existing entry */
- *spp = sp->sp_next;
- ASSERT(sp->sp_vnode);
- VN_RELE(sp->sp_vnode);
- if (sp->sp_devpathlen != 0)
- kmem_free(sp->sp_devpath, sp->sp_devpathlen);
- kmem_free(sp, sizeof (*sp));
- } else {
- vnode_t *vp;
-
- /* Add new entry */
- if (sp != NULL) {
- error = EEXIST;
- goto done;
- }
-
- error = sogetvp(devpath, &vp, UIO_SYSSPACE);
- if (error) {
- dprint(0, ("soconfig: vp %s failed with %d\n",
- devpath, error));
- goto done;
- }
-
- dprint(0, ("soconfig: %s => vp %p, dev 0x%lx\n",
- devpath, (void *)vp, vp->v_rdev));
-
- sp = kmem_alloc(sizeof (*sp), KM_SLEEP);
- sp->sp_domain = domain;
- sp->sp_type = type;
- sp->sp_protocol = protocol;
- sp->sp_devpath = devpath;
- sp->sp_devpathlen = devpathlen;
- sp->sp_vnode = vp;
- sp->sp_next = NULL;
- *spp = sp;
- }
-done:
- rw_exit(&splist_lock);
- if (error) {
- if (devpath != NULL)
- kmem_free(devpath, devpathlen);
-#ifdef SOCK_DEBUG
- eprintline(error);
-#endif /* SOCK_DEBUG */
- }
- return (error);
-}
-
-/*
- * Lookup an entry in the sockparams list based on the triple.
- * If no entry is found and devpath is not NULL translate devpath to a
- * vnode. Note that devpath is a pointer to a user address!
- * Returns with the vnode held.
- *
- * When this routine uses devpath it does not create an entry in the sockparams
- * list since this routine can run on behalf of any user and one user
- * should not be able to effect the transport used by another user.
- *
- * In order to return the correct error this routine has to do wildcard scans
- * of the list. The errors are (in decreasing precedence):
- * EAFNOSUPPORT - address family not in list
- * EPROTONOSUPPORT - address family supported but not protocol.
- * EPROTOTYPE - address family and protocol supported but not socket type.
- */
-vnode_t *
-solookup(int domain, int type, int protocol, char *devpath, int *errorp)
-{
- struct sockparams *sp;
- int error;
- vnode_t *vp;
-
- rw_enter(&splist_lock, RW_READER);
- for (sp = sphead; sp != NULL; sp = sp->sp_next) {
- if (sp->sp_domain == domain &&
- sp->sp_type == type &&
- sp->sp_protocol == protocol) {
- break;
- }
- }
- if (sp == NULL) {
- dprint(0, ("solookup(%d,%d,%d) not found\n",
- domain, type, protocol));
- if (devpath == NULL) {
- /* Determine correct error code */
- int found = 0;
-
- for (sp = sphead; sp != NULL; sp = sp->sp_next) {
- if (sp->sp_domain == domain && found < 1)
- found = 1;
- if (sp->sp_domain == domain &&
- sp->sp_protocol == protocol && found < 2)
- found = 2;
- }
- rw_exit(&splist_lock);
- switch (found) {
- case 0:
- *errorp = EAFNOSUPPORT;
- break;
- case 1:
- *errorp = EPROTONOSUPPORT;
- break;
- case 2:
- *errorp = EPROTOTYPE;
- break;
- }
- return (NULL);
- }
- rw_exit(&splist_lock);
-
- /*
- * Return vp based on devpath.
- * Do not enter into table to avoid random users
- * modifying the sockparams list.
- */
- error = sogetvp(devpath, &vp, UIO_USERSPACE);
- if (error) {
- dprint(0, ("solookup: vp %p failed with %d\n",
- (void *)devpath, error));
- *errorp = EPROTONOSUPPORT;
- return (NULL);
- }
- dprint(0, ("solookup: %p => vp %p, dev 0x%lx\n",
- (void *)devpath, (void *)vp, vp->v_rdev));
-
- return (vp);
- }
- dprint(0, ("solookup(%d,%d,%d) vp %p devpath %s\n",
- domain, type, protocol, (void *)sp->sp_vnode, sp->sp_devpath));
-
- vp = sp->sp_vnode;
- VN_HOLD(vp);
- rw_exit(&splist_lock);
- return (vp);
-}
-
-/*
- * Return a socket vnode.
- *
- * Assumes that the caller is "passing" an VN_HOLD for accessvp i.e.
- * when the socket is freed a VN_RELE will take place.
- *
- * Note that sockets assume that the driver will clone (either itself
- * or by using the clone driver) i.e. a socket() call will always
- * result in a new vnode being created.
- */
-struct vnode *
-makesockvp(struct vnode *accessvp, int domain, int type, int protocol)
-{
- kmem_cache_t *cp;
- struct sonode *so;
- struct vnode *vp;
- time_t now;
- dev_t dev;
-
- cp = (domain == AF_UNIX) ? socktpi_unix_cache : socktpi_cache;
- so = kmem_cache_alloc(cp, KM_SLEEP);
- so->so_cache = cp;
- so->so_obj = so;
- vp = SOTOV(so);
- now = gethrestime_sec();
-
- so->so_flag = 0;
- ASSERT(so->so_accessvp == NULL);
- so->so_accessvp = accessvp;
- dev = accessvp->v_rdev;
-
- /*
- * Record in so_flag that it is a clone.
- */
- if (getmajor(dev) == clone_major) {
- so->so_flag |= SOCLONE;
- }
- so->so_dev = dev;
-
- so->so_state = 0;
- so->so_mode = 0;
-
- so->so_fsid = sockdev;
- so->so_atime = now;
- so->so_mtime = now;
- so->so_ctime = now; /* Never modified */
- so->so_count = 0;
-
- so->so_family = (short)domain;
- so->so_type = (short)type;
- so->so_protocol = (short)protocol;
- so->so_pushcnt = 0;
-
- so->so_options = 0;
- so->so_linger.l_onoff = 0;
- so->so_linger.l_linger = 0;
- so->so_sndbuf = 0;
- so->so_rcvbuf = 0;
- so->so_sndlowat = 0;
- so->so_rcvlowat = 0;
-#ifdef notyet
- so->so_sndtimeo = 0;
- so->so_rcvtimeo = 0;
-#endif /* notyet */
- so->so_error = 0;
- so->so_delayed_error = 0;
-
- ASSERT(so->so_oobmsg == NULL);
- so->so_oobcnt = 0;
- so->so_oobsigcnt = 0;
- so->so_pgrp = 0;
- so->so_provinfo = NULL;
-
- ASSERT(so->so_laddr_sa == NULL && so->so_faddr_sa == NULL);
- so->so_laddr_len = so->so_faddr_len = 0;
- so->so_laddr_maxlen = so->so_faddr_maxlen = 0;
- so->so_eaddr_mp = NULL;
- so->so_priv = NULL;
-
- so->so_peercred = NULL;
-
- ASSERT(so->so_ack_mp == NULL);
- ASSERT(so->so_conn_ind_head == NULL);
- ASSERT(so->so_conn_ind_tail == NULL);
- ASSERT(so->so_ux_bound_vp == NULL);
- ASSERT(so->so_unbind_mp == NULL);
-
- vn_reinit(vp);
- vp->v_vfsp = rootvfs;
- vp->v_type = VSOCK;
- vp->v_rdev = so->so_dev;
- vn_exists(vp);
-
- return (vp);
-}
-
-void
-sockfree(struct sonode *so)
-{
- mblk_t *mp;
- vnode_t *vp;
-
- ASSERT(so->so_count == 0);
- ASSERT(so->so_accessvp);
- ASSERT(so->so_discon_ind_mp == NULL);
-
- vp = so->so_accessvp;
- VN_RELE(vp);
-
- /*
- * Protect so->so_[lf]addr_sa so that sockfs_snapshot() can safely
- * indirect them. It also uses so_accessvp as a validity test.
- */
- mutex_enter(&so->so_lock);
-
- so->so_accessvp = NULL;
-
- if (so->so_laddr_sa) {
- ASSERT((caddr_t)so->so_faddr_sa ==
- (caddr_t)so->so_laddr_sa + so->so_laddr_maxlen);
- ASSERT(so->so_faddr_maxlen == so->so_laddr_maxlen);
- so->so_state &= ~(SS_LADDR_VALID | SS_FADDR_VALID);
- kmem_free(so->so_laddr_sa, so->so_laddr_maxlen * 2);
- so->so_laddr_sa = NULL;
- so->so_laddr_len = so->so_laddr_maxlen = 0;
- so->so_faddr_sa = NULL;
- so->so_faddr_len = so->so_faddr_maxlen = 0;
- }
-
- mutex_exit(&so->so_lock);
-
- if ((mp = so->so_eaddr_mp) != NULL) {
- freemsg(mp);
- so->so_eaddr_mp = NULL;
- so->so_delayed_error = 0;
- }
- if ((mp = so->so_ack_mp) != NULL) {
- freemsg(mp);
- so->so_ack_mp = NULL;
- }
- if ((mp = so->so_conn_ind_head) != NULL) {
- mblk_t *mp1;
-
- while (mp) {
- mp1 = mp->b_next;
- mp->b_next = NULL;
- freemsg(mp);
- mp = mp1;
- }
- so->so_conn_ind_head = so->so_conn_ind_tail = NULL;
- so->so_state &= ~SS_HASCONNIND;
- }
-#ifdef DEBUG
- mutex_enter(&so->so_lock);
- ASSERT(so_verify_oobstate(so));
- mutex_exit(&so->so_lock);
-#endif /* DEBUG */
- if ((mp = so->so_oobmsg) != NULL) {
- freemsg(mp);
- so->so_oobmsg = NULL;
- so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA);
- }
-
- if ((mp = so->so_nl7c_rcv_mp) != NULL) {
- so->so_nl7c_rcv_mp = NULL;
- freemsg(mp);
- }
- so->so_nl7c_rcv_rval = 0;
- if (so->so_nl7c_uri != NULL) {
- nl7c_urifree(so);
- /* urifree() cleared nl7c_uri */
- }
- if (so->so_nl7c_flags) {
- so->so_nl7c_flags = 0;
- }
-
- if (so->so_direct != NULL) {
- sodirect_t *sodp = so->so_direct;
-
- ASSERT(sodp->sod_uioafh == NULL);
-
- so->so_direct = NULL;
- kmem_cache_free(socktpi_sod_cache, sodp);
- }
-
- ASSERT(so->so_ux_bound_vp == NULL);
- if ((mp = so->so_unbind_mp) != NULL) {
- freemsg(mp);
- so->so_unbind_mp = NULL;
- }
- vn_invalid(SOTOV(so));
-
- if (so->so_peercred != NULL)
- crfree(so->so_peercred);
-
- kmem_cache_free(so->so_cache, so->so_obj);
-}
-
-/*
* Update the accessed, updated, or changed times in an sonode
* with the current time.
*
@@ -569,133 +195,20 @@ so_update_attrs(struct sonode *so, int flag)
{
time_t now = gethrestime_sec();
+ if (SOCK_IS_NONSTR(so))
+ return;
+
mutex_enter(&so->so_lock);
so->so_flag |= flag;
if (flag & SOACC)
- so->so_atime = now;
+ SOTOTPI(so)->sti_atime = now;
if (flag & SOMOD)
- so->so_mtime = now;
+ SOTOTPI(so)->sti_mtime = now;
mutex_exit(&so->so_lock);
}
-/*ARGSUSED*/
-static int
-socktpi_constructor(void *buf, void *cdrarg, int kmflags)
-{
- struct sonode *so = buf;
- struct vnode *vp;
-
- vp = so->so_vnode = vn_alloc(kmflags);
- if (vp == NULL) {
- return (-1);
- }
- vn_setops(vp, socktpi_vnodeops);
- vp->v_data = so;
-
- so->so_direct = NULL;
-
- so->so_nl7c_flags = 0;
- so->so_nl7c_uri = NULL;
- so->so_nl7c_rcv_mp = NULL;
-
- so->so_oobmsg = NULL;
- so->so_ack_mp = NULL;
- so->so_conn_ind_head = NULL;
- so->so_conn_ind_tail = NULL;
- so->so_discon_ind_mp = NULL;
- so->so_ux_bound_vp = NULL;
- so->so_unbind_mp = NULL;
- so->so_accessvp = NULL;
- so->so_laddr_sa = NULL;
- so->so_faddr_sa = NULL;
- so->so_ops = &sotpi_sonodeops;
-
- mutex_init(&so->so_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&so->so_plumb_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&so->so_state_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&so->so_ack_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&so->so_connind_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&so->so_want_cv, NULL, CV_DEFAULT, NULL);
-
- return (0);
-}
-
-/*ARGSUSED1*/
-static void
-socktpi_destructor(void *buf, void *cdrarg)
-{
- struct sonode *so = buf;
- struct vnode *vp = SOTOV(so);
-
- ASSERT(so->so_direct == NULL);
-
- ASSERT(so->so_nl7c_flags == 0);
- ASSERT(so->so_nl7c_uri == NULL);
- ASSERT(so->so_nl7c_rcv_mp == NULL);
-
- ASSERT(so->so_oobmsg == NULL);
- ASSERT(so->so_ack_mp == NULL);
- ASSERT(so->so_conn_ind_head == NULL);
- ASSERT(so->so_conn_ind_tail == NULL);
- ASSERT(so->so_discon_ind_mp == NULL);
- ASSERT(so->so_ux_bound_vp == NULL);
- ASSERT(so->so_unbind_mp == NULL);
- ASSERT(so->so_ops == &sotpi_sonodeops);
-
- ASSERT(vn_matchops(vp, socktpi_vnodeops));
- ASSERT(vp->v_data == so);
-
- vn_free(vp);
-
- mutex_destroy(&so->so_lock);
- mutex_destroy(&so->so_plumb_lock);
- cv_destroy(&so->so_state_cv);
- cv_destroy(&so->so_ack_cv);
- cv_destroy(&so->so_connind_cv);
- cv_destroy(&so->so_want_cv);
-}
-
-static int
-socktpi_unix_constructor(void *buf, void *cdrarg, int kmflags)
-{
- int retval;
-
- if ((retval = socktpi_constructor(buf, cdrarg, kmflags)) == 0) {
- struct sonode *so = (struct sonode *)buf;
-
- mutex_enter(&socklist.sl_lock);
-
- so->so_next = socklist.sl_list;
- so->so_prev = NULL;
- if (so->so_next != NULL)
- so->so_next->so_prev = so;
- socklist.sl_list = so;
-
- mutex_exit(&socklist.sl_lock);
-
- }
- return (retval);
-}
-
-static void
-socktpi_unix_destructor(void *buf, void *cdrarg)
-{
- struct sonode *so = (struct sonode *)buf;
-
- mutex_enter(&socklist.sl_lock);
-
- if (so->so_next != NULL)
- so->so_next->so_prev = so->so_prev;
- if (so->so_prev != NULL)
- so->so_prev->so_next = so->so_next;
- else
- socklist.sl_list = so->so_next;
-
- mutex_exit(&socklist.sl_lock);
-
- socktpi_destructor(buf, cdrarg);
-}
-
+extern so_create_func_t sock_comm_create_function;
+extern so_destroy_func_t sock_comm_destroy_function;
/*
* Init function called when sockfs is loaded.
*/
@@ -716,21 +229,20 @@ sockinit(int fstype, char *name)
return (error);
}
- error = vn_make_ops(name, socktpi_vnodeops_template, &socktpi_vnodeops);
+ error = vn_make_ops(name, socket_vnodeops_template,
+ &socket_vnodeops);
if (error != 0) {
- err_str = "sockinit: bad sock vnode ops template";
+ err_str = "sockinit: bad socket vnode ops template";
- /* vn_make_ops() does not reset socktpi_vnodeops on failure. */
+ /* vn_make_ops() does not reset socket_vnodeops on failure. */
- socktpi_vnodeops = NULL;
+ socket_vnodeops = NULL;
goto failure;
}
- error = sosctp_init();
- if (error != 0) {
- err_str = NULL;
- goto failure;
- }
+ socket_cache = kmem_cache_create("socket_cache",
+ sizeof (struct sonode), 0, sonode_constructor,
+ sonode_destructor, NULL, NULL, NULL, 0);
- error = sosdp_init();
+ error = socktpi_init();
if (error != 0) {
err_str = NULL;
goto failure;
@@ -743,21 +255,18 @@ sockinit(int fstype, char *name)
}
/*
- * Create sonode caches. We create a special one for AF_UNIX so
- * that we can track them for netstat(1m).
+ * Set up the default create and destroy functions
*/
- socktpi_cache = kmem_cache_create("socktpi_cache",
- sizeof (struct sonode), 0, socktpi_constructor,
- socktpi_destructor, NULL, NULL, NULL, 0);
-
- socktpi_unix_cache = kmem_cache_create("socktpi_unix_cache",
- sizeof (struct sonode), 0, socktpi_unix_constructor,
- socktpi_unix_destructor, NULL, NULL, NULL, 0);
+ sock_comm_create_function = socket_sonode_create;
+ sock_comm_destroy_function = socket_sonode_destroy;
/*
* Build initial list mapping socket parameters to vnode.
*/
- rw_init(&splist_lock, NULL, RW_DEFAULT, NULL);
+ smod_init();
+ smod_add(sotpi_smod_create());
+
+ sockparams_init();
/*
* If sockets are needed before init runs /sbin/soconfig
@@ -786,8 +295,8 @@ sockinit(int fstype, char *name)
failure:
(void) vfs_freevfsops_by_type(fstype);
- if (socktpi_vnodeops != NULL)
- vn_freevnodeops(socktpi_vnodeops);
+ if (socket_vnodeops != NULL)
+ vn_freevnodeops(socket_vnodeops);
if (err_str != NULL)
zcmn_err(GLOBAL_ZONEID, CE_WARN, err_str);
return (error);
@@ -820,15 +329,18 @@ so_unlock_single(struct sonode *so, int flag)
ASSERT(flag & (SOLOCKED|SOASYNC_UNBIND));
ASSERT((flag & ~(SOLOCKED|SOASYNC_UNBIND)) == 0);
ASSERT(so->so_flag & flag);
-
/*
- * Process the T_DISCON_IND on so_discon_ind_mp.
+ * Process the T_DISCON_IND on sti_discon_ind_mp.
*
* Call to so_drain_discon_ind will result in so_lock
* being dropped and re-acquired later.
*/
- if (so->so_discon_ind_mp != NULL)
- so_drain_discon_ind(so);
+ if (!SOCK_IS_NONSTR(so)) {
+ sotpi_info_t *sti = SOTOTPI(so);
+
+ if (sti->sti_discon_ind_mp != NULL)
+ so_drain_discon_ind(so);
+ }
if (so->so_flag & SOWANT)
cv_broadcast(&so->so_want_cv);
@@ -1076,7 +588,7 @@ so_addr_verify(struct sonode *so, const struct sockaddr *name,
break;
}
case AF_UNIX:
- if (so->so_state & SS_FADDR_NOXLATE) {
+ if (SOTOTPI(so)->sti_faddr_noxlate) {
return (0);
}
if (namelen < (socklen_t)sizeof (short)) {
@@ -1122,13 +634,14 @@ so_ux_addr_xlate(struct sonode *so, struct sockaddr *name,
vnode_t *vp;
void *addr;
socklen_t addrlen;
+ sotpi_info_t *sti = SOTOTPI(so);
dprintso(so, 1, ("so_ux_addr_xlate(%p, %p, %d, %d)\n",
(void *)so, (void *)name, namelen, checkaccess));
ASSERT(name != NULL);
ASSERT(so->so_family == AF_UNIX);
- ASSERT(!(so->so_state & SS_FADDR_NOXLATE));
+ ASSERT(!sti->sti_faddr_noxlate);
ASSERT(namelen >= (socklen_t)sizeof (short));
ASSERT(name->sa_family == AF_UNIX);
soun = (struct sockaddr_un *)name;
@@ -1147,10 +660,10 @@ so_ux_addr_xlate(struct sonode *so, struct sockaddr *name,
* closed by the time the T_CONN_REQ or T_UNIDATA_REQ reaches the
* transport the message will get an error or be dropped.
*/
- so->so_ux_faddr.soua_vp = vp;
- so->so_ux_faddr.soua_magic = SOU_MAGIC_EXPLICIT;
- addr = &so->so_ux_faddr;
- addrlen = (socklen_t)sizeof (so->so_ux_faddr);
+ sti->sti_ux_faddr.soua_vp = vp;
+ sti->sti_ux_faddr.soua_magic = SOU_MAGIC_EXPLICIT;
+ addr = &sti->sti_ux_faddr;
+ addrlen = (socklen_t)sizeof (sti->sti_ux_faddr);
dprintso(so, 1, ("ux_xlate UNIX: addrlen %d, vp %p\n",
addrlen, (void *)vp));
VN_RELE(vp);
@@ -2007,8 +1520,6 @@ pr_state(uint_t state, uint_t mode)
(void) strcat(buf, "ASYNC ");
if (state & SS_ACCEPTCONN)
(void) strcat(buf, "ACCEPTCONN ");
- if (state & SS_HASCONNIND)
- (void) strcat(buf, "HASCONNIND ");
if (state & SS_SAVEDEOR)
(void) strcat(buf, "SAVEDEOR ");
@@ -2021,9 +1532,6 @@ pr_state(uint_t state, uint_t mode)
if (state & SS_HADOOBDATA)
(void) strcat(buf, "HADOOBDATA ");
- if (state & SS_FADDR_NOXLATE)
- (void) strcat(buf, "FADDR_NOXLATE ");
-
if (mode & SM_PRIV)
(void) strcat(buf, "PRIV ");
if (mode & SM_ATOMIC)
@@ -2102,6 +1610,8 @@ pr_addr(int family, struct sockaddr *addr, t_uscalar_t addrlen)
int
so_verify_oobstate(struct sonode *so)
{
+ boolean_t havemark;
+
ASSERT(MUTEX_HELD(&so->so_lock));
/*
@@ -2120,28 +1630,29 @@ so_verify_oobstate(struct sonode *so)
case SS_HADOOBDATA:
break;
default:
- printf("Bad oob state 1 (%p): counts %d/%d state %s\n",
- (void *)so, so->so_oobsigcnt,
- so->so_oobcnt, pr_state(so->so_state, so->so_mode));
+ printf("Bad oob state 1 (%p): state %s\n",
+ (void *)so, pr_state(so->so_state, so->so_mode));
return (0);
}
/* SS_RCVATMARK should only be set when SS_OOBPEND is set */
if ((so->so_state & (SS_RCVATMARK|SS_OOBPEND)) == SS_RCVATMARK) {
- printf("Bad oob state 2 (%p): counts %d/%d state %s\n",
- (void *)so, so->so_oobsigcnt,
- so->so_oobcnt, pr_state(so->so_state, so->so_mode));
+ printf("Bad oob state 2 (%p): state %s\n",
+ (void *)so, pr_state(so->so_state, so->so_mode));
return (0);
}
/*
- * (so_oobsigcnt != 0 or SS_RCVATMARK) iff SS_OOBPEND
+ * (havemark or SS_RCVATMARK) iff SS_OOBPEND
+ * For TPI, the presence of a "mark" is indicated by sti_oobsigcnt.
*/
- if (!EQUIV((so->so_oobsigcnt != 0) || (so->so_state & SS_RCVATMARK),
+ havemark = (SOCK_IS_NONSTR(so)) ? so->so_oobmark > 0 :
+ SOTOTPI(so)->sti_oobsigcnt > 0;
+
+ if (!EQUIV(havemark || (so->so_state & SS_RCVATMARK),
so->so_state & SS_OOBPEND)) {
- printf("Bad oob state 3 (%p): counts %d/%d state %s\n",
- (void *)so, so->so_oobsigcnt,
- so->so_oobcnt, pr_state(so->so_state, so->so_mode));
+ printf("Bad oob state 3 (%p): state %s\n",
+ (void *)so, pr_state(so->so_state, so->so_mode));
return (0);
}
@@ -2150,21 +1661,23 @@ so_verify_oobstate(struct sonode *so)
*/
if (!(so->so_options & SO_OOBINLINE) &&
!EQUIV(so->so_oobmsg != NULL, so->so_state & SS_HAVEOOBDATA)) {
- printf("Bad oob state 4 (%p): counts %d/%d state %s\n",
- (void *)so, so->so_oobsigcnt,
- so->so_oobcnt, pr_state(so->so_state, so->so_mode));
+ printf("Bad oob state 4 (%p): state %s\n",
+ (void *)so, pr_state(so->so_state, so->so_mode));
return (0);
}
- if (so->so_oobsigcnt < so->so_oobcnt) {
+
+ if (!SOCK_IS_NONSTR(so) &&
+ SOTOTPI(so)->sti_oobsigcnt < SOTOTPI(so)->sti_oobcnt) {
printf("Bad oob state 5 (%p): counts %d/%d state %s\n",
- (void *)so, so->so_oobsigcnt,
- so->so_oobcnt, pr_state(so->so_state, so->so_mode));
+ (void *)so, SOTOTPI(so)->sti_oobsigcnt,
+ SOTOTPI(so)->sti_oobcnt,
+ pr_state(so->so_state, so->so_mode));
return (0);
}
+
return (1);
}
#undef EQUIV
-
#endif /* DEBUG */
/* initialize sockfs zone specific kstat related items */
@@ -2224,8 +1737,8 @@ sockfs_update(kstat_t *ksp, int rw)
return (EACCES);
}
- for (so = socklist.sl_list; so != NULL; so = so->so_next) {
- if (so->so_accessvp != NULL && so->so_zoneid == myzoneid) {
+ for (so = socklist.sl_list; so != NULL; so = SOTOTPI(so)->sti_next_so) {
+ if (so->so_count != 0 && so->so_zoneid == myzoneid) {
nactive++;
}
}
@@ -2243,6 +1756,7 @@ sockfs_snapshot(kstat_t *ksp, void *buf, int rw)
struct k_sockinfo *pksi; /* where we put sockinfo data */
t_uscalar_t sn_len; /* soa_len */
zoneid_t myzoneid = (zoneid_t)(uintptr_t)ksp->ks_private;
+ sotpi_info_t *sti;
ASSERT((zoneid_t)(uintptr_t)ksp->ks_private == getzoneid());
@@ -2257,9 +1771,10 @@ sockfs_snapshot(kstat_t *ksp, void *buf, int rw)
* info into buf, in k_sockinfo format.
*/
pksi = (struct k_sockinfo *)buf;
- for (ns = 0, so = socklist.sl_list; so != NULL; so = so->so_next) {
+ ns = 0;
+ for (so = socklist.sl_list; so != NULL; so = SOTOTPI(so)->sti_next_so) {
/* only stuff active sonodes and the same zone: */
- if (so->so_accessvp == NULL || so->so_zoneid != myzoneid) {
+ if (so->so_count == 0 || so->so_zoneid != myzoneid) {
continue;
}
@@ -2271,50 +1786,54 @@ sockfs_snapshot(kstat_t *ksp, void *buf, int rw)
break;
}
+ sti = SOTOTPI(so);
/* copy important info into buf: */
pksi->ks_si.si_size = sizeof (struct k_sockinfo);
pksi->ks_si.si_family = so->so_family;
pksi->ks_si.si_type = so->so_type;
pksi->ks_si.si_flag = so->so_flag;
pksi->ks_si.si_state = so->so_state;
- pksi->ks_si.si_serv_type = so->so_serv_type;
- pksi->ks_si.si_ux_laddr_sou_magic = so->so_ux_laddr.soua_magic;
- pksi->ks_si.si_ux_faddr_sou_magic = so->so_ux_faddr.soua_magic;
- pksi->ks_si.si_laddr_soa_len = so->so_laddr.soa_len;
- pksi->ks_si.si_faddr_soa_len = so->so_faddr.soa_len;
+ pksi->ks_si.si_serv_type = sti->sti_serv_type;
+ pksi->ks_si.si_ux_laddr_sou_magic =
+ sti->sti_ux_laddr.soua_magic;
+ pksi->ks_si.si_ux_faddr_sou_magic =
+ sti->sti_ux_faddr.soua_magic;
+ pksi->ks_si.si_laddr_soa_len = sti->sti_laddr.soa_len;
+ pksi->ks_si.si_faddr_soa_len = sti->sti_faddr.soa_len;
pksi->ks_si.si_szoneid = so->so_zoneid;
+ pksi->ks_si.si_faddr_noxlate = sti->sti_faddr_noxlate;
mutex_enter(&so->so_lock);
- if (so->so_laddr_sa != NULL) {
- ASSERT(so->so_laddr_sa->sa_data != NULL);
- sn_len = so->so_laddr_len;
+ if (sti->sti_laddr_sa != NULL) {
+ ASSERT(sti->sti_laddr_sa->sa_data != NULL);
+ sn_len = sti->sti_laddr_len;
ASSERT(sn_len <= sizeof (short) +
sizeof (pksi->ks_si.si_laddr_sun_path));
pksi->ks_si.si_laddr_family =
- so->so_laddr_sa->sa_family;
+ sti->sti_laddr_sa->sa_family;
if (sn_len != 0) {
/* AF_UNIX socket names are NULL terminated */
(void) strncpy(pksi->ks_si.si_laddr_sun_path,
- so->so_laddr_sa->sa_data,
+ sti->sti_laddr_sa->sa_data,
sizeof (pksi->ks_si.si_laddr_sun_path));
sn_len = strlen(pksi->ks_si.si_laddr_sun_path);
}
pksi->ks_si.si_laddr_sun_path[sn_len] = 0;
}
- if (so->so_faddr_sa != NULL) {
- ASSERT(so->so_faddr_sa->sa_data != NULL);
- sn_len = so->so_faddr_len;
+ if (sti->sti_faddr_sa != NULL) {
+ ASSERT(sti->sti_faddr_sa->sa_data != NULL);
+ sn_len = sti->sti_faddr_len;
ASSERT(sn_len <= sizeof (short) +
sizeof (pksi->ks_si.si_faddr_sun_path));
pksi->ks_si.si_faddr_family =
- so->so_faddr_sa->sa_family;
+ sti->sti_faddr_sa->sa_family;
if (sn_len != 0) {
(void) strncpy(pksi->ks_si.si_faddr_sun_path,
- so->so_faddr_sa->sa_data,
+ sti->sti_faddr_sa->sa_data,
sizeof (pksi->ks_si.si_faddr_sun_path));
sn_len = strlen(pksi->ks_si.si_faddr_sun_path);
}
@@ -2325,9 +1844,9 @@ sockfs_snapshot(kstat_t *ksp, void *buf, int rw)
(void) sprintf(pksi->ks_straddr[0], "%p", (void *)so);
(void) sprintf(pksi->ks_straddr[1], "%p",
- (void *)so->so_ux_laddr.soua_vp);
+ (void *)sti->sti_ux_laddr.soua_vp);
(void) sprintf(pksi->ks_straddr[2], "%p",
- (void *)so->so_ux_faddr.soua_vp);
+ (void *)sti->sti_ux_faddr.soua_vp);
ns++;
pksi++;
@@ -2389,3 +1908,23 @@ out:
return (cnt);
}
}
+
+int
+so_copyin(const void *from, void *to, size_t size, int fromkernel)
+{
+ if (fromkernel) {
+ bcopy(from, to, size);
+ return (0);
+ }
+ return (xcopyin(from, to, size));
+}
+
+int
+so_copyout(const void *from, void *to, size_t size, int tokernel)
+{
+ if (tokernel) {
+ bcopy(from, to, size);
+ return (0);
+ }
+ return (xcopyout(from, to, size));
+}
diff --git a/usr/src/uts/common/fs/sockfs/socksyscalls.c b/usr/src/uts/common/fs/sockfs/socksyscalls.c
index 95f4f5738d..4d0929f39b 100644
--- a/usr/src/uts/common/fs/sockfs/socksyscalls.c
+++ b/usr/src/uts/common/fs/sockfs/socksyscalls.c
@@ -64,7 +64,10 @@
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_kpm.h>
+
#include <fs/sockfs/nl7c.h>
+#include <fs/sockfs/sockcommon.h>
+#include <fs/sockfs/socktpi.h>
#ifdef SOCK_TEST
int do_useracc = 1; /* Controlled by setting SO_DEBUG to 4 */
@@ -90,115 +93,39 @@ extern int xnet_truncate_print;
* devpath for the kernel to use.
*/
int
-so_socket(int domain, int type, int protocol, char *devpath, int version)
+so_socket(int family, int type, int protocol, char *devpath, int version)
{
- vnode_t *accessvp;
struct sonode *so;
vnode_t *vp;
struct file *fp;
int fd;
int error;
- boolean_t wildcard = B_FALSE;
- int saved_error = 0;
- int sdomain = domain;
-
- dprint(1, ("so_socket(%d,%d,%d,%p,%d)\n",
- domain, type, protocol, (void *)devpath, version));
-
- if (domain == AF_NCA) {
- /*
- * The request is for an NCA socket so for NL7C use the
- * INET domain instead and mark NL7C_AF_NCA below.
- */
- domain = AF_INET;
- /*
- * NL7C is not supported in non-global zones,
- * we enforce this restriction here.
- */
- if (getzoneid() != GLOBAL_ZONEID) {
- return (set_errno(ENOTSUP));
- }
- }
-
- accessvp = solookup(domain, type, protocol, devpath, &error);
- if (accessvp == NULL) {
- /*
- * If there is either an EPROTONOSUPPORT or EPROTOTYPE error
- * it makes sense doing the wildcard lookup since the
- * protocol might not be in the table.
- */
- if (devpath != NULL || protocol == 0 ||
- !(error == EPROTONOSUPPORT || error == EPROTOTYPE))
- return (set_errno(error));
- saved_error = error;
+ if (devpath != NULL) {
+ char *buf;
+ size_t kdevpathlen = 0;
- /*
- * Try wildcard lookup. Never use devpath for wildcards.
- */
- accessvp = solookup(domain, type, 0, NULL, &error);
- if (accessvp == NULL) {
- /*
- * Can't find in kernel table - have library
- * fall back to /etc/netconfig and tell us
- * the devpath (The library will do this if it didn't
- * already pass in a devpath).
- */
- if (saved_error != 0)
- error = saved_error;
+ buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ if ((error = copyinstr(devpath, buf,
+ MAXPATHLEN, &kdevpathlen)) != 0) {
+ kmem_free(buf, MAXPATHLEN);
return (set_errno(error));
}
- wildcard = B_TRUE;
- }
-
- /* Check the device policy */
- if ((error = secpolicy_spec_open(CRED(),
- accessvp, FREAD|FWRITE)) != 0) {
- return (set_errno(error));
- }
-
- if (protocol == IPPROTO_SCTP) {
- so = sosctp_create(accessvp, domain, type, protocol, version,
- NULL, &error);
- } else if (protocol == PROTO_SDP) {
- so = sosdp_create(accessvp, domain, type, protocol, version,
- NULL, &error);
+ so = socket_create(family, type, protocol, buf, NULL,
+ SOCKET_SLEEP, version, CRED(), &error);
+ kmem_free(buf, MAXPATHLEN);
} else {
- so = sotpi_create(accessvp, domain, type, protocol, version,
- NULL, &error);
+ so = socket_create(family, type, protocol, NULL, NULL,
+ SOCKET_SLEEP, version, CRED(), &error);
}
- if (so == NULL) {
+ if (so == NULL)
return (set_errno(error));
- }
- if (sdomain == AF_NCA && domain == AF_INET) {
- so->so_nl7c_flags = NL7C_AF_NCA;
- }
- vp = SOTOV(so);
- if (wildcard) {
- /*
- * Issue SO_PROTOTYPE setsockopt.
- */
- error = SOP_SETSOCKOPT(so, SOL_SOCKET, SO_PROTOTYPE,
- &protocol,
- (t_uscalar_t)sizeof (protocol));
- if (error) {
- (void) VOP_CLOSE(vp, 0, 1, 0, CRED(), NULL);
- VN_RELE(vp);
- /*
- * Setsockopt often fails with ENOPROTOOPT but socket()
- * should fail with EPROTONOSUPPORT/EPROTOTYPE.
- */
- if (saved_error != 0 && error == ENOPROTOOPT)
- error = saved_error;
- else
- error = EPROTONOSUPPORT;
- return (set_errno(error));
- }
- }
+ /* Allocate a file descriptor for the socket */
+ vp = SOTOV(so);
if (error = falloc(vp, FWRITE|FREAD, &fp, &fd)) {
- (void) VOP_CLOSE(vp, 0, 1, 0, CRED(), NULL);
- VN_RELE(vp);
+ (void) socket_close(so, 0, CRED());
+ socket_destroy(so);
return (set_errno(error));
}
@@ -402,6 +329,8 @@ so_socketpair(int sv[2])
int error;
struct sockaddr_ux *name;
size_t namelen;
+ sotpi_info_t *sti1;
+ sotpi_info_t *sti2;
dprint(1, ("so_socketpair(%p)\n", (void *)sv));
@@ -425,6 +354,9 @@ so_socketpair(int sv[2])
goto done;
}
+ sti1 = SOTOTPI(so1);
+ sti2 = SOTOTPI(so2);
+
/*
* The code below makes assumptions about the "sockfs" implementation.
* So make sure that the correct implementation is really used.
@@ -437,12 +369,12 @@ so_socketpair(int sv[2])
* Bind both sockets and connect them with each other.
* Need to allocate name/namelen for soconnect.
*/
- error = SOP_BIND(so1, NULL, 0, _SOBIND_UNSPEC);
+ error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC, CRED());
if (error) {
eprintsoline(so1, error);
goto done;
}
- error = SOP_BIND(so2, NULL, 0, _SOBIND_UNSPEC);
+ error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED());
if (error) {
eprintsoline(so2, error);
goto done;
@@ -450,21 +382,21 @@ so_socketpair(int sv[2])
namelen = sizeof (struct sockaddr_ux);
name = kmem_alloc(namelen, KM_SLEEP);
name->sou_family = AF_UNIX;
- name->sou_addr = so2->so_ux_laddr;
- error = SOP_CONNECT(so1,
+ name->sou_addr = sti2->sti_ux_laddr;
+ error = socket_connect(so1,
(struct sockaddr *)name,
(socklen_t)namelen,
- 0, _SOCONNECT_NOXLATE);
+ 0, _SOCONNECT_NOXLATE, CRED());
if (error) {
kmem_free(name, namelen);
eprintsoline(so1, error);
goto done;
}
- name->sou_addr = so1->so_ux_laddr;
- error = SOP_CONNECT(so2,
+ name->sou_addr = sti1->sti_ux_laddr;
+ error = socket_connect(so2,
(struct sockaddr *)name,
(socklen_t)namelen,
- 0, _SOCONNECT_NOXLATE);
+ 0, _SOCONNECT_NOXLATE, CRED());
kmem_free(name, namelen);
if (error) {
eprintsoline(so2, error);
@@ -487,17 +419,18 @@ so_socketpair(int sv[2])
int nfd;
/*
- * We could simply call SOP_LISTEN() here (which would do the
+ * We could simply call socket_listen() here (which would do the
* binding automatically) if the code didn't rely on passing
- * _SOBIND_NOXLATE to the TPI implementation of SOP_BIND().
+ * _SOBIND_NOXLATE to the TPI implementation of socket_bind().
*/
- error = SOP_BIND(so1, NULL, 0, _SOBIND_UNSPEC|_SOBIND_NOXLATE|
- _SOBIND_LISTEN|_SOBIND_SOCKETPAIR);
+ error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC|
+ _SOBIND_NOXLATE|_SOBIND_LISTEN|_SOBIND_SOCKETPAIR,
+ CRED());
if (error) {
eprintsoline(so1, error);
goto done;
}
- error = SOP_BIND(so2, NULL, 0, _SOBIND_UNSPEC);
+ error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED());
if (error) {
eprintsoline(so2, error);
goto done;
@@ -506,20 +439,19 @@ so_socketpair(int sv[2])
namelen = sizeof (struct sockaddr_ux);
name = kmem_alloc(namelen, KM_SLEEP);
name->sou_family = AF_UNIX;
- name->sou_addr = so1->so_ux_laddr;
- error = SOP_CONNECT(so2,
+ name->sou_addr = sti1->sti_ux_laddr;
+ error = socket_connect(so2,
(struct sockaddr *)name,
(socklen_t)namelen,
- FNONBLOCK, _SOCONNECT_NOXLATE);
+ FNONBLOCK, _SOCONNECT_NOXLATE, CRED());
kmem_free(name, namelen);
if (error) {
if (error != EINPROGRESS) {
- eprintsoline(so2, error);
- goto done;
+ eprintsoline(so2, error); goto done;
}
}
- error = SOP_ACCEPT(so1, 0, &nso);
+ error = socket_accept(so1, 0, CRED(), &nso);
if (error) {
eprintsoline(so1, error);
goto done;
@@ -529,17 +461,17 @@ so_socketpair(int sv[2])
mutex_enter(&so2->so_lock);
error = sowaitconnected(so2, 0, 1);
mutex_exit(&so2->so_lock);
- nvp = SOTOV(nso);
if (error != 0) {
- (void) VOP_CLOSE(nvp, 0, 1, 0, CRED(), NULL);
- VN_RELE(nvp);
+ (void) socket_close(nso, 0, CRED());
+ socket_destroy(nso);
eprintsoline(so2, error);
goto done;
}
+ nvp = SOTOV(nso);
if (error = falloc(nvp, FWRITE|FREAD, &nfp, &nfd)) {
- (void) VOP_CLOSE(nvp, 0, 1, 0, CRED(), NULL);
- VN_RELE(nvp);
+ (void) socket_close(nso, 0, CRED());
+ socket_destroy(nso);
eprintsoline(nso, error);
goto done;
}
@@ -603,13 +535,13 @@ bind(int sock, struct sockaddr *name, socklen_t namelen, int version)
switch (version) {
default:
- error = SOP_BIND(so, name, namelen, 0);
+ error = socket_bind(so, name, namelen, 0, CRED());
break;
case SOV_XPG4_2:
- error = SOP_BIND(so, name, namelen, _SOBIND_XPG4_2);
+ error = socket_bind(so, name, namelen, _SOBIND_XPG4_2, CRED());
break;
case SOV_SOCKBSD:
- error = SOP_BIND(so, name, namelen, _SOBIND_SOCKBSD);
+ error = socket_bind(so, name, namelen, _SOBIND_SOCKBSD, CRED());
break;
}
done:
@@ -635,7 +567,7 @@ listen(int sock, int backlog, int version)
if ((so = getsonode(sock, &error, NULL)) == NULL)
return (set_errno(error));
- error = SOP_LISTEN(so, backlog);
+ error = socket_listen(so, backlog, CRED());
releasef(sock);
if (error)
@@ -655,6 +587,8 @@ accept(int sock, struct sockaddr *name, socklen_t *namelenp, int version)
struct vnode *nvp;
struct file *nfp;
int nfd;
+ struct sockaddr *addrp;
+ socklen_t addrlen;
dprint(1, ("accept(%d, %p, %p)\n",
sock, (void *)name, (void *)namelenp));
@@ -681,15 +615,15 @@ accept(int sock, struct sockaddr *name, socklen_t *namelenp, int version)
}
/*
- * Allocate the user fd before SOP_ACCEPT() in order to
- * catch EMFILE errors before calling SOP_ACCEPT().
+ * Allocate the user fd before socket_accept() in order to
+ * catch EMFILE errors before calling socket_accept().
*/
if ((nfd = ufalloc(0)) == -1) {
eprintsoline(so, EMFILE);
releasef(sock);
return (set_errno(EMFILE));
}
- error = SOP_ACCEPT(so, fp->f_flag, &nso);
+ error = socket_accept(so, fp->f_flag, CRED(), &nso);
releasef(sock);
if (error) {
setf(nfd, NULL);
@@ -698,34 +632,32 @@ accept(int sock, struct sockaddr *name, socklen_t *namelenp, int version)
nvp = SOTOV(nso);
- /*
- * so_faddr_sa can not go away even though we are not holding so_lock.
- * However, in theory its content could change from underneath us.
- * But this is not possible in practice since it can only
- * change due to either some socket system call
- * or due to a T_CONN_CON being received from the stream head.
- * Since the falloc/setf have not yet been done no thread
- * can do any system call on nso and T_CONN_CON can not arrive
- * on a socket that is already connected.
- * Thus there is no reason to hold so_lock here.
- *
- * SOP_ACCEPT() is required to have set the valid bit for the faddr,
- * but it could be instantly cleared by a disconnect from the transport.
- * For that reason we ignore it here.
- */
ASSERT(MUTEX_NOT_HELD(&nso->so_lock));
- error = copyout_name(name, namelen, namelenp,
- nso->so_faddr_sa, (socklen_t)nso->so_faddr_len);
+ if (namelen != 0) {
+ addrlen = so->so_max_addr_len;
+ addrp = (struct sockaddr *)kmem_alloc(addrlen, KM_SLEEP);
+
+ if ((error = socket_getpeername(nso, (struct sockaddr *)addrp,
+ &addrlen, B_TRUE, CRED())) == 0) {
+ error = copyout_name(name, namelen, namelenp,
+ addrp, addrlen);
+ } else {
+ ASSERT(error == EINVAL || error == ENOTCONN);
+ error = ECONNABORTED;
+ }
+ kmem_free(addrp, so->so_max_addr_len);
+ }
+
if (error) {
setf(nfd, NULL);
- (void) VOP_CLOSE(nvp, 0, 1, 0, CRED(), NULL);
- VN_RELE(nvp);
+ (void) socket_close(nso, 0, CRED());
+ socket_destroy(nso);
return (set_errno(error));
}
if (error = falloc(NULL, FWRITE|FREAD, &nfp, NULL)) {
setf(nfd, NULL);
- (void) VOP_CLOSE(nvp, 0, 1, 0, CRED(), NULL);
- VN_RELE(nvp);
+ (void) socket_close(nso, 0, CRED());
+ socket_destroy(nso);
eprintsoline(so, error);
return (set_errno(error));
}
@@ -790,8 +722,8 @@ connect(int sock, struct sockaddr *name, socklen_t namelen, int version)
} else
name = NULL;
- error = SOP_CONNECT(so, name, namelen, fp->f_flag,
- (version != SOV_XPG4_2) ? 0 : _SOCONNECT_XPG4_2);
+ error = socket_connect(so, name, namelen, fp->f_flag,
+ (version != SOV_XPG4_2) ? 0 : _SOCONNECT_XPG4_2, CRED());
releasef(sock);
if (name)
kmem_free(name, (size_t)namelen);
@@ -813,7 +745,7 @@ shutdown(int sock, int how, int version)
if ((so = getsonode(sock, &error, NULL)) == NULL)
return (set_errno(error));
- error = SOP_SHUTDOWN(so, how);
+ error = socket_shutdown(so, how, CRED());
releasef(sock);
if (error)
@@ -857,13 +789,12 @@ recvit(int sock,
msg->msg_flags = flags & (MSG_OOB | MSG_PEEK | MSG_WAITALL |
MSG_DONTWAIT | MSG_XPG4_2);
- error = SOP_RECVMSG(so, msg, uiop);
+ error = socket_recvmsg(so, msg, uiop, CRED());
if (error) {
releasef(sock);
return (set_errno(error));
}
lwp_stat_update(LWP_STAT_MSGRCV, 1);
- so_update_attrs(so, SOACC);
releasef(sock);
error = copyout_name(name, namelen, namelenp,
@@ -1198,7 +1129,7 @@ sendit(int sock, struct nmsghdr *msg, struct uio *uiop, int flags)
len = uiop->uio_resid;
msg->msg_flags = flags;
- error = SOP_SENDMSG(so, msg, uiop);
+ error = socket_sendmsg(so, msg, uiop, CRED());
done1:
if (control != NULL)
kmem_free(control, controllen);
@@ -1211,7 +1142,6 @@ done3:
return (set_errno(error));
}
lwp_stat_update(LWP_STAT_MSGSND, 1);
- so_update_attrs(so, SOMOD);
releasef(sock);
return (len - uiop->uio_resid);
}
@@ -1413,12 +1343,8 @@ getpeername(int sock, struct sockaddr *name, socklen_t *namelenp, int version)
struct sonode *so;
int error;
socklen_t namelen;
- union {
- struct sockaddr_in sin;
- struct sockaddr_in6 sin6;
- } sin; /* Temporary buffer, common case */
- void *addr; /* Temporary buffer, uncommon case */
- socklen_t addrlen, size;
+ socklen_t sock_addrlen;
+ struct sockaddr *sock_addrp;
dprint(1, ("getpeername(%d, %p, %p)\n",
sock, (void *)name, (void *)namelenp));
@@ -1432,44 +1358,16 @@ getpeername(int sock, struct sockaddr *name, socklen_t *namelenp, int version)
error = EFAULT;
goto rel_out;
}
- /*
- * If a connect or accept has been done, unless we're an Xnet socket,
- * the remote address has already been updated in so_faddr_sa.
- */
- if (so->so_version != SOV_SOCKSTREAM && so->so_version != SOV_SOCKBSD ||
- !(so->so_state & SS_FADDR_VALID)) {
- if ((error = SOP_GETPEERNAME(so)) != 0)
- goto rel_out;
- }
+ sock_addrlen = so->so_max_addr_len;
+ sock_addrp = (struct sockaddr *)kmem_alloc(sock_addrlen, KM_SLEEP);
- if (so->so_faddr_maxlen <= sizeof (sin)) {
- size = 0;
- addr = &sin;
- } else {
- /*
- * Allocate temporary to avoid holding so_lock across
- * copyout
- */
- size = so->so_faddr_maxlen;
- addr = kmem_alloc(size, KM_SLEEP);
+ if ((error = socket_getpeername(so, sock_addrp, &sock_addrlen,
+ B_FALSE, CRED())) == 0) {
+ ASSERT(sock_addrlen <= so->so_max_addr_len);
+ error = copyout_name(name, namelen, namelenp,
+ (void *)sock_addrp, sock_addrlen);
}
- /* Prevent so_faddr_sa/len from changing while accessed */
- mutex_enter(&so->so_lock);
- if (!(so->so_state & SS_ISCONNECTED)) {
- mutex_exit(&so->so_lock);
- error = ENOTCONN;
- goto free_out;
- }
- addrlen = so->so_faddr_len;
- bcopy(so->so_faddr_sa, addr, addrlen);
- mutex_exit(&so->so_lock);
-
- ASSERT(MUTEX_NOT_HELD(&so->so_lock));
- error = copyout_name(name, namelen, namelenp, addr,
- (so->so_state & SS_FADDR_NOXLATE) ? 0 : addrlen);
-free_out:
- if (size != 0)
- kmem_free(addr, size);
+ kmem_free(sock_addrp, so->so_max_addr_len);
rel_out:
releasef(sock);
bad: return (error != 0 ? set_errno(error) : 0);
@@ -1482,13 +1380,8 @@ getsockname(int sock, struct sockaddr *name,
{
struct sonode *so;
int error;
- socklen_t namelen;
- union {
- struct sockaddr_in sin;
- struct sockaddr_in6 sin6;
- } sin; /* Temporary buffer, common case */
- void *addr; /* Temporary buffer, uncommon case */
- socklen_t addrlen, size;
+ socklen_t namelen, sock_addrlen;
+ struct sockaddr *sock_addrp;
dprint(1, ("getsockname(%d, %p, %p)\n",
sock, (void *)name, (void *)namelenp));
@@ -1503,39 +1396,16 @@ getsockname(int sock, struct sockaddr *name,
goto rel_out;
}
- /*
- * If a bind or accept has been done, unless we're an Xnet endpoint,
- * the local address has already been updated in so_laddr_sa.
- */
- if ((so->so_version != SOV_SOCKSTREAM &&
- so->so_version != SOV_SOCKBSD) ||
- !(so->so_state & SS_LADDR_VALID)) {
- if ((error = SOP_GETSOCKNAME(so)) != 0)
- goto rel_out;
- }
-
- if (so->so_laddr_maxlen <= sizeof (sin)) {
- size = 0;
- addr = &sin;
- } else {
- /*
- * Allocate temporary to avoid holding so_lock across
- * copyout
- */
- size = so->so_laddr_maxlen;
- addr = kmem_alloc(size, KM_SLEEP);
+ sock_addrlen = so->so_max_addr_len;
+ sock_addrp = (struct sockaddr *)kmem_alloc(sock_addrlen, KM_SLEEP);
+ if ((error = socket_getsockname(so, sock_addrp, &sock_addrlen,
+ CRED())) == 0) {
+ ASSERT(MUTEX_NOT_HELD(&so->so_lock));
+ ASSERT(sock_addrlen <= so->so_max_addr_len);
+ error = copyout_name(name, namelen, namelenp,
+ (void *)sock_addrp, sock_addrlen);
}
- /* Prevent so_laddr_sa/len from changing while accessed */
- mutex_enter(&so->so_lock);
- addrlen = so->so_laddr_len;
- bcopy(so->so_laddr_sa, addr, addrlen);
- mutex_exit(&so->so_lock);
-
- ASSERT(MUTEX_NOT_HELD(&so->so_lock));
- error = copyout_name(name, namelen, namelenp,
- addr, addrlen);
- if (size != 0)
- kmem_free(addr, size);
+ kmem_free(sock_addrp, so->so_max_addr_len);
rel_out:
releasef(sock);
bad: return (error != 0 ? set_errno(error) : 0);
@@ -1577,8 +1447,9 @@ getsockopt(int sock,
}
optval = kmem_alloc(optlen, KM_SLEEP);
optlen_res = optlen;
- error = SOP_GETSOCKOPT(so, level, option_name, optval,
- &optlen_res, (version != SOV_XPG4_2) ? 0 : _SOGETSOCKOPT_XPG4_2);
+ error = socket_getsockopt(so, level, option_name, optval,
+ &optlen_res, (version != SOV_XPG4_2) ? 0 : _SOGETSOCKOPT_XPG4_2,
+ CRED());
releasef(sock);
if (error) {
kmem_free(optval, optlen);
@@ -1633,8 +1504,8 @@ setsockopt(int sock,
} else
option_len = 0;
- error = SOP_SETSOCKOPT(so, level, option_name, optval,
- (t_uscalar_t)option_len);
+ error = socket_setsockopt(so, level, option_name, optval,
+ (t_uscalar_t)option_len, CRED());
done1:
if (optval != buffer)
kmem_free(optval, (size_t)option_len);
@@ -1646,51 +1517,140 @@ done2:
}
/*
- * Add config info when devpath is non-NULL; delete info when devpath is NULL.
- * devpath is a user address.
+ * Add config info when name is non-NULL; delete info when name is NULL.
+ * name may be either a device name or a module name; it is a user address.
*/
int
-sockconfig(int domain, int type, int protocol, char *devpath)
+sockconfig(int family, int type, int protocol, char *name)
{
- char *kdevpath; /* Copied in devpath string */
- size_t kdevpathlen;
+ char *kdevpath = NULL; /* Copied in devpath string */
+ char *kmodule = NULL;
+ size_t pathlen = 0;
int error = 0;
dprint(1, ("sockconfig(%d, %d, %d, %p)\n",
- domain, type, protocol, (void *)devpath));
+ family, type, protocol, (void *)name));
if (secpolicy_net_config(CRED(), B_FALSE) != 0)
return (set_errno(EPERM));
- if (devpath == NULL) {
- /* Deleting an entry */
- kdevpath = NULL;
- kdevpathlen = 0;
- } else {
+ /*
+ * By default set the kdevpath and kmodule to NULL to delete an entry.
+ * Otherwise when name is not NULL, set the kdevpath or kmodule
+ * value to add an entry.
+ */
+ if (name != NULL) {
/*
* Adding an entry.
- * Copyin the devpath.
+ * Copyin the name.
* This also makes it possible to check for too long pathnames.
- * Compress the space needed for the devpath before passing it
+ * Compress the space needed for the name before passing it
* to soconfig - soconfig will store the string until
* the configuration is removed.
*/
char *buf;
-
buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
- if ((error = copyinstr(devpath, buf, MAXPATHLEN,
- &kdevpathlen)) != 0) {
+ if ((error = copyinstr(name, buf, MAXPATHLEN, &pathlen)) != 0) {
kmem_free(buf, MAXPATHLEN);
goto done;
}
+ if (strncmp(buf, "/dev", strlen("/dev")) == 0) {
+ /* For device */
- kdevpath = kmem_alloc(kdevpathlen, KM_SLEEP);
- bcopy(buf, kdevpath, kdevpathlen);
- kdevpath[kdevpathlen - 1] = '\0';
+ /*
+ * Special handling for NCA:
+ *
+ * DEV_NCA is never opened even if an application
+ * requests for AF_NCA. The device opened is instead a
+ * predefined AF_INET transport (NCA_INET_DEV).
+ *
+ * Prior to Volo (PSARC/2007/587) NCA would determine
+ * the device using a lookup, which worked then because
+ * all protocols were based on TPI. Since TPI is no
+ * longer the default, we have to explicitly state
+ * which device to use.
+ */
+ if (strcmp(buf, NCA_DEV) == 0) {
+ /* only support entry <28, 2, 0> */
+ if (family != AF_NCA || type != SOCK_STREAM ||
+ protocol != 0) {
+ kmem_free(buf, MAXPATHLEN);
+ error = EINVAL;
+ goto done;
+ }
+
+ pathlen = strlen(NCA_INET_DEV) + 1;
+ kdevpath = kmem_alloc(pathlen, KM_SLEEP);
+ bcopy(NCA_INET_DEV, kdevpath, pathlen);
+ kdevpath[pathlen - 1] = '\0';
+ } else {
+ kdevpath = kmem_alloc(pathlen, KM_SLEEP);
+ bcopy(buf, kdevpath, pathlen);
+ kdevpath[pathlen - 1] = '\0';
+ }
+ } else {
+ /* For socket module */
+ kmodule = kmem_alloc(pathlen, KM_SLEEP);
+ bcopy(buf, kmodule, pathlen);
+ kmodule[pathlen - 1] = '\0';
+
+ pathlen = 0;
+ if (strcmp(kmodule, "tcp") == 0) {
+ /* Get the tcp device name for fallback */
+ if (family == 2) {
+ pathlen = strlen("/dev/tcp") + 1;
+ kdevpath = kmem_alloc(pathlen,
+ KM_SLEEP);
+ bcopy("/dev/tcp", kdevpath,
+ pathlen);
+ kdevpath[pathlen - 1] = '\0';
+ } else {
+ ASSERT(family == 26);
+ pathlen = strlen("/dev/tcp6") + 1;
+ kdevpath = kmem_alloc(pathlen,
+ KM_SLEEP);
+ bcopy("/dev/tcp6", kdevpath, pathlen);
+ kdevpath[pathlen - 1] = '\0';
+ }
+ } else if (strcmp(kmodule, "udp") == 0) {
+ /* Get the udp device name for fallback */
+ if (family == 2) {
+ pathlen = strlen("/dev/udp") + 1;
+ kdevpath = kmem_alloc(pathlen,
+ KM_SLEEP);
+ bcopy("/dev/udp", kdevpath, pathlen);
+ kdevpath[pathlen - 1] = '\0';
+ } else {
+ ASSERT(family == 26);
+ pathlen = strlen("/dev/udp6") + 1;
+ kdevpath = kmem_alloc(pathlen,
+ KM_SLEEP);
+ bcopy("/dev/udp6", kdevpath, pathlen);
+ kdevpath[pathlen - 1] = '\0';
+ }
+ } else if (strcmp(kmodule, "icmp") == 0) {
+ /* Get the icmp device name for fallback */
+ if (family == 2) {
+ pathlen = strlen("/dev/rawip") + 1;
+ kdevpath = kmem_alloc(pathlen,
+ KM_SLEEP);
+ bcopy("/dev/rawip", kdevpath, pathlen);
+ kdevpath[pathlen - 1] = '\0';
+ } else {
+ ASSERT(family == 26);
+ pathlen = strlen("/dev/rawip6") + 1;
+ kdevpath = kmem_alloc(pathlen,
+ KM_SLEEP);
+ bcopy("/dev/rawip6", kdevpath, pathlen);
+ kdevpath[pathlen - 1] = '\0';
+ }
+ }
+ }
kmem_free(buf, MAXPATHLEN);
}
- error = soconfig(domain, type, protocol, kdevpath, (int)kdevpathlen);
+ error = soconfig(family, type, protocol, kdevpath, (int)pathlen,
+ kmodule);
done:
if (error) {
eprintline(error);
@@ -1961,9 +1921,15 @@ snf_async_read(snf_req_t *sr)
*/
so = VTOSO(vp);
stp = vp->v_stream;
- wroff = (int)(stp->sd_wroff);
- maxblk = (int)(stp->sd_maxblk);
- extra = wroff + (int)(stp->sd_tail);
+ if (stp == NULL) {
+ wroff = so->so_proto_props.sopp_wroff;
+ maxblk = so->so_proto_props.sopp_maxblk;
+ extra = wroff + so->so_proto_props.sopp_tail;
+ } else {
+ wroff = (int)(stp->sd_wroff);
+ maxblk = (int)(stp->sd_maxblk);
+ extra = wroff + (int)(stp->sd_tail);
+ }
}
while ((size != 0) && (sr->sr_write_error == 0)) {
@@ -1975,7 +1941,8 @@ snf_async_read(snf_req_t *sr)
* need to adjust the size to the maximum
* SSL record size set in the stream head.
*/
- if (vp->v_type == VSOCK && so->so_kssl_ctx != NULL)
+ if (vp->v_type == VSOCK && !SOCK_IS_NONSTR(so) &&
+ SOTOTPI(so)->sti_kssl_ctx != NULL)
iosize = (int)MIN(iosize, maxblk);
if ((mp = allocb(iosize + extra, BPRI_MED)) == NULL) {
@@ -2066,7 +2033,7 @@ create_thread(int operation, struct vnode *vp, file_t *fp,
* store sd_qn_maxpsz into sr_maxpsz while we have stream head.
* stream might be closed before thread returns from snf_async_read.
*/
- if (stp->sd_qn_maxpsz > 0) {
+ if (stp != NULL && stp->sd_qn_maxpsz > 0) {
sr->sr_maxpsz = MIN(MAXBSIZE, stp->sd_qn_maxpsz);
} else {
sr->sr_maxpsz = MAXBSIZE;
@@ -2115,9 +2082,11 @@ snf_direct_io(file_t *fp, file_t *rfp, u_offset_t fileoff, u_offset_t size,
short fflag;
struct vnode *vp;
int ksize;
+ struct nmsghdr msg;
ksize = 0;
*count = 0;
+ bzero(&msg, sizeof (msg));
vp = fp->f_vnode;
fflag = fp->f_flag;
@@ -2138,8 +2107,11 @@ snf_direct_io(file_t *fp, file_t *rfp, u_offset_t fileoff, u_offset_t size,
}
iosize = MBLKL(mp);
- if ((error = kstrwritemp(vp, mp, fflag)) != 0) {
- freeb(mp);
+ error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);
+
+ if (error != 0) {
+ if (mp != NULL)
+ freeb(mp);
break;
}
ksize += iosize;
@@ -2233,10 +2205,13 @@ snf_segmap(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t size,
snf_smap_desbinfo *snfi;
struct vattr va;
boolean_t dowait = B_FALSE;
+ struct nmsghdr msg;
vp = fp->f_vnode;
fflag = fp->f_flag;
ksize = 0;
+ bzero(&msg, sizeof (msg));
+
for (;;) {
if (ISSIG(curthread, JUSTLOOKING)) {
error = EINTR;
@@ -2307,9 +2282,11 @@ snf_segmap(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t size,
mp->b_datap->db_struioflag |= STRUIO_ZCNOTIFY;
}
VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
- if ((error = kstrwritemp(vp, mp, fflag)) != 0) {
+ error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);
+ if (error != 0) {
*count = ksize;
- freemsg(mp);
+ if (mp != NULL)
+ freemsg(mp);
return (error);
}
ksize += iosize;
@@ -2335,16 +2312,22 @@ done:
stdata_t *stp;
stp = vp->v_stream;
- mutex_enter(&stp->sd_lock);
- while (!(stp->sd_flag & STZCNOTIFY)) {
- if (cv_wait_sig(&stp->sd_zcopy_wait,
- &stp->sd_lock) == 0) {
- error = EINTR;
- break;
+ if (stp == NULL) {
+ struct sonode *so;
+ so = VTOSO(vp);
+ error = so_zcopy_wait(so);
+ } else {
+ mutex_enter(&stp->sd_lock);
+ while (!(stp->sd_flag & STZCNOTIFY)) {
+ if (cv_wait_sig(&stp->sd_zcopy_wait,
+ &stp->sd_lock) == 0) {
+ error = EINTR;
+ break;
+ }
}
+ stp->sd_flag &= ~STZCNOTIFY;
+ mutex_exit(&stp->sd_lock);
}
- stp->sd_flag &= ~STZCNOTIFY;
- mutex_exit(&stp->sd_lock);
}
return (error);
}
@@ -2367,6 +2350,7 @@ snf_cache(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t size,
int maxblk = 0;
int wroff = 0;
struct sonode *so;
+ struct nmsghdr msg;
vp = fp->f_vnode;
if (vp->v_type == VSOCK) {
@@ -2377,11 +2361,17 @@ snf_cache(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t size,
*/
so = VTOSO(vp);
stp = vp->v_stream;
- wroff = (int)(stp->sd_wroff);
- maxblk = (int)(stp->sd_maxblk);
- extra = wroff + (int)(stp->sd_tail);
+ if (stp == NULL) {
+ wroff = so->so_proto_props.sopp_wroff;
+ maxblk = so->so_proto_props.sopp_maxblk;
+ extra = wroff + so->so_proto_props.sopp_tail;
+ } else {
+ wroff = (int)(stp->sd_wroff);
+ maxblk = (int)(stp->sd_maxblk);
+ extra = wroff + (int)(stp->sd_tail);
+ }
}
-
+ bzero(&msg, sizeof (msg));
fflag = fp->f_flag;
ksize = 0;
auio.uio_iov = &aiov;
@@ -2406,7 +2396,8 @@ snf_cache(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t size,
* need to adjust the size to the maximum
* SSL record size set in the stream head.
*/
- if (vp->v_type == VSOCK && so->so_kssl_ctx != NULL)
+ if (vp->v_type == VSOCK && !SOCK_IS_NONSTR(so) &&
+ SOTOTPI(so)->sti_kssl_ctx != NULL)
iosize = (int)MIN(iosize, maxblk);
if ((mp = allocb(iosize + extra, BPRI_MED)) == NULL) {
@@ -2434,9 +2425,13 @@ snf_cache(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t size,
mp->b_wptr = mp->b_rptr + iosize;
VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
- if ((error = kstrwritemp(vp, mp, fflag)) != 0) {
+
+ error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);
+
+ if (error != 0) {
*count = ksize;
- freeb(mp);
+ if (mp != NULL)
+ freeb(mp);
return (error);
}
ksize += iosize;
@@ -2540,14 +2535,17 @@ sosendfile64(file_t *fp, file_t *rfp, const struct ksendfilevec64 *sfv,
if (sfv_len >= MAXBSIZE && (sfv_len >= (va_size >> 1) ||
(sfv->sfv_flag & SFV_NOWAIT) || sfv_len >= 0x1000000) &&
!vn_has_flocks(fvp) && !(fvp->v_flag & VNOMAP)) {
- if ((stp->sd_copyflag & (STZCVMSAFE|STZCVMUNSAFE)) == 0) {
+ uint_t copyflag;
+ copyflag = stp != NULL ? stp->sd_copyflag :
+ VTOSO(vp)->so_proto_props.sopp_zcopyflag;
+ if ((copyflag & (STZCVMSAFE|STZCVMUNSAFE)) == 0) {
int on = 1;
- if (SOP_SETSOCKOPT(VTOSO(vp), SOL_SOCKET,
- SO_SND_COPYAVOID, &on, sizeof (on)) == 0)
+ if (socket_setsockopt(VTOSO(vp), SOL_SOCKET,
+ SO_SND_COPYAVOID, &on, sizeof (on), CRED()) == 0)
dozcopy = B_TRUE;
} else {
- dozcopy = (stp->sd_copyflag & STZCVMSAFE);
+ dozcopy = copyflag & STZCVMSAFE;
}
}
if (dozcopy) {
@@ -2555,10 +2553,19 @@ sosendfile64(file_t *fp, file_t *rfp, const struct ksendfilevec64 *sfv,
error = snf_segmap(fp, fvp, sfv_off, (u_offset_t)sfv_len,
&count, ((sfv->sfv_flag & SFV_NOWAIT) != 0));
} else {
- if (stp->sd_qn_maxpsz == INFPSZ)
+ if (vp->v_type == VSOCK && stp == NULL) {
+ sonode_t *so = VTOSO(vp);
+ maxpsz = so->so_proto_props.sopp_maxpsz;
+ } else if (stp != NULL) {
+ maxpsz = stp->sd_qn_maxpsz;
+ } else {
+ maxpsz = maxphys;
+ }
+
+ if (maxpsz == INFPSZ)
maxpsz = maxphys;
else
- maxpsz = roundup(stp->sd_qn_maxpsz, MAXBSIZE);
+ maxpsz = roundup(maxpsz, MAXBSIZE);
sf_stats.ss_file_cached++;
error = snf_cache(fp, fvp, sfv_off, (u_offset_t)sfv_len,
maxpsz, &count);
@@ -2613,7 +2620,7 @@ sendto32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags,
int
soaccept(struct sonode *so, int fflag, struct sonode **nsop)
{
- return (SOP_ACCEPT(so, fflag, nsop));
+ return (socket_accept(so, fflag, CRED(), nsop));
}
int
@@ -2622,9 +2629,9 @@ sobind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
{
int error;
- error = SOP_BIND(so, name, namelen, flags);
+ error = socket_bind(so, name, namelen, flags, CRED());
if (error == 0 && backlog != 0)
- return (SOP_LISTEN(so, backlog));
+ return (socket_listen(so, backlog, CRED()));
return (error);
}
@@ -2632,59 +2639,48 @@ sobind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
int
solisten(struct sonode *so, int backlog)
{
- return (SOP_LISTEN(so, backlog));
+ return (socket_listen(so, backlog, CRED()));
}
int
soconnect(struct sonode *so, const struct sockaddr *name, socklen_t namelen,
int fflag, int flags)
{
- return (SOP_CONNECT(so, name, namelen, fflag, flags));
+ return (socket_connect(so, name, namelen, fflag, flags, CRED()));
}
int
sorecvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
{
- return (SOP_RECVMSG(so, msg, uiop));
+ return (socket_recvmsg(so, msg, uiop, CRED()));
}
int
sosendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
{
- return (SOP_SENDMSG(so, msg, uiop));
-}
-
-int
-sogetpeername(struct sonode *so)
-{
- return (SOP_GETPEERNAME(so));
-}
-
-int
-sogetsockname(struct sonode *so)
-{
- return (SOP_GETSOCKNAME(so));
+ return (socket_sendmsg(so, msg, uiop, CRED()));
}
int
soshutdown(struct sonode *so, int how)
{
- return (SOP_SHUTDOWN(so, how));
+ return (socket_shutdown(so, how, CRED()));
}
int
sogetsockopt(struct sonode *so, int level, int option_name, void *optval,
socklen_t *optlenp, int flags)
{
- return (SOP_GETSOCKOPT(so, level, option_name, optval, optlenp,
- flags));
+ return (socket_getsockopt(so, level, option_name, optval, optlenp,
+ flags, CRED()));
}
int
sosetsockopt(struct sonode *so, int level, int option_name, const void *optval,
t_uscalar_t optlen)
{
- return (SOP_SETSOCKOPT(so, level, option_name, optval, optlen));
+ return (socket_setsockopt(so, level, option_name, optval, optlen,
+ CRED()));
}
/*
@@ -2692,9 +2688,25 @@ sosetsockopt(struct sonode *so, int level, int option_name, const void *optval,
* able to handle the creation of TPI sockfs sockets.
*/
struct sonode *
-socreate(vnode_t *accessvp, int domain, int type, int protocol, int version,
- struct sonode *tso, int *errorp)
+socreate(struct sockparams *sp, int family, int type, int protocol, int version,
+ int *errorp)
{
- return (sotpi_create(accessvp, domain, type, protocol, version, tso,
- errorp));
+ struct sonode *so;
+
+ ASSERT(sp != NULL);
+
+ so = sp->sp_smod_info->smod_sock_create_func(sp, family, type, protocol,
+ version, SOCKET_SLEEP, errorp, CRED());
+ if (so == NULL) {
+ SOCKPARAMS_DEC_REF(sp);
+ } else {
+ if ((*errorp = SOP_INIT(so, NULL, CRED(), SOCKET_SLEEP)) == 0) {
+ /* Cannot fail, only bumps so_count */
+ (void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, CRED(), NULL);
+ } else {
+ socket_destroy(so);
+ so = NULL;
+ }
+ }
+ return (so);
}
diff --git a/usr/src/uts/common/fs/sockfs/socktpi.c b/usr/src/uts/common/fs/sockfs/socktpi.c
index f27c34578b..01873727f8 100644
--- a/usr/src/uts/common/fs/sockfs/socktpi.c
+++ b/usr/src/uts/common/fs/sockfs/socktpi.c
@@ -32,6 +32,7 @@
#include <sys/conf.h>
#include <sys/cred.h>
#include <sys/kmem.h>
+#include <sys/kmem_impl.h>
#include <sys/sysmacros.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
@@ -45,6 +46,7 @@
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
+#include <sys/suntpi.h>
#include <sys/ddi.h>
#include <sys/esunddi.h>
#include <sys/flock.h>
@@ -81,6 +83,10 @@
#include <inet/kssl/ksslapi.h>
+#include <fs/sockfs/sockcommon.h>
+#include <fs/sockfs/socktpi.h>
+#include <fs/sockfs/socktpi_impl.h>
+
/*
* Possible failures when memory can't be allocated. The documented behavior:
*
@@ -170,13 +176,29 @@ int xnet_skip_checks = 0;
int xnet_check_print = 0;
int xnet_truncate_print = 0;
+static void sotpi_destroy(struct sonode *);
+static struct sonode *sotpi_create(struct sockparams *, int, int, int, int,
+ int, int *, cred_t *cr);
+
+static boolean_t sotpi_info_create(struct sonode *, int);
+static void sotpi_info_init(struct sonode *);
+static void sotpi_info_fini(struct sonode *);
+static void sotpi_info_destroy(struct sonode *);
+
+/*
+ * Do direct function call to the transport layer below; this would
+ * also allow the transport to utilize read-side synchronous stream
+ * interface if necessary. This is a /etc/system tunable that must
+ * not be modified on a running system. By default this is enabled
+ * for performance reasons and may be disabled for debugging purposes.
+ */
+boolean_t socktpi_direct = B_TRUE;
+
+static struct kmem_cache *socktpi_cache, *socktpi_unix_cache;
+
extern void sigintr(k_sigset_t *, int);
extern void sigunintr(k_sigset_t *);
-extern void *nl7c_lookup_addr(void *, t_uscalar_t);
-extern void *nl7c_add_addr(void *, t_uscalar_t);
-extern void nl7c_listener_addr(void *, struct sonode *);
-
/* Sockets acting as an in-kernel SSL proxy */
extern mblk_t *strsock_kssl_input(vnode_t *, mblk_t *, strwakeup_t *,
strsigset_t *, strsigset_t *, strpollset_t *);
@@ -189,62 +211,198 @@ extern int sodput(sodirect_t *, mblk_t *);
extern void sodwakeup(sodirect_t *);
/* TPI sockfs sonode operations */
-static int sotpi_accept(struct sonode *, int, struct sonode **);
-static int sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
+int sotpi_init(struct sonode *, struct sonode *, struct cred *,
int);
+static int sotpi_accept(struct sonode *, int, struct cred *,
+ struct sonode **);
+static int sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
+ int, struct cred *);
+static int sotpi_listen(struct sonode *, int, struct cred *);
static int sotpi_connect(struct sonode *, const struct sockaddr *,
- socklen_t, int, int);
-static int sotpi_listen(struct sonode *, int);
+ socklen_t, int, int, struct cred *);
+extern int sotpi_recvmsg(struct sonode *, struct nmsghdr *,
+ struct uio *, struct cred *);
static int sotpi_sendmsg(struct sonode *, struct nmsghdr *,
- struct uio *);
-static int sotpi_shutdown(struct sonode *, int);
-static int sotpi_getsockname(struct sonode *);
+ struct uio *, struct cred *);
+static int sotpi_sendmblk(struct sonode *, struct nmsghdr *, int,
+ struct cred *, mblk_t **);
static int sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t,
struct uio *, void *, t_uscalar_t, int);
static int sodgram_direct(struct sonode *, struct sockaddr *,
socklen_t, struct uio *, int);
+extern int sotpi_getpeername(struct sonode *, struct sockaddr *,
+ socklen_t *, boolean_t, struct cred *);
+static int sotpi_getsockname(struct sonode *, struct sockaddr *,
+ socklen_t *, struct cred *);
+static int sotpi_shutdown(struct sonode *, int, struct cred *);
+extern int sotpi_getsockopt(struct sonode *, int, int, void *,
+ socklen_t *, int, struct cred *);
+extern int sotpi_setsockopt(struct sonode *, int, int, const void *,
+ socklen_t, struct cred *);
+static int sotpi_ioctl(struct sonode *, int, intptr_t, int, struct cred *,
+ int32_t *);
+static int socktpi_plumbioctl(struct vnode *, int, intptr_t, int,
+ struct cred *, int32_t *);
+static int sotpi_poll(struct sonode *, short, int, short *,
+ struct pollhead **);
+static int sotpi_close(struct sonode *, int, struct cred *);
+
+static int i_sotpi_info_constructor(sotpi_info_t *);
+static void i_sotpi_info_destructor(sotpi_info_t *);
sonodeops_t sotpi_sonodeops = {
+ sotpi_init, /* sop_init */
sotpi_accept, /* sop_accept */
sotpi_bind, /* sop_bind */
sotpi_listen, /* sop_listen */
sotpi_connect, /* sop_connect */
sotpi_recvmsg, /* sop_recvmsg */
sotpi_sendmsg, /* sop_sendmsg */
+ sotpi_sendmblk, /* sop_sendmblk */
sotpi_getpeername, /* sop_getpeername */
sotpi_getsockname, /* sop_getsockname */
sotpi_shutdown, /* sop_shutdown */
sotpi_getsockopt, /* sop_getsockopt */
- sotpi_setsockopt /* sop_setsockopt */
+ sotpi_setsockopt, /* sop_setsockopt */
+ sotpi_ioctl, /* sop_ioctl */
+ sotpi_poll, /* sop_poll */
+ sotpi_close, /* sop_close */
};
/*
+ * Return a TPI socket vnode.
+ *
+ * Note that sockets assume that the driver will clone (either itself
+ * or by using the clone driver) i.e. a socket() call will always
+ * result in a new vnode being created.
+ */
+
+/*
* Common create code for socket and accept. If tso is set the values
* from that node is used instead of issuing a T_INFO_REQ.
- *
- * Assumes that the caller has a VN_HOLD on accessvp.
- * The VN_RELE will occur either when sotpi_create() fails or when
- * the returned sonode is freed.
*/
-struct sonode *
-sotpi_create(vnode_t *accessvp, int domain, int type, int protocol, int version,
- struct sonode *tso, int *errorp)
+
+/* ARGSUSED */
+static struct sonode *
+sotpi_create(struct sockparams *sp, int family, int type, int protocol,
+ int version, int sflags, int *errorp, cred_t *cr)
{
struct sonode *so;
- vnode_t *vp;
- int flags, error;
+ kmem_cache_t *cp;
+ int sfamily = family;
- ASSERT(accessvp != NULL);
- vp = makesockvp(accessvp, domain, type, protocol);
- ASSERT(vp != NULL);
- so = VTOSO(vp);
+ ASSERT(sp->sp_sdev_info.sd_vnode != NULL);
+
+ if (family == AF_NCA) {
+ /*
+ * The request is for an NCA socket so for NL7C use the
+ * INET domain instead and mark NL7C_AF_NCA below.
+ */
+ family = AF_INET;
+ /*
+ * NL7C is not supported in the non-global zone,
+ * we enforce this restriction here.
+ */
+ if (getzoneid() != GLOBAL_ZONEID) {
+ *errorp = ENOTSUP;
+ return (NULL);
+ }
+ }
+
+ /*
+ * To be compatible with the old TPI socket implementation, ignore
+ * the sleep flag (sflags) passed in.
+ */
+ cp = (family == AF_UNIX) ? socktpi_unix_cache : socktpi_cache;
+ so = kmem_cache_alloc(cp, KM_SLEEP);
+ if (so == NULL) {
+ *errorp = ENOMEM;
+ return (NULL);
+ }
+
+ sonode_init(so, sp, family, type, protocol, &sotpi_sonodeops);
+ sotpi_info_init(so);
+
+ if (sfamily == AF_NCA) {
+ SOTOTPI(so)->sti_nl7c_flags = NL7C_AF_NCA;
+ }
+
+ if (version == SOV_DEFAULT)
+ version = so_default_version;
+
+ so->so_version = (short)version;
+ *errorp = 0;
+
+ return (so);
+}
+
+static void
+sotpi_destroy(struct sonode *so)
+{
+ kmem_cache_t *cp;
+ struct sockparams *origsp;
+
+ /*
+ * If there is a new dealloc function (i.e. smod_destroy_func),
+ * then it should check the correctness of the ops.
+ */
+
+ ASSERT(so->so_ops == &sotpi_sonodeops);
+
+ origsp = SOTOTPI(so)->sti_orig_sp;
+
+ sotpi_info_fini(so);
+
+ if (so->so_state & SS_FALLBACK_COMP) {
+ /*
+ * A fallback happened, which means that a sotpi_info_t struct
+ * was allocated (as opposed to being allocated from the TPI
+ * sonode cache). Therefore we explicitly free the struct
+ * here.
+ */
+ sotpi_info_destroy(so);
+ ASSERT(origsp != NULL);
+
+ origsp->sp_smod_info->smod_sock_destroy_func(so);
+ SOCKPARAMS_DEC_REF(origsp);
+ } else {
+ sonode_fini(so);
+ cp = (so->so_family == AF_UNIX) ? socktpi_unix_cache :
+ socktpi_cache;
+ kmem_cache_free(cp, so);
+ }
+}
+
+/* ARGSUSED1 */
+int
+sotpi_init(struct sonode *so, struct sonode *tso, struct cred *cr, int flags)
+{
+ major_t maj;
+ dev_t newdev;
+ struct vnode *vp;
+ int error = 0;
+ struct stdata *stp;
+
+ sotpi_info_t *sti = SOTOTPI(so);
+
+ dprint(1, ("sotpi_init()\n"));
+
+ /*
+ * Overwrite the sleep flag passed in; that is OK because
+ * a TPI socket does not honor the sleep flag.
+ */
+ flags |= FREAD|FWRITE;
- flags = FREAD|FWRITE;
+ /*
+ * Record in so_flag that it is a clone.
+ */
+ if (getmajor(sti->sti_dev) == clone_major)
+ so->so_flag |= SOCLONE;
- if ((type == SOCK_STREAM || type == SOCK_DGRAM) &&
- (domain == AF_INET || domain == AF_INET6) &&
- (protocol == IPPROTO_TCP || protocol == IPPROTO_UDP ||
- protocol == IPPROTO_IP)) {
+ if ((so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM) &&
+ (so->so_family == AF_INET || so->so_family == AF_INET6) &&
+ (so->so_protocol == IPPROTO_TCP || so->so_protocol == IPPROTO_UDP ||
+ so->so_protocol == IPPROTO_IP)) {
/* Tell tcp or udp that it's talking to sockets */
flags |= SO_SOCKSTR;
@@ -253,25 +411,25 @@ sotpi_create(vnode_t *accessvp, int domain, int type, int protocol, int version,
* make direct calls between sockfs and transport.
* The final decision is left to socktpi_open().
*/
- so->so_state |= SS_DIRECT;
+ sti->sti_direct = 1;
ASSERT(so->so_type != SOCK_DGRAM || tso == NULL);
if (so->so_type == SOCK_STREAM && tso != NULL) {
- if (tso->so_state & SS_DIRECT) {
+ if (SOTOTPI(tso)->sti_direct) {
/*
- * Inherit SS_DIRECT from listener and pass
+ * Inherit sti_direct from listener and pass
* SO_ACCEPTOR open flag to tcp, indicating
* that this is an accept fast-path instance.
*/
flags |= SO_ACCEPTOR;
} else {
/*
- * SS_DIRECT is not set on listener, meaning
+ * sti_direct is not set on listener, meaning
* that the listener has been converted from
* a socket to a stream. Ensure that the
* acceptor inherits these settings.
*/
- so->so_state &= ~SS_DIRECT;
+ sti->sti_direct = 0;
flags &= ~SO_SOCKSTR;
}
}
@@ -284,30 +442,157 @@ sotpi_create(vnode_t *accessvp, int domain, int type, int protocol, int version,
flags |= SO_SOCKSTR;
}
- /* Initialize the kernel SSL proxy fields */
- so->so_kssl_type = KSSL_NO_PROXY;
- so->so_kssl_ent = NULL;
- so->so_kssl_ctx = NULL;
+ vp = SOTOV(so);
+ newdev = vp->v_rdev;
+ maj = getmajor(newdev);
+ ASSERT(STREAMSTAB(maj));
- if (error = socktpi_open(&vp, flags, CRED(), NULL)) {
- VN_RELE(vp);
- *errorp = error;
- return (NULL);
- }
+ error = stropen(vp, &newdev, flags, cr);
- if (error = so_strinit(so, tso)) {
- (void) VOP_CLOSE(vp, 0, 1, 0, CRED(), NULL);
- VN_RELE(vp);
- *errorp = error;
- return (NULL);
- }
+ stp = vp->v_stream;
+ if (error == 0) {
+ if (so->so_flag & SOCLONE)
+ ASSERT(newdev != vp->v_rdev);
+ mutex_enter(&so->so_lock);
+ sti->sti_dev = newdev;
+ vp->v_rdev = newdev;
+ mutex_exit(&so->so_lock);
- if (version == SOV_DEFAULT)
- version = so_default_version;
+ if (stp->sd_flag & STRISTTY) {
+ /*
+ * this is a post SVR4 tty driver - a socket can not
+ * be a controlling terminal. Fail the open.
+ */
+ (void) sotpi_close(so, flags, cr);
+ return (ENOTTY); /* XXX */
+ }
- so->so_version = (short)version;
+ ASSERT(stp->sd_wrq != NULL);
+ sti->sti_provinfo = tpi_findprov(stp->sd_wrq);
- return (so);
+ /*
+ * If caller is interested in doing direct function call
+ * interface to/from transport module, probe the module
+ * directly beneath the streamhead to see if it qualifies.
+ *
+ * We turn off the direct interface when qualifications fail.
+ * In the acceptor case, we simply turn off the sti_direct
+ * flag on the socket. We do the fallback after the accept
+ * has completed, before the new socket is returned to the
+ * application.
+ */
+ if (sti->sti_direct) {
+ queue_t *tq = stp->sd_wrq->q_next;
+
+ /*
+ * sti_direct is currently supported and tested
+ * only for tcp/udp; this is the main reason to
+ * have the following assertions.
+ */
+ ASSERT(so->so_family == AF_INET ||
+ so->so_family == AF_INET6);
+ ASSERT(so->so_protocol == IPPROTO_UDP ||
+ so->so_protocol == IPPROTO_TCP ||
+ so->so_protocol == IPPROTO_IP);
+ ASSERT(so->so_type == SOCK_DGRAM ||
+ so->so_type == SOCK_STREAM);
+
+ /*
+ * Abort direct call interface if the module directly
+ * underneath the stream head is not defined with the
+ * _D_DIRECT flag. This could happen in the tcp or
+ * udp case, when some other module is autopushed
+ * above it, or for some reasons the expected module
+ * isn't purely D_MP (which is the main requirement).
+ *
+ * Else, sti_direct is valid. If the read-side Q has
+ * _QSODIRECT set and uioasync is enabled, then
+ * set SS_SODIRECT to enable sodirect.
+ */
+ if (!socktpi_direct || !(tq->q_flag & _QDIRECT) ||
+ !(_OTHERQ(tq)->q_flag & _QDIRECT)) {
+ int rval;
+
+ /* Continue on without direct calls */
+ sti->sti_direct = 0;
+
+ /*
+ * Cannot issue ioctl on fallback socket since
+ * there is no conn associated with the queue.
+ * The fallback downcall will notify the proto
+ * of the change.
+ */
+ if (!(flags & SO_ACCEPTOR) &&
+ !(flags & SO_FALLBACK)) {
+ if ((error = strioctl(vp,
+ _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
+ cr, &rval)) != 0) {
+ (void) sotpi_close(so, flags,
+ cr);
+ return (error);
+ }
+ }
+ } else if ((_OTHERQ(tq)->q_flag & _QSODIRECT) &&
+ uioasync.enabled) {
+ /* Enable sodirect */
+ so->so_state |= SS_SODIRECT;
+ }
+ }
+
+ if (flags & SO_FALLBACK) {
+ /*
+ * The stream created does not have a conn.
+ * do stream set up after conn has been assigned
+ */
+ return (error);
+ }
+ if (error = so_strinit(so, tso)) {
+ (void) sotpi_close(so, flags, cr);
+ return (error);
+ }
+
+ /* Wildcard */
+ if (so->so_protocol != so->so_sockparams->sp_protocol) {
+ int protocol = so->so_protocol;
+ /*
+ * Issue SO_PROTOTYPE setsockopt.
+ */
+ error = sotpi_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
+ &protocol, (t_uscalar_t)sizeof (protocol), cr);
+ if (error != 0) {
+ (void) sotpi_close(so, flags, cr);
+ /*
+ * Setsockopt often fails with ENOPROTOOPT but
+ * socket() should fail with
+ * EPROTONOSUPPORT/EPROTOTYPE.
+ */
+ return (EPROTONOSUPPORT);
+ }
+ }
+
+ } else {
+ /*
+ * While the same socket can not be reopened (unlike specfs)
+ * the stream head sets STREOPENFAIL when the autopush fails.
+ */
+ if ((stp != NULL) &&
+ (stp->sd_flag & STREOPENFAIL)) {
+ /*
+ * Open failed part way through.
+ */
+ mutex_enter(&stp->sd_lock);
+ stp->sd_flag &= ~STREOPENFAIL;
+ mutex_exit(&stp->sd_lock);
+ (void) sotpi_close(so, flags, cr);
+ return (error);
+ /*NOTREACHED*/
+ }
+ ASSERT(stp == NULL);
+ }
+ TRACE_4(TR_FAC_SOCKFS, TR_SOCKFS_OPEN,
+ "sockfs open:maj %d vp %p so %p error %d",
+ maj, vp, so, error);
+ return (error);
}
/*
@@ -318,15 +603,16 @@ sotpi_create(vnode_t *accessvp, int domain, int type, int protocol, int version,
static void
so_automatic_bind(struct sonode *so)
{
+ sotpi_info_t *sti = SOTOTPI(so); /* sotpi_info_t holding the sti_* fields used below */
ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
ASSERT(MUTEX_HELD(&so->so_lock)); /* caller must already hold so_lock */
ASSERT(!(so->so_state & SS_ISBOUND));
- ASSERT(so->so_unbind_mp);
+ ASSERT(sti->sti_unbind_mp);
- ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
- bzero(so->so_laddr_sa, so->so_laddr_len);
- so->so_laddr_sa->sa_family = so->so_family;
+ ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
+ bzero(sti->sti_laddr_sa, sti->sti_laddr_len); /* local address becomes all zeros ... */
+ sti->sti_laddr_sa->sa_family = so->so_family; /* ... except for the address family */
so->so_state |= SS_ISBOUND; /* only the state bit is set; nothing is sent from this function */
}
@@ -353,9 +639,10 @@ so_automatic_bind(struct sonode *so)
* - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
* and no listen() has been done.
*/
+/* ARGSUSED */
static int
sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
- socklen_t namelen, int backlog, int flags)
+ socklen_t namelen, int backlog, int flags, struct cred *cr)
{
struct T_bind_req bind_req;
struct T_bind_ack *bind_ack;
@@ -370,6 +657,7 @@ sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
t_scalar_t PRIM_type = O_T_BIND_REQ;
boolean_t tcp_udp_xport;
void *nl7c = NULL;
+ sotpi_info_t *sti = SOTOTPI(so);
dprintso(so, 1, ("sotpi_bindlisten(%p, %p, %d, %d, 0x%x) %s\n",
(void *)so, (void *)name, namelen, backlog, flags,
@@ -390,10 +678,10 @@ sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
* before binding. This message is allocated when the socket is
* created but it might have been consumed.
*/
- if (so->so_unbind_mp == NULL) {
+ if (sti->sti_unbind_mp == NULL) {
dprintso(so, 1, ("sobind: allocating unbind_req\n"));
/* NOTE: holding so_lock while sleeping */
- so->so_unbind_mp =
+ sti->sti_unbind_mp =
soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP);
}
@@ -405,17 +693,17 @@ sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
ASSERT(name == NULL && namelen == 0);
if (so->so_family == AF_UNIX) {
- ASSERT(so->so_ux_bound_vp);
- addr = &so->so_ux_laddr;
- addrlen = (t_uscalar_t)sizeof (so->so_ux_laddr);
+ ASSERT(sti->sti_ux_bound_vp);
+ addr = &sti->sti_ux_laddr;
+ addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
dprintso(so, 1, ("sobind rebind UNIX: addrlen %d, "
"addr 0x%p, vp %p\n",
addrlen,
(void *)((struct so_ux_addr *)addr)->soua_vp,
- (void *)so->so_ux_bound_vp));
+ (void *)sti->sti_ux_bound_vp));
} else {
- addr = so->so_laddr_sa;
- addrlen = (t_uscalar_t)so->so_laddr_len;
+ addr = sti->sti_laddr_sa;
+ addrlen = (t_uscalar_t)sti->sti_laddr_len;
}
} else if (flags & _SOBIND_UNSPEC) {
ASSERT(name == NULL && namelen == 0);
@@ -436,21 +724,21 @@ sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
* Use an address with same size as struct sockaddr
* just like BSD.
*/
- so->so_laddr_len =
+ sti->sti_laddr_len =
(socklen_t)sizeof (struct sockaddr);
- ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
- bzero(so->so_laddr_sa, so->so_laddr_len);
- so->so_laddr_sa->sa_family = so->so_family;
+ ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
+ bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
+ sti->sti_laddr_sa->sa_family = so->so_family;
/*
* Pass down an address with the implicit bind
* magic number and the rest all zeros.
* The transport will return a unique address.
*/
- so->so_ux_laddr.soua_vp = NULL;
- so->so_ux_laddr.soua_magic = SOU_MAGIC_IMPLICIT;
- addr = &so->so_ux_laddr;
- addrlen = (t_uscalar_t)sizeof (so->so_ux_laddr);
+ sti->sti_ux_laddr.soua_vp = NULL;
+ sti->sti_ux_laddr.soua_magic = SOU_MAGIC_IMPLICIT;
+ addr = &sti->sti_ux_laddr;
+ addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
break;
case AF_INET:
@@ -459,12 +747,12 @@ sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
* An unspecified bind in TPI has a NULL address.
* Set the address in sockfs to have the sa_family.
*/
- so->so_laddr_len = (so->so_family == AF_INET) ?
+ sti->sti_laddr_len = (so->so_family == AF_INET) ?
(socklen_t)sizeof (sin_t) :
(socklen_t)sizeof (sin6_t);
- ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
- bzero(so->so_laddr_sa, so->so_laddr_len);
- so->so_laddr_sa->sa_family = so->so_family;
+ ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
+ bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
+ sti->sti_laddr_sa->sa_family = so->so_family;
addr = NULL;
addrlen = 0;
break;
@@ -478,8 +766,8 @@ sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
* protocol families. For example, AF_X25 does not
* have a family field.
*/
- bzero(so->so_laddr_sa, so->so_laddr_len);
- so->so_laddr_len = 0; /* XXX correct? */
+ bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
+ sti->sti_laddr_len = 0; /* XXX correct? */
addr = NULL;
addrlen = 0;
break;
@@ -525,6 +813,7 @@ sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
goto done;
}
}
+
/* X/Open requires this check */
if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
if (xnet_check_print) {
@@ -656,7 +945,7 @@ sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
break;
}
- if (namelen > (t_uscalar_t)so->so_laddr_maxlen) {
+ if (namelen > (t_uscalar_t)sti->sti_laddr_maxlen) {
error = ENAMETOOLONG;
eprintsoline(so, error);
goto done;
@@ -664,26 +953,26 @@ sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
/*
* Save local address.
*/
- so->so_laddr_len = (socklen_t)namelen;
- ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
- bcopy(name, so->so_laddr_sa, namelen);
+ sti->sti_laddr_len = (socklen_t)namelen;
+ ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
+ bcopy(name, sti->sti_laddr_sa, namelen);
- addr = so->so_laddr_sa;
- addrlen = (t_uscalar_t)so->so_laddr_len;
+ addr = sti->sti_laddr_sa;
+ addrlen = (t_uscalar_t)sti->sti_laddr_len;
switch (so->so_family) {
case AF_INET6:
case AF_INET:
break;
case AF_UNIX: {
struct sockaddr_un *soun =
- (struct sockaddr_un *)so->so_laddr_sa;
+ (struct sockaddr_un *)sti->sti_laddr_sa;
struct vnode *vp, *rvp;
struct vattr vattr;
- ASSERT(so->so_ux_bound_vp == NULL);
+ ASSERT(sti->sti_ux_bound_vp == NULL);
/*
* Create vnode for the specified path name.
- * Keep vnode held with a reference in so_ux_bound_vp.
+ * Keep vnode held with a reference in sti_ux_bound_vp.
* Use the vnode pointer as the address used in the
* bind with the transport.
*
@@ -691,7 +980,7 @@ sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
* not observe the umask.
*/
/* MAXPATHLEN + soun_family + nul termination */
- if (so->so_laddr_len >
+ if (sti->sti_laddr_len >
(socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
error = ENAMETOOLONG;
eprintsoline(so, error);
@@ -712,7 +1001,7 @@ sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
/*
* Establish pointer from the underlying filesystem
* vnode to the socket node.
- * so_ux_bound_vp and v_stream->sd_vnode form the
+ * sti_ux_bound_vp and v_stream->sd_vnode form the
* cross-linkage between the underlying filesystem
* node and the socket node.
*/
@@ -726,7 +1015,7 @@ sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
ASSERT(SOTOV(so)->v_stream);
mutex_enter(&vp->v_lock);
vp->v_stream = SOTOV(so)->v_stream;
- so->so_ux_bound_vp = vp;
+ sti->sti_ux_bound_vp = vp;
mutex_exit(&vp->v_lock);
/*
@@ -734,13 +1023,14 @@ sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
* (together with the magic number to avoid conflicts
* with implicit binds) in the transport provider.
*/
- so->so_ux_laddr.soua_vp = (void *)so->so_ux_bound_vp;
- so->so_ux_laddr.soua_magic = SOU_MAGIC_EXPLICIT;
- addr = &so->so_ux_laddr;
- addrlen = (t_uscalar_t)sizeof (so->so_ux_laddr);
+ sti->sti_ux_laddr.soua_vp =
+ (void *)sti->sti_ux_bound_vp;
+ sti->sti_ux_laddr.soua_magic = SOU_MAGIC_EXPLICIT;
+ addr = &sti->sti_ux_laddr;
+ addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
dprintso(so, 1, ("sobind UNIX: addrlen %d, addr %p\n",
addrlen,
- ((struct so_ux_addr *)addr)->soua_vp));
+ (void *)((struct so_ux_addr *)addr)->soua_vp));
break;
}
} /* end switch (so->so_family) */
@@ -771,14 +1061,14 @@ sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
if (nl7c_enabled && ((addr != NULL &&
(so->so_family == AF_INET || so->so_family == AF_INET6) &&
(nl7c = nl7c_lookup_addr(addr, addrlen))) ||
- so->so_nl7c_flags == NL7C_AF_NCA)) {
+ sti->sti_nl7c_flags == NL7C_AF_NCA)) {
/*
* NL7C is not supported in non-global zones,
* we enforce this restriction here.
*/
if (so->so_zoneid == GLOBAL_ZONEID) {
/* An NL7C socket, mark it */
- so->so_nl7c_flags |= NL7C_ENABLED;
+ sti->sti_nl7c_flags |= NL7C_ENABLED;
if (nl7c == NULL) {
/*
* Was an AF_NCA bind() so add it to the
@@ -789,6 +1079,7 @@ sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
} else
nl7c = NULL;
}
+
/*
* We send a T_BIND_REQ for TCP/UDP since we know it supports it,
* for other transports we will send in a O_T_BIND_REQ.
@@ -804,9 +1095,9 @@ sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
/* NOTE: holding so_lock while sleeping */
mp = soallocproto2(&bind_req, sizeof (bind_req),
addr, addrlen, 0, _ALLOC_SLEEP);
- so->so_state &= ~SS_LADDR_VALID;
+ sti->sti_laddr_valid = 0;
- /* Done using so_laddr_sa - can drop the lock */
+ /* Done using sti_laddr_sa - can drop the lock */
mutex_exit(&so->so_lock);
/*
@@ -820,13 +1111,15 @@ sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
(so->so_family == AF_INET || so->so_family == AF_INET6) &&
so->so_type == SOCK_STREAM) {
- if (so->so_kssl_ent != NULL) {
- kssl_release_ent(so->so_kssl_ent, so, so->so_kssl_type);
- so->so_kssl_ent = NULL;
+ if (sti->sti_kssl_ent != NULL) {
+ kssl_release_ent(sti->sti_kssl_ent, so,
+ sti->sti_kssl_type);
+ sti->sti_kssl_ent = NULL;
}
- so->so_kssl_type = kssl_check_proxy(mp, so, &so->so_kssl_ent);
- switch (so->so_kssl_type) {
+ sti->sti_kssl_type = kssl_check_proxy(mp, so,
+ &sti->sti_kssl_ent);
+ switch (sti->sti_kssl_type) {
case KSSL_NO_PROXY:
break;
@@ -865,11 +1158,11 @@ skip_transport:
/* Mark as bound. This will be undone if we detect errors below. */
if (flags & _SOBIND_NOXLATE) {
ASSERT(so->so_family == AF_UNIX);
- so->so_state |= SS_FADDR_NOXLATE;
+ sti->sti_faddr_noxlate = 1;
}
ASSERT(!(so->so_state & SS_ISBOUND) || (flags & _SOBIND_REBIND));
so->so_state |= SS_ISBOUND;
- ASSERT(so->so_unbind_mp);
+ ASSERT(sti->sti_unbind_mp);
/* note that we've already set SS_ACCEPTCONN above */
@@ -879,7 +1172,7 @@ skip_transport:
* in return.
*/
addrlen = (t_uscalar_t)(so->so_family == AF_UNIX ?
- sizeof (so->so_ux_laddr) : so->so_laddr_len);
+ sizeof (sti->sti_ux_laddr) : sti->sti_laddr_len);
bind_ack = (struct T_bind_ack *)mp->b_rptr;
/*
@@ -965,7 +1258,7 @@ skip_transport:
sin_t *rname, *aname;
rname = (sin_t *)addr;
- aname = (sin_t *)so->so_laddr_sa;
+ aname = (sin_t *)sti->sti_laddr_sa;
/*
* Take advantage of the alignment
@@ -990,7 +1283,7 @@ skip_transport:
*/
if (aname->sin_port == 0)
aname->sin_port = rname->sin_port;
- so->so_state |= SS_LADDR_VALID;
+ sti->sti_laddr_valid = 1;
break;
}
if (aname->sin_port != 0 &&
@@ -1031,31 +1324,31 @@ skip_transport:
break;
}
case AF_UNIX:
- if (bcmp(addr, &so->so_ux_laddr, addrlen) != 0) {
+ if (bcmp(addr, &sti->sti_ux_laddr, addrlen) != 0) {
freemsg(mp);
error = EADDRINUSE;
eprintsoline(so, error);
eprintso(so,
("addrlen %d, addr 0x%x, vp %p\n",
addrlen, *((int *)addr),
- (void *)so->so_ux_bound_vp));
+ (void *)sti->sti_ux_bound_vp));
goto done;
}
- so->so_state |= SS_LADDR_VALID;
+ sti->sti_laddr_valid = 1;
break;
default:
/*
* NOTE: This assumes that addresses can be
* byte-compared for equivalence.
*/
- if (bcmp(addr, so->so_laddr_sa, addrlen) != 0) {
+ if (bcmp(addr, sti->sti_laddr_sa, addrlen) != 0) {
freemsg(mp);
error = EADDRINUSE;
eprintsoline(so, error);
goto done;
}
/*
- * Don't mark SS_LADDR_VALID, as we cannot be
+ * Don't mark sti_laddr_valid, as we cannot be
* sure that the returned address is the real
* bound address when talking to an unknown
* transport.
@@ -1071,8 +1364,8 @@ skip_transport:
* caching info here is much better performance than
* a TPI/STREAMS trip to the transport for getsockname.
* Any which can't for some reason _must_ _not_ set
- * LADDR_VALID here for the caching version of getsockname
- * to not break;
+ * sti_laddr_valid here for the caching version of
+ * getsockname to not break;
*/
switch (so->so_family) {
case AF_UNIX:
@@ -1080,18 +1373,18 @@ skip_transport:
* Record the address bound with the transport
* for use by socketpair.
*/
- bcopy(addr, &so->so_ux_laddr, addrlen);
- so->so_state |= SS_LADDR_VALID;
+ bcopy(addr, &sti->sti_ux_laddr, addrlen);
+ sti->sti_laddr_valid = 1;
break;
case AF_INET:
case AF_INET6:
- ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
- bcopy(addr, so->so_laddr_sa, so->so_laddr_len);
- so->so_state |= SS_LADDR_VALID;
+ ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
+ bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
+ sti->sti_laddr_valid = 1;
break;
default:
/*
- * Don't mark SS_LADDR_VALID, as we cannot be
+ * Don't mark sti_laddr_valid, as we cannot be
* sure that the returned address is the real
* bound address when talking to an unknown
* transport.
@@ -1131,7 +1424,6 @@ done:
so_unlock_single(so, SOLOCKED);
mutex_exit(&so->so_lock);
} else {
- /* If the caller held the lock don't release it here */
ASSERT(MUTEX_HELD(&so->so_lock));
ASSERT(so->so_flag & SOLOCKED);
}
@@ -1141,13 +1433,13 @@ done:
/* Bind the socket: forwards to sotpi_bindlisten() with a backlog of 0, or 1 for _SOBIND_SOCKETPAIR. */
static int
sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
- int flags)
+ int flags, struct cred *cr)
{
if ((flags & _SOBIND_SOCKETPAIR) == 0)
- return (sotpi_bindlisten(so, name, namelen, 0, flags));
+ return (sotpi_bindlisten(so, name, namelen, 0, flags, cr)); /* plain bind: backlog 0 */
flags &= ~_SOBIND_SOCKETPAIR; /* flag is consumed here and not passed down */
- return (sotpi_bindlisten(so, name, namelen, 1, flags));
+ return (sotpi_bindlisten(so, name, namelen, 1, flags, cr)); /* socketpair case: backlog 1 */
}
/*
@@ -1162,6 +1454,7 @@ sotpi_unbind(struct sonode *so, int flags)
struct T_unbind_req unbind_req;
int error = 0;
mblk_t *mp;
+ sotpi_info_t *sti = SOTOTPI(so);
dprintso(so, 1, ("sotpi_unbind(%p, 0x%x) %s\n",
(void *)so, flags, pr_state(so->so_state, so->so_mode)));
@@ -1211,26 +1504,26 @@ sotpi_unbind(struct sonode *so, int flags)
*/
vnode_t *vp;
- if ((vp = so->so_ux_bound_vp) != NULL) {
+ if ((vp = sti->sti_ux_bound_vp) != NULL) {
/* Undo any SSL proxy setup */
if ((so->so_family == AF_INET ||
so->so_family == AF_INET6) &&
(so->so_type == SOCK_STREAM) &&
- (so->so_kssl_ent != NULL)) {
- kssl_release_ent(so->so_kssl_ent, so,
- so->so_kssl_type);
- so->so_kssl_ent = NULL;
- so->so_kssl_type = KSSL_NO_PROXY;
+ (sti->sti_kssl_ent != NULL)) {
+ kssl_release_ent(sti->sti_kssl_ent, so,
+ sti->sti_kssl_type);
+ sti->sti_kssl_ent = NULL;
+ sti->sti_kssl_type = KSSL_NO_PROXY;
}
-
- so->so_ux_bound_vp = NULL;
+ sti->sti_ux_bound_vp = NULL;
vn_rele_stream(vp);
}
/* Clear out address */
- so->so_laddr_len = 0;
+ sti->sti_laddr_len = 0;
}
- so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN|SS_LADDR_VALID);
+ so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN);
+ sti->sti_laddr_valid = 0;
done:
@@ -1246,15 +1539,17 @@ done:
* For TPI conforming transports this has to first unbind with the transport
* and then bind again using the new backlog.
*/
+/* ARGSUSED */
int
-sotpi_listen(struct sonode *so, int backlog)
+sotpi_listen(struct sonode *so, int backlog, struct cred *cr)
{
int error = 0;
+ sotpi_info_t *sti = SOTOTPI(so);
dprintso(so, 1, ("sotpi_listen(%p, %d) %s\n",
(void *)so, backlog, pr_state(so->so_state, so->so_mode)));
- if (so->so_serv_type == T_CLTS)
+ if (sti->sti_serv_type == T_CLTS)
return (EOPNOTSUPP);
/*
@@ -1276,24 +1571,6 @@ sotpi_listen(struct sonode *so, int backlog)
mutex_enter(&so->so_lock);
so_lock_single(so); /* Set SOLOCKED */
- if (backlog < 0)
- backlog = 0;
- /*
- * Use the same qlimit as in BSD. BSD checks the qlimit
- * before queuing the next connection implying that a
- * listen(sock, 0) allows one connection to be queued.
- * BSD also uses 1.5 times the requested backlog.
- *
- * XNS Issue 4 required a strict interpretation of the backlog.
- * This has been waived subsequently for Issue 4 and the change
- * incorporated in XNS Issue 5. So we aren't required to do
- * anything special for XPG apps.
- */
- if (backlog >= (INT_MAX - 1) / 3)
- backlog = INT_MAX;
- else
- backlog = backlog * 3 / 2 + 1;
-
/*
* If the listen doesn't change the backlog we do nothing.
* This avoids an EPROTO error from the transport.
@@ -1311,7 +1588,7 @@ sotpi_listen(struct sonode *so, int backlog)
goto done;
}
error = sotpi_bindlisten(so, NULL, 0, backlog,
- _SOBIND_UNSPEC|_SOBIND_LOCK_HELD|_SOBIND_LISTEN);
+ _SOBIND_UNSPEC|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
} else if (backlog > 0) {
/*
* AF_INET{,6} hack to avoid losing the port.
@@ -1327,7 +1604,7 @@ sotpi_listen(struct sonode *so, int backlog)
goto done;
}
error = sotpi_bindlisten(so, NULL, 0, backlog,
- _SOBIND_REBIND|_SOBIND_LOCK_HELD|_SOBIND_LISTEN);
+ _SOBIND_REBIND|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
} else {
so->so_state |= SS_ACCEPTCONN;
so->so_backlog = backlog;
@@ -1349,7 +1626,7 @@ done:
* the current use of sodisconnect(seqno == -1) is only for shutdown
* so there is no point (and potentially incorrect) to unbind.
*/
-int
+static int
sodisconnect(struct sonode *so, t_scalar_t seqno, int flags)
{
struct T_discon_req discon_req;
@@ -1406,8 +1683,9 @@ sodisconnect(struct sonode *so, t_scalar_t seqno, int flags)
* is allowed to complete. However, it is not possible to
* assert that SS_ISCONNECTED|SS_ISCONNECTING are set.
*/
- so->so_state &=
- ~(SS_ISCONNECTED|SS_ISCONNECTING|SS_LADDR_VALID|SS_FADDR_VALID);
+ so->so_state &= ~(SS_ISCONNECTED|SS_ISCONNECTING);
+ SOTOTPI(so)->sti_laddr_valid = 0;
+ SOTOTPI(so)->sti_faddr_valid = 0;
done:
if (!(flags & _SODISCONNECT_LOCK_HELD)) {
so_unlock_single(so, SOLOCKED);
@@ -1420,8 +1698,10 @@ done:
return (error);
}
+/* ARGSUSED */
int
-sotpi_accept(struct sonode *so, int fflag, struct sonode **nsop)
+sotpi_accept(struct sonode *so, int fflag, struct cred *cr,
+ struct sonode **nsop)
{
struct T_conn_ind *conn_ind;
struct T_conn_res *conn_res;
@@ -1436,6 +1716,8 @@ sotpi_accept(struct sonode *so, int fflag, struct sonode **nsop)
t_scalar_t PRIM_type;
t_scalar_t SEQ_number;
size_t sinlen;
+ sotpi_info_t *sti = SOTOTPI(so);
+ sotpi_info_t *nsti;
dprintso(so, 1, ("sotpi_accept(%p, 0x%x, %p) %s\n",
(void *)so, fflag, (void *)nsop,
@@ -1454,7 +1736,7 @@ again:
if ((error = sowaitconnind(so, fflag, &mp)) != 0)
goto e_bad;
- ASSERT(mp);
+ ASSERT(mp != NULL);
conn_ind = (struct T_conn_ind *)mp->b_rptr;
ctxmp = mp->b_cont;
@@ -1475,8 +1757,7 @@ again:
switch (so->so_family) {
case AF_INET:
case AF_INET6:
- if ((optlen == sizeof (intptr_t)) &&
- ((so->so_state & SS_DIRECT) != 0)) {
+ if ((optlen == sizeof (intptr_t)) && (sti->sti_direct != 0)) {
bcopy(mp->b_rptr + conn_ind->OPT_offset,
&opt, conn_ind->OPT_length);
} else {
@@ -1489,7 +1770,7 @@ again:
* problems when sockfs sends a normal T_CONN_RES
* message down the new stream.
*/
- if (so->so_state & SS_DIRECT) {
+ if (sti->sti_direct) {
int rval;
/*
* For consistency we inform tcp to disable
@@ -1498,7 +1779,7 @@ again:
* because no data will ever travel upstream
* on the listening socket.
*/
- so->so_state &= ~SS_DIRECT;
+ sti->sti_direct = 0;
(void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK,
0, 0, K_TO_K, CRED(), &rval);
}
@@ -1519,7 +1800,7 @@ again:
}
}
if (so->so_family == AF_UNIX) {
- if (!(so->so_state & SS_FADDR_NOXLATE)) {
+ if (!sti->sti_faddr_noxlate) {
src = NULL;
srclen = 0;
}
@@ -1533,9 +1814,7 @@ again:
/*
* Create the new socket.
*/
- VN_HOLD(so->so_accessvp);
- nso = sotpi_create(so->so_accessvp, so->so_family, so->so_type,
- so->so_protocol, so->so_version, so, &error);
+ nso = socket_newconn(so, NULL, NULL, SOCKET_SLEEP, &error);
if (nso == NULL) {
ASSERT(error != 0);
/*
@@ -1549,6 +1828,7 @@ again:
goto e_disc_unl;
}
nvp = SOTOV(nso);
+ nsti = SOTOTPI(nso);
/*
* If the transport sent up an SSL connection context, then attach
@@ -1561,7 +1841,7 @@ again:
* This kssl_ctx_t is already held for us by the transport.
* So, we don't need to do a kssl_hold_ctx() here.
*/
- nso->so_kssl_ctx = *((kssl_ctx_t *)ctxmp->b_rptr);
+ nsti->sti_kssl_ctx = *((kssl_ctx_t *)ctxmp->b_rptr);
freemsg(ctxmp);
mp->b_cont = NULL;
strsetrwputdatahooks(nvp, strsock_kssl_input,
@@ -1572,7 +1852,6 @@ again:
mutex_enter(nso->so_direct->sod_lockp);
SOD_DISABLE(nso->so_direct);
mutex_exit(nso->so_direct->sod_lockp);
- nso->so_direct = NULL;
}
}
#ifdef DEBUG
@@ -1591,16 +1870,16 @@ again:
* NOTE: AF_UNIX NUL termination is ensured by the sender's
* copyin_name().
*/
- if (srclen > (t_uscalar_t)nso->so_faddr_maxlen) {
+ if (srclen > (t_uscalar_t)nsti->sti_faddr_maxlen) {
error = EINVAL;
freemsg(mp);
eprintsoline(so, error);
goto disconnect_vp_unlocked;
}
- nso->so_faddr_len = (socklen_t)srclen;
- ASSERT(so->so_faddr_len <= so->so_faddr_maxlen);
- bcopy(src, nso->so_faddr_sa, srclen);
- nso->so_state |= SS_FADDR_VALID;
+ nsti->sti_faddr_len = (socklen_t)srclen;
+ ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
+ bcopy(src, nsti->sti_faddr_sa, srclen);
+ nsti->sti_faddr_valid = 1;
if ((DB_REF(mp) > 1) || MBLKSIZE(mp) <
(sizeof (struct T_conn_res) + sizeof (intptr_t))) {
@@ -1654,7 +1933,8 @@ again:
mutex_exit(&nso->so_lock);
} else {
/* Perform NULL bind with the transport provider. */
- if ((error = sotpi_bind(nso, NULL, 0, _SOBIND_UNSPEC)) != 0) {
+ if ((error = sotpi_bind(nso, NULL, 0, _SOBIND_UNSPEC,
+ cr)) != 0) {
ASSERT(error != ENOBUFS);
freemsg(mp);
eprintsoline(nso, error);
@@ -1671,7 +1951,8 @@ again:
* can access the new socket thus we relax the locking.
*/
nso->so_pgrp = so->so_pgrp;
- nso->so_state |= so->so_state & (SS_ASYNC|SS_FADDR_NOXLATE);
+ nso->so_state |= so->so_state & SS_ASYNC;
+ nsti->sti_faddr_noxlate = sti->sti_faddr_noxlate;
if (nso->so_pgrp != 0) {
if ((error = so_set_events(nso, nvp, CRED())) != 0) {
@@ -1695,7 +1976,12 @@ again:
if (nso->so_options & SO_LINGER)
nso->so_linger = so->so_linger;
- if ((so->so_state & SS_DIRECT) != 0) {
+ /*
+ * Note that the following sti_direct code path should be
+ * removed once we are confident that the direct sockets
+ * do not result in any degradation.
+ */
+ if (sti->sti_direct) {
ASSERT(opt != NULL);
@@ -1731,22 +2017,23 @@ again:
sin = (sin_t *)(ack_mp->b_rptr +
sizeof (struct T_ok_ack));
- bcopy(sin, nso->so_laddr_sa, sizeof (sin_t));
- nso->so_laddr_len = sizeof (sin_t);
+ bcopy(sin, nsti->sti_laddr_sa, sizeof (sin_t));
+ nsti->sti_laddr_len = sizeof (sin_t);
} else {
sin6_t *sin6;
sin6 = (sin6_t *)(ack_mp->b_rptr +
sizeof (struct T_ok_ack));
- bcopy(sin6, nso->so_laddr_sa, sizeof (sin6_t));
- nso->so_laddr_len = sizeof (sin6_t);
+ bcopy(sin6, nsti->sti_laddr_sa, sizeof (sin6_t));
+ nsti->sti_laddr_len = sizeof (sin6_t);
}
freemsg(ack_mp);
- nso->so_state |= SS_ISCONNECTED | SS_LADDR_VALID;
- nso->so_priv = opt;
+ nso->so_state |= SS_ISCONNECTED;
+ nso->so_proto_handle = (sock_lower_handle_t)opt;
+ nsti->sti_laddr_valid = 1;
- if (so->so_nl7c_flags & NL7C_ENABLED) {
+ if (sti->sti_nl7c_flags & NL7C_ENABLED) {
/*
* A NL7C marked listen()er so the new socket
* inherits the listen()er's NL7C state, except
@@ -1755,14 +2042,15 @@ again:
* Only call NL7C to process the new socket if
* the listen socket allows blocking i/o.
*/
- nso->so_nl7c_flags = so->so_nl7c_flags & (~NL7C_POLLIN);
+ nsti->sti_nl7c_flags =
+ sti->sti_nl7c_flags & (~NL7C_POLLIN);
if (so->so_state & (SS_NONBLOCK|SS_NDELAY)) {
/*
* Nonblocking accept() just make it
* persist to defer processing to the
* read-side syscall (e.g. read).
*/
- nso->so_nl7c_flags |= NL7C_SOPERSIST;
+ nsti->sti_nl7c_flags |= NL7C_SOPERSIST;
} else if (nl7c_process(nso, B_FALSE)) {
/*
* NL7C has completed processing on the
@@ -1782,12 +2070,12 @@ again:
/*
* It's possible, through the use of autopush for example,
- * that the acceptor stream may not support SS_DIRECT
- * semantics. If the new socket does not support SS_DIRECT
+ * that the acceptor stream may not support sti_direct
+ * semantics. If the new socket does not support sti_direct
* we issue a _SIOCSOCKFALLBACK to inform the transport
* as we would in the I_PUSH case.
*/
- if (!(nso->so_state & SS_DIRECT)) {
+ if (nsti->sti_direct == 0) {
int rval;
if ((error = strioctl(SOTOV(nso), _SIOCSOCKFALLBACK,
@@ -1842,7 +2130,7 @@ again:
conn_res->PRIM_type = O_T_CONN_RES;
PRIM_type = O_T_CONN_RES;
} else {
- conn_res->ACCEPTOR_id = nso->so_acceptor_id;
+ conn_res->ACCEPTOR_id = nsti->sti_acceptor_id;
conn_res->PRIM_type = T_CONN_RES;
PRIM_type = T_CONN_RES;
}
@@ -1871,27 +2159,28 @@ again:
* If there is a sin/sin6 appended onto the T_OK_ACK use
* that to set the local address. If this is not present
* then we zero out the address and don't set the
- * SS_LADDR_VALID bit. For AF_UNIX endpoints we copy over
+ * sti_laddr_valid bit. For AF_UNIX endpoints we copy over
* the pathname from the listening socket.
*/
sinlen = (nso->so_family == AF_INET) ? sizeof (sin_t) : sizeof (sin6_t);
if ((nso->so_family == AF_INET) || (nso->so_family == AF_INET6) &&
MBLKL(ack_mp) == (sizeof (struct T_ok_ack) + sinlen)) {
ack_mp->b_rptr += sizeof (struct T_ok_ack);
- bcopy(ack_mp->b_rptr, nso->so_laddr_sa, sinlen);
- nso->so_laddr_len = sinlen;
- nso->so_state |= SS_LADDR_VALID;
+ bcopy(ack_mp->b_rptr, nsti->sti_laddr_sa, sinlen);
+ nsti->sti_laddr_len = sinlen;
+ nsti->sti_laddr_valid = 1;
} else if (nso->so_family == AF_UNIX) {
ASSERT(so->so_family == AF_UNIX);
- nso->so_laddr_len = so->so_laddr_len;
- ASSERT(nso->so_laddr_len <= nso->so_laddr_maxlen);
- bcopy(so->so_laddr_sa, nso->so_laddr_sa, nso->so_laddr_len);
- nso->so_state |= SS_LADDR_VALID;
+ nsti->sti_laddr_len = sti->sti_laddr_len;
+ ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
+ bcopy(sti->sti_laddr_sa, nsti->sti_laddr_sa,
+ nsti->sti_laddr_len);
+ nsti->sti_laddr_valid = 1;
} else {
- nso->so_laddr_len = so->so_laddr_len;
- ASSERT(nso->so_laddr_len <= nso->so_laddr_maxlen);
- bzero(nso->so_laddr_sa, nso->so_addr_size);
- nso->so_laddr_sa->sa_family = nso->so_family;
+ nsti->sti_laddr_len = sti->sti_laddr_len;
+ ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
+ bzero(nsti->sti_laddr_sa, nsti->sti_addr_size);
+ nsti->sti_laddr_sa->sa_family = nso->so_family;
}
freemsg(ack_mp);
@@ -1953,7 +2242,8 @@ sotpi_connect(struct sonode *so,
const struct sockaddr *name,
socklen_t namelen,
int fflag,
- int flags)
+ int flags,
+ struct cred *cr)
{
struct T_conn_req conn_req;
int error = 0;
@@ -1963,6 +2253,7 @@ sotpi_connect(struct sonode *so,
void *addr;
socklen_t addrlen;
boolean_t need_unlock;
+ sotpi_info_t *sti = SOTOTPI(so);
dprintso(so, 1, ("sotpi_connect(%p, %p, %d, 0x%x, 0x%x) %s\n",
(void *)so, (void *)name, namelen, fflag, flags,
@@ -1971,13 +2262,13 @@ sotpi_connect(struct sonode *so,
/*
* Preallocate the T_CONN_REQ mblk before grabbing SOLOCKED to
* avoid sleeping for memory with SOLOCKED held.
- * We know that the T_CONN_REQ can't be larger than 2 * so_faddr_maxlen
+ * We know that the T_CONN_REQ can't be larger than 2 * sti_faddr_maxlen
* + sizeof (struct T_opthdr).
* (the AF_UNIX so_ux_addr_xlate() does not make the address
- * exceed so_faddr_maxlen).
+ * exceed sti_faddr_maxlen).
*/
mp = soallocproto(sizeof (struct T_conn_req) +
- 2 * so->so_faddr_maxlen + sizeof (struct T_opthdr), _ALLOC_INTR);
+ 2 * sti->sti_faddr_maxlen + sizeof (struct T_opthdr), _ALLOC_INTR);
if (mp == NULL) {
/*
* Connect can not fail with ENOBUFS. A signal was
@@ -2001,12 +2292,12 @@ sotpi_connect(struct sonode *so,
so_lock_single(so); /* Set SOLOCKED */
need_unlock = B_TRUE;
- if (so->so_unbind_mp == NULL) {
+ if (sti->sti_unbind_mp == NULL) {
dprintso(so, 1, ("sotpi_connect: allocating unbind_req\n"));
/* NOTE: holding so_lock while sleeping */
- so->so_unbind_mp =
+ sti->sti_unbind_mp =
soallocproto(sizeof (struct T_unbind_req), _ALLOC_INTR);
- if (so->so_unbind_mp == NULL) {
+ if (sti->sti_unbind_mp == NULL) {
error = EINTR;
goto done;
}
@@ -2034,7 +2325,7 @@ sotpi_connect(struct sonode *so,
so_automatic_bind(so);
} else {
error = sotpi_bind(so, NULL, 0,
- _SOBIND_UNSPEC|_SOBIND_LOCK_HELD);
+ _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
if (error)
goto done;
}
@@ -2088,17 +2379,19 @@ sotpi_connect(struct sonode *so,
_SODISCONNECT_LOCK_HELD);
} else {
so->so_state &=
- ~(SS_ISCONNECTED | SS_ISCONNECTING |
- SS_FADDR_VALID);
- so->so_faddr_len = 0;
+ ~(SS_ISCONNECTED | SS_ISCONNECTING);
+ sti->sti_faddr_valid = 0;
+ sti->sti_faddr_len = 0;
}
+ /* Remove SOLOCKED since setsockopt will grab it */
so_unlock_single(so, SOLOCKED);
mutex_exit(&so->so_lock);
val = 0;
- (void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
- &val, (t_uscalar_t)sizeof (val));
+ (void) sotpi_setsockopt(so, SOL_SOCKET,
+ SO_DGRAM_ERRIND, &val, (t_uscalar_t)sizeof (val),
+ cr);
mutex_enter(&so->so_lock);
so_lock_single(so); /* Set SOLOCKED */
@@ -2112,7 +2405,7 @@ sotpi_connect(struct sonode *so,
goto done;
}
/*
- * Mark the socket if so_faddr_sa represents the transport level
+ * Mark the socket if sti_faddr_sa represents the transport level
* address.
*/
if (flags & _SOCONNECT_NOXLATE) {
@@ -2126,7 +2419,7 @@ sotpi_connect(struct sonode *so,
soaddr_ux = (struct sockaddr_ux *)name;
name = (struct sockaddr *)&soaddr_ux->sou_addr;
namelen = sizeof (soaddr_ux->sou_addr);
- so->so_state |= SS_FADDR_NOXLATE;
+ sti->sti_faddr_noxlate = 1;
}
/*
@@ -2141,46 +2434,46 @@ sotpi_connect(struct sonode *so,
* transport providers that do not support TI_GETPEERNAME.
* Also used for cached foreign address for TCP and UDP.
*/
- if (namelen > (t_uscalar_t)so->so_faddr_maxlen) {
+ if (namelen > (t_uscalar_t)sti->sti_faddr_maxlen) {
error = EINVAL;
goto done;
}
- so->so_faddr_len = (socklen_t)namelen;
- ASSERT(so->so_faddr_len <= so->so_faddr_maxlen);
- bcopy(name, so->so_faddr_sa, namelen);
- so->so_state |= SS_FADDR_VALID;
+ sti->sti_faddr_len = (socklen_t)namelen;
+ ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
+ bcopy(name, sti->sti_faddr_sa, namelen);
+ sti->sti_faddr_valid = 1;
if (so->so_family == AF_UNIX) {
- if (so->so_state & SS_FADDR_NOXLATE) {
+ if (sti->sti_faddr_noxlate) {
/*
* Already have a transport internal address. Do not
* pass any (transport internal) source address.
*/
- addr = so->so_faddr_sa;
- addrlen = (t_uscalar_t)so->so_faddr_len;
+ addr = sti->sti_faddr_sa;
+ addrlen = (t_uscalar_t)sti->sti_faddr_len;
src = NULL;
srclen = 0;
} else {
/*
* Pass the sockaddr_un source address as an option
* and translate the remote address.
- * Holding so_lock thus so_laddr_sa can not change.
+ * Holding so_lock thus sti_laddr_sa can not change.
*/
- src = so->so_laddr_sa;
- srclen = (t_uscalar_t)so->so_laddr_len;
+ src = sti->sti_laddr_sa;
+ srclen = (t_uscalar_t)sti->sti_laddr_len;
dprintso(so, 1,
("sotpi_connect UNIX: srclen %d, src %p\n",
srclen, src));
error = so_ux_addr_xlate(so,
- so->so_faddr_sa, (socklen_t)so->so_faddr_len,
+ sti->sti_faddr_sa, (socklen_t)sti->sti_faddr_len,
(flags & _SOCONNECT_XPG4_2),
&addr, &addrlen);
if (error)
goto bad;
}
} else {
- addr = so->so_faddr_sa;
- addrlen = (t_uscalar_t)so->so_faddr_len;
+ addr = sti->sti_faddr_sa;
+ addrlen = (t_uscalar_t)sti->sti_faddr_len;
src = NULL;
srclen = 0;
}
@@ -2209,7 +2502,7 @@ sotpi_connect(struct sonode *so,
val = 1;
(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
- &val, (t_uscalar_t)sizeof (val));
+ &val, (t_uscalar_t)sizeof (val), cr);
mutex_enter(&so->so_lock);
so_lock_single(so); /* Set SOLOCKED */
@@ -2225,8 +2518,8 @@ sotpi_connect(struct sonode *so,
*/
fflag = 0;
ASSERT(so->so_family != AF_UNIX);
- so->so_state &= ~SS_LADDR_VALID;
- } else if (so->so_laddr_len != 0) {
+ sti->sti_laddr_valid = 0;
+ } else if (sti->sti_laddr_len != 0) {
/*
* If the local address or port was "any" then it may be
* changed by the transport as a result of the
@@ -2234,21 +2527,22 @@ sotpi_connect(struct sonode *so,
*/
switch (so->so_family) {
case AF_INET:
- ASSERT(so->so_laddr_len == (socklen_t)sizeof (sin_t));
- if (((sin_t *)so->so_laddr_sa)->sin_addr.s_addr ==
+ ASSERT(sti->sti_laddr_len == (socklen_t)sizeof (sin_t));
+ if (((sin_t *)sti->sti_laddr_sa)->sin_addr.s_addr ==
INADDR_ANY ||
- ((sin_t *)so->so_laddr_sa)->sin_port == 0)
- so->so_state &= ~SS_LADDR_VALID;
+ ((sin_t *)sti->sti_laddr_sa)->sin_port == 0)
+ sti->sti_laddr_valid = 0;
break;
case AF_INET6:
- ASSERT(so->so_laddr_len == (socklen_t)sizeof (sin6_t));
+ ASSERT(sti->sti_laddr_len ==
+ (socklen_t)sizeof (sin6_t));
if (IN6_IS_ADDR_UNSPECIFIED(
- &((sin6_t *)so->so_laddr_sa) ->sin6_addr) ||
+ &((sin6_t *)sti->sti_laddr_sa) ->sin6_addr) ||
IN6_IS_ADDR_V4MAPPED_ANY(
- &((sin6_t *)so->so_laddr_sa)->sin6_addr) ||
- ((sin6_t *)so->so_laddr_sa)->sin6_port == 0)
- so->so_state &= ~SS_LADDR_VALID;
+ &((sin6_t *)sti->sti_laddr_sa)->sin6_addr) ||
+ ((sin6_t *)sti->sti_laddr_sa)->sin6_port == 0)
+ sti->sti_laddr_valid = 0;
break;
default:
@@ -2337,30 +2631,18 @@ done:
case EISCONN:
case EINTR:
/* Non-fatal errors */
- so->so_state &= ~SS_LADDR_VALID;
+ sti->sti_laddr_valid = 0;
/* FALLTHRU */
case 0:
break;
-
- case EHOSTUNREACH:
- if (flags & _SOCONNECT_XPG4_2) {
- /*
- * X/Open specification contains a requirement that
- * ENETUNREACH be returned but does not require
- * EHOSTUNREACH. In order to keep the test suite
- * happy we mess with the errno here.
- */
- error = ENETUNREACH;
- }
- /* FALLTHRU */
-
default:
ASSERT(need_unlock);
/*
* Fatal errors: clear SS_ISCONNECTING in case it was set,
* and invalidate local-address cache
*/
- so->so_state &= ~(SS_ISCONNECTING | SS_LADDR_VALID);
+ so->so_state &= ~SS_ISCONNECTING;
+ sti->sti_laddr_valid = 0;
/* A discon_ind might have already unbound us */
if ((flags & _SOCONNECT_DID_BIND) &&
(so->so_state & SS_ISBOUND)) {
@@ -2379,18 +2661,20 @@ done:
mutex_exit(&so->so_lock);
return (error);
-so_bad: error = sogeterr(so);
+so_bad: error = sogeterr(so, B_TRUE);
bad: eprintsoline(so, error);
goto done;
}
+/* ARGSUSED */
int
-sotpi_shutdown(struct sonode *so, int how)
+sotpi_shutdown(struct sonode *so, int how, struct cred *cr)
{
struct T_ordrel_req ordrel_req;
mblk_t *mp;
uint_t old_state, state_change;
int error = 0;
+ sotpi_info_t *sti = SOTOTPI(so);
dprintso(so, 1, ("sotpi_shutdown(%p, %d) %s\n",
(void *)so, how, pr_state(so->so_state, so->so_mode)));
@@ -2523,14 +2807,14 @@ sotpi_shutdown(struct sonode *so, int how)
* For SunOS 4.X compatibility we tell the other end
* that we are unable to receive at this point.
*/
- if (so->so_family == AF_UNIX && so->so_serv_type != T_CLTS)
+ if (so->so_family == AF_UNIX && sti->sti_serv_type != T_CLTS)
so_unix_close(so);
- if (so->so_serv_type == T_COTS)
+ if (sti->sti_serv_type == T_COTS)
error = sodisconnect(so, -1, _SODISCONNECT_LOCK_HELD);
}
if ((state_change & SS_CANTSENDMORE) &&
- (so->so_serv_type == T_COTS_ORD)) {
+ (sti->sti_serv_type == T_COTS_ORD)) {
/* Send an orderly release */
ordrel_req.PRIM_type = T_ORDREL_REQ;
@@ -2582,6 +2866,7 @@ so_unix_close(struct sonode *so)
int error;
struct T_opthdr toh;
mblk_t *mp;
+ sotpi_info_t *sti = SOTOTPI(so);
ASSERT(MUTEX_HELD(&so->so_lock));
@@ -2632,35 +2917,35 @@ so_unix_close(struct sonode *so)
/*
* Length and family checks.
*/
- error = so_addr_verify(so, so->so_faddr_sa,
- (t_uscalar_t)so->so_faddr_len);
+ error = so_addr_verify(so, sti->sti_faddr_sa,
+ (t_uscalar_t)sti->sti_faddr_len);
if (error) {
eprintsoline(so, error);
return;
}
- if (so->so_state & SS_FADDR_NOXLATE) {
+ if (sti->sti_faddr_noxlate) {
/*
* Already have a transport internal address. Do not
* pass any (transport internal) source address.
*/
- addr = so->so_faddr_sa;
- addrlen = (t_uscalar_t)so->so_faddr_len;
+ addr = sti->sti_faddr_sa;
+ addrlen = (t_uscalar_t)sti->sti_faddr_len;
src = NULL;
srclen = 0;
} else {
/*
* Pass the sockaddr_un source address as an option
* and translate the remote address.
- * Holding so_lock thus so_laddr_sa can not change.
+ * Holding so_lock thus sti_laddr_sa can not change.
*/
- src = so->so_laddr_sa;
- srclen = (socklen_t)so->so_laddr_len;
+ src = sti->sti_laddr_sa;
+ srclen = (socklen_t)sti->sti_laddr_len;
dprintso(so, 1,
("so_ux_close: srclen %d, src %p\n",
srclen, src));
error = so_ux_addr_xlate(so,
- so->so_faddr_sa,
- (socklen_t)so->so_faddr_len, 0,
+ sti->sti_faddr_sa,
+ (socklen_t)sti->sti_faddr_len, 0,
&addr, &addrlen);
if (error) {
eprintsoline(so, error);
@@ -2717,93 +3002,6 @@ so_unix_close(struct sonode *so)
}
/*
- * Handle recv* calls that set MSG_OOB or MSG_OOB together with MSG_PEEK.
- */
-int
-sorecvoob(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, int flags)
-{
- mblk_t *mp, *nmp;
- int error;
-
- dprintso(so, 1, ("sorecvoob(%p, %p, 0x%x)\n",
- (void *)so, (void *)msg, flags));
-
- /*
- * There is never any oob data with addresses or control since
- * the T_EXDATA_IND does not carry any options.
- */
- msg->msg_controllen = 0;
- msg->msg_namelen = 0;
-
- mutex_enter(&so->so_lock);
- ASSERT(so_verify_oobstate(so));
- if ((so->so_options & SO_OOBINLINE) ||
- (so->so_state & (SS_OOBPEND|SS_HADOOBDATA)) != SS_OOBPEND) {
- dprintso(so, 1, ("sorecvoob: inline or data consumed\n"));
- mutex_exit(&so->so_lock);
- return (EINVAL);
- }
- if (!(so->so_state & SS_HAVEOOBDATA)) {
- dprintso(so, 1, ("sorecvoob: no data yet\n"));
- mutex_exit(&so->so_lock);
- return (EWOULDBLOCK);
- }
- ASSERT(so->so_oobmsg != NULL);
- mp = so->so_oobmsg;
- if (flags & MSG_PEEK) {
- /*
- * Since recv* can not return ENOBUFS we can not use dupmsg.
- * Instead we revert to the consolidation private
- * allocb_wait plus bcopy.
- */
- mblk_t *mp1;
-
- mp1 = allocb_wait(msgdsize(mp), BPRI_MED, STR_NOSIG, NULL);
- ASSERT(mp1);
-
- while (mp != NULL) {
- ssize_t size;
-
- size = MBLKL(mp);
- bcopy(mp->b_rptr, mp1->b_wptr, size);
- mp1->b_wptr += size;
- ASSERT(mp1->b_wptr <= mp1->b_datap->db_lim);
- mp = mp->b_cont;
- }
- mp = mp1;
- } else {
- /*
- * Update the state indicating that the data has been consumed.
- * Keep SS_OOBPEND set until data is consumed past the mark.
- */
- so->so_oobmsg = NULL;
- so->so_state ^= SS_HAVEOOBDATA|SS_HADOOBDATA;
- }
- dprintso(so, 1,
- ("after recvoob(%p): counts %d/%d state %s\n",
- (void *)so, so->so_oobsigcnt,
- so->so_oobcnt, pr_state(so->so_state, so->so_mode)));
- ASSERT(so_verify_oobstate(so));
- mutex_exit(&so->so_lock);
-
- error = 0;
- nmp = mp;
- while (nmp != NULL && uiop->uio_resid > 0) {
- ssize_t n = MBLKL(nmp);
-
- n = MIN(n, uiop->uio_resid);
- if (n > 0)
- error = uiomove(nmp->b_rptr, n,
- UIO_READ, uiop);
- if (error)
- break;
- nmp = nmp->b_cont;
- }
- freemsg(mp);
- return (error);
-}
-
-/*
* Called by sotpi_recvmsg when reading a non-zero amount of data.
* In addition, the caller typically verifies that there is some
* potential state to clear by checking
@@ -2811,7 +3009,7 @@ sorecvoob(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, int flags)
* before calling this routine.
* Note that such a check can be made without holding so_lock since
* sotpi_recvmsg is single-threaded (using SOREADLOCKED) and only sotpi_recvmsg
- * decrements so_oobsigcnt.
+ * decrements sti_oobsigcnt.
*
* When data is read *after* the point that all pending
* oob data has been consumed the oob indication is cleared.
@@ -2823,13 +3021,15 @@ sorecvoob(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, int flags)
static void
sorecv_update_oobstate(struct sonode *so)
{
+ sotpi_info_t *sti = SOTOTPI(so);
+
mutex_enter(&so->so_lock);
ASSERT(so_verify_oobstate(so));
dprintso(so, 1,
("sorecv_update_oobstate: counts %d/%d state %s\n",
- so->so_oobsigcnt,
- so->so_oobcnt, pr_state(so->so_state, so->so_mode)));
- if (so->so_oobsigcnt == 0) {
+ sti->sti_oobsigcnt,
+ sti->sti_oobcnt, pr_state(so->so_state, so->so_mode)));
+ if (sti->sti_oobsigcnt == 0) {
/* No more pending oob indications */
so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
freemsg(so->so_oobmsg);
@@ -2845,10 +3045,11 @@ sorecv_update_oobstate(struct sonode *so)
static int
nl7c_sorecv(struct sonode *so, mblk_t **rmp, uio_t *uiop, rval_t *rp)
{
+ sotpi_info_t *sti = SOTOTPI(so);
int error = 0;
mblk_t *tmp = NULL;
mblk_t *pmp = NULL;
- mblk_t *nmp = so->so_nl7c_rcv_mp;
+ mblk_t *nmp = sti->sti_nl7c_rcv_mp;
ASSERT(nmp != NULL);
@@ -2889,25 +3090,24 @@ nl7c_sorecv(struct sonode *so, mblk_t **rmp, uio_t *uiop, rval_t *rp)
if (pmp != NULL) {
/* Free any mblk_t(s) which we have consumed */
pmp->b_cont = NULL;
- freemsg(so->so_nl7c_rcv_mp);
+ freemsg(sti->sti_nl7c_rcv_mp);
}
- if ((so->so_nl7c_rcv_mp = nmp) == NULL) {
+ if ((sti->sti_nl7c_rcv_mp = nmp) == NULL) {
/* Last mblk_t so return the saved kstrgetmsg() rval/error */
if (error == 0) {
- rval_t *p = (rval_t *)&so->so_nl7c_rcv_rval;
+ rval_t *p = (rval_t *)&sti->sti_nl7c_rcv_rval;
error = p->r_v.r_v2;
p->r_v.r_v2 = 0;
}
- rp->r_vals = so->so_nl7c_rcv_rval;
- so->so_nl7c_rcv_rval = 0;
+ rp->r_vals = sti->sti_nl7c_rcv_rval;
+ sti->sti_nl7c_rcv_rval = 0;
} else {
/* More mblk_t(s) to process so no rval to return */
rp->r_vals = 0;
}
return (error);
}
-
/*
* Receive the next message on the queue.
* If msg_controllen is non-zero when called the caller is interested in
@@ -2917,8 +3117,10 @@ nl7c_sorecv(struct sonode *so, mblk_t **rmp, uio_t *uiop, rval_t *rp)
* The routine returns with msg_control and msg_name pointing to
* kmem_alloc'ed memory which the caller has to free.
*/
+/* ARGSUSED */
int
-sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
+sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
+ struct cred *cr)
{
union T_primitives *tpr;
mblk_t *mp;
@@ -2932,10 +3134,10 @@ sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
rval_t rval;
int flags;
clock_t timout;
- int first;
int error = 0;
+ int reterr = 0;
struct uio *suiop = NULL;
- sodirect_t *sodp = so->so_direct;
+ sotpi_info_t *sti = SOTOTPI(so);
flags = msg->msg_flags;
msg->msg_flags = 0;
@@ -2944,6 +3146,12 @@ sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
(void *)so, (void *)msg, flags,
pr_state(so->so_state, so->so_mode), so->so_error));
+ if (so->so_version == SOV_STREAM) {
+ so_update_attrs(so, SOACC);
+ /* The imaginary "sockmod" has been popped - act as a stream */
+ return (strread(SOTOV(so), uiop, cr));
+ }
+
/*
* If we are not connected because we have never been connected
* we return ENOTCONN. If we have been connected (but are no longer
@@ -2970,9 +3178,13 @@ sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
/* Check that the transport supports OOB */
if (!(so->so_mode & SM_EXDATA))
return (EOPNOTSUPP);
- return (sorecvoob(so, msg, uiop, flags));
+ so_update_attrs(so, SOACC);
+ return (sorecvoob(so, msg, uiop, flags,
+ (so->so_options & SO_OOBINLINE)));
}
+ so_update_attrs(so, SOACC);
+
/*
* Set msg_controllen and msg_namelen to zero here to make it
* simpler in the cases that no control or name is returned.
@@ -2989,31 +3201,32 @@ sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
/*
* If an NL7C enabled socket and not waiting for write data.
*/
- if ((so->so_nl7c_flags & (NL7C_ENABLED | NL7C_WAITWRITE)) ==
+ if ((sti->sti_nl7c_flags & (NL7C_ENABLED | NL7C_WAITWRITE)) ==
NL7C_ENABLED) {
- if (so->so_nl7c_uri) {
+ if (sti->sti_nl7c_uri) {
/* Close uri processing for a previous request */
nl7c_close(so);
}
- if ((so_state & SS_CANTRCVMORE) && so->so_nl7c_rcv_mp == NULL) {
+ if ((so_state & SS_CANTRCVMORE) &&
+ sti->sti_nl7c_rcv_mp == NULL) {
/* Nothing to process, EOF */
mutex_exit(&so->so_lock);
return (0);
- } else if (so->so_nl7c_flags & NL7C_SOPERSIST) {
+ } else if (sti->sti_nl7c_flags & NL7C_SOPERSIST) {
/* Persistent NL7C socket, try to process request */
boolean_t ret;
ret = nl7c_process(so,
(so->so_state & (SS_NONBLOCK|SS_NDELAY)));
- rval.r_vals = so->so_nl7c_rcv_rval;
+ rval.r_vals = sti->sti_nl7c_rcv_rval;
error = rval.r_v.r_v2;
if (error) {
/* Error of some sort, return it */
mutex_exit(&so->so_lock);
return (error);
}
- if (so->so_nl7c_flags &&
- ! (so->so_nl7c_flags & NL7C_WAITWRITE)) {
+ if (sti->sti_nl7c_flags &&
+ ! (sti->sti_nl7c_flags & NL7C_WAITWRITE)) {
/*
* Still an NL7C socket and no data
* to pass up to the caller.
@@ -3031,7 +3244,7 @@ sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
/*
* Not persistent so no further NL7C processing.
*/
- so->so_nl7c_flags = 0;
+ sti->sti_nl7c_flags = 0;
}
}
/*
@@ -3081,84 +3294,23 @@ sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
else
timout = -1;
opflag = pflag;
- first = 1;
- if (uiop->uio_resid >= uioasync.mincnt &&
- sodp != NULL && (sodp->sod_state & SOD_ENABLED) &&
- uioasync.enabled && !(flags & MSG_PEEK) &&
- !(so_state & SS_CANTRCVMORE)) {
- /*
- * Big enough I/O for uioa min setup and an sodirect socket
- * and sodirect enabled and uioa enabled and I/O will be done
- * and not EOF so initialize the sodirect_t uioa_t with "uiop".
- */
- mutex_enter(sodp->sod_lockp);
- if (!uioainit(uiop, &sodp->sod_uioa)) {
- /*
- * Successful uioainit() so the uio_t part of the
- * uioa_t will be used for all uio_t work to follow,
- * we save the original "uiop" in "suiop".
- */
- suiop = uiop;
- uiop = (uio_t *)&sodp->sod_uioa;
- /*
- * Before returning to the caller the passed in uio_t
- * "uiop" will be updated via a call to uioafini()
- * below.
- *
- * Note, the uioa.uioa_state isn't set to UIOA_ENABLED
- * here as first we have to uioamove() any currently
- * queued M_DATA mblk_t(s) so it will be done in
- * kstrgetmsg().
- */
- }
- /*
- * In either uioainit() success or not case note the number
- * of uio bytes the caller wants for sod framework and/or
- * transport (e.g. TCP) strategy.
- */
- sodp->sod_want = uiop->uio_resid;
- mutex_exit(sodp->sod_lockp);
- } else if (sodp != NULL && (sodp->sod_state & SOD_ENABLED)) {
- /*
- * No uioa but still using sodirect so note the number of
- * uio bytes the caller wants for sodirect framework and/or
- * transport (e.g. TCP) strategy.
- *
- * Note, sod_lockp not held, only writer is in this function
- * and only one thread at a time so not needed just to init.
- */
- sodp->sod_want = uiop->uio_resid;
- }
+ suiop = sod_rcv_init(so, flags, &uiop);
retry:
saved_resid = uiop->uio_resid;
pri = 0;
mp = NULL;
- if (so->so_nl7c_rcv_mp != NULL) {
+ if (sti->sti_nl7c_rcv_mp != NULL) {
/* Already kstrgetmsg()ed saved mblk(s) from NL7C */
error = nl7c_sorecv(so, &mp, uiop, &rval);
} else {
error = kstrgetmsg(SOTOV(so), &mp, uiop, &pri, &pflag,
timout, &rval);
}
- if (error) {
- switch (error) {
- case EINTR:
- case EWOULDBLOCK:
- if (!first)
- error = 0;
- break;
- case ETIME:
- /* Returned from kstrgetmsg when timeout expires */
- if (!first)
- error = 0;
- else
- error = EWOULDBLOCK;
- break;
- default:
- eprintsoline(so, error);
- break;
- }
+ if (error != 0) {
+ /* kstrgetmsg returns ETIME when timeout expires */
+ if (error == ETIME)
+ error = EWOULDBLOCK;
goto out;
}
/*
@@ -3198,7 +3350,6 @@ retry:
if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
mutex_exit(&so->so_lock);
- first = 0;
pflag = opflag | MSG_NOMARK;
goto retry;
}
@@ -3238,7 +3389,6 @@ retry:
if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
mutex_exit(&so->so_lock);
- first = 0;
pflag = opflag | MSG_NOMARK;
goto retry;
}
@@ -3436,7 +3586,6 @@ retry:
controllen == 0 &&
uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
mutex_exit(&so->so_lock);
- first = 0;
pflag = opflag | MSG_NOMARK;
goto retry;
}
@@ -3446,7 +3595,7 @@ retry:
dprintso(so, 1,
("sotpi_recvmsg: EXDATA_IND counts %d/%d consumed %ld "
"state %s\n",
- so->so_oobsigcnt, so->so_oobcnt,
+ sti->sti_oobsigcnt, sti->sti_oobcnt,
saved_resid - uiop->uio_resid,
pr_state(so->so_state, so->so_mode)));
/*
@@ -3476,8 +3625,8 @@ retry:
dprintso(so, 1,
("sotpi_recvmsg: consume EXDATA_IND "
"counts %d/%d state %s\n",
- so->so_oobsigcnt,
- so->so_oobcnt,
+ sti->sti_oobsigcnt,
+ sti->sti_oobcnt,
pr_state(so->so_state, so->so_mode)));
pflag = MSG_ANY | MSG_DELAYERROR;
@@ -3516,11 +3665,11 @@ retry:
*/
mutex_enter(&so->so_lock);
ASSERT(so_verify_oobstate(so));
- ASSERT(so->so_oobsigcnt >= so->so_oobcnt);
- ASSERT(so->so_oobsigcnt > 0);
- so->so_oobsigcnt--;
- ASSERT(so->so_oobcnt > 0);
- so->so_oobcnt--;
+ ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
+ ASSERT(sti->sti_oobsigcnt > 0);
+ sti->sti_oobsigcnt--;
+ ASSERT(sti->sti_oobcnt > 0);
+ sti->sti_oobcnt--;
/*
* Since the T_EXDATA_IND has been removed from the stream
* head, but we have not read data past the mark,
@@ -3533,12 +3682,14 @@ retry:
mutex_exit(&so->so_lock);
dprintso(so, 1,
("sotpi_recvmsg: retry EXDATA_IND counts %d/%d state %s\n",
- so->so_oobsigcnt, so->so_oobcnt,
+ sti->sti_oobsigcnt, sti->sti_oobcnt,
pr_state(so->so_state, so->so_mode)));
pflag = opflag;
goto retry;
}
default:
+ cmn_err(CE_CONT, "sotpi_recvmsg: so %p prim %d mp %p\n",
+ (void *)so, tpr->type, (void *)mp);
ASSERT(0);
freemsg(mp);
error = EPROTO;
@@ -3549,35 +3700,13 @@ retry:
out:
mutex_enter(&so->so_lock);
out_locked:
- if (sodp != NULL) {
- /* Finish any sodirect and uioa processing */
- mutex_enter(sodp->sod_lockp);
- if (suiop != NULL) {
- /* Finish any uioa_t processing */
- int ret;
-
- ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
- ret = uioafini(suiop, (uioa_t *)uiop);
- if (error == 0 && ret != 0) {
- /* If no error yet, set it */
- error = ret;
- }
- if ((mp = sodp->sod_uioafh) != NULL) {
- sodp->sod_uioafh = NULL;
- sodp->sod_uioaft = NULL;
- freemsg(mp);
- }
- }
- ASSERT(sodp->sod_uioafh == NULL);
- if (!(sodp->sod_state & SOD_WAKE_NOT)) {
- /* Awoke */
- sodp->sod_state &= SOD_WAKE_CLR;
- sodp->sod_state |= SOD_WAKE_NOT;
- }
- /* Last, clear sod_want value */
- sodp->sod_want = 0;
- mutex_exit(sodp->sod_lockp);
+ if (so->so_direct != NULL) {
+ mutex_enter(so->so_direct->sod_lockp);
+ reterr = sod_rcv_done(so, suiop, uiop);
+ mutex_exit(so->so_direct->sod_lockp);
}
+ if (reterr != 0 && error == 0)
+ error = reterr;
so_unlock_read(so); /* Clear SOREADLOCKED */
mutex_exit(&so->so_lock);
return (error);
@@ -3605,12 +3734,13 @@ sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
t_uscalar_t optlen;
void *fds;
int fdlen;
+ sotpi_info_t *sti = SOTOTPI(so);
ASSERT(name && namelen);
ASSERT(control && controllen);
len = uiop->uio_resid;
- if (len > (ssize_t)so->so_tidu_size) {
+ if (len > (ssize_t)sti->sti_tidu_size) {
return (EMSGSIZE);
}
@@ -3630,7 +3760,7 @@ sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
return (error);
}
if (so->so_family == AF_UNIX) {
- if (so->so_state & SS_FADDR_NOXLATE) {
+ if (sti->sti_faddr_noxlate) {
/*
* Already have a transport internal address. Do not
* pass any (transport internal) source address.
@@ -3644,14 +3774,14 @@ sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
* Pass the sockaddr_un source address as an option
* and translate the remote address.
*
- * Note that this code does not prevent so_laddr_sa
+ * Note that this code does not prevent sti_laddr_sa
* from changing while it is being used. Thus
* if an unbind+bind occurs concurrently with this
* send the peer might see a partially new and a
* partially old "from" address.
*/
- src = so->so_laddr_sa;
- srclen = (t_uscalar_t)so->so_laddr_len;
+ src = sti->sti_laddr_sa;
+ srclen = (t_uscalar_t)sti->sti_laddr_len;
dprintso(so, 1,
("sosend_dgramcmsg UNIX: srclen %d, src %p\n",
srclen, src));
@@ -3762,24 +3892,20 @@ sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
* Assumes caller has verified that SS_ISCONNECTED is set.
*/
static int
-sosend_svccmsg(struct sonode *so,
- struct uio *uiop,
- int more,
- void *control,
- t_uscalar_t controllen,
- int flags)
+sosend_svccmsg(struct sonode *so, struct uio *uiop, int more, void *control,
+ t_uscalar_t controllen, int flags)
{
struct T_optdata_req tdr;
mblk_t *mp;
int error;
ssize_t iosize;
- int first = 1;
int size;
struct fdbuf *fdbuf;
t_uscalar_t optlen;
void *fds;
int fdlen;
struct T_opthdr toh;
+ sotpi_info_t *sti = SOTOTPI(so);
dprintso(so, 1,
("sosend_svccmsg: resid %ld bytes\n", uiop->uio_resid));
@@ -3801,7 +3927,7 @@ sosend_svccmsg(struct sonode *so,
* Error for transports with zero tidu_size.
*/
tdr.PRIM_type = T_OPTDATA_REQ;
- iosize = so->so_tidu_size;
+ iosize = sti->sti_tidu_size;
if (iosize <= 0)
return (EMSGSIZE);
if (uiop->uio_resid > iosize) {
@@ -3843,7 +3969,7 @@ sosend_svccmsg(struct sonode *so,
* Caught a signal waiting for memory.
* Let send* return EINTR.
*/
- return (first ? EINTR : 0);
+ return (EINTR);
}
}
soappendmsg(mp, &tdr, sizeof (tdr));
@@ -3869,13 +3995,10 @@ sosend_svccmsg(struct sonode *so,
error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
0, MSG_BAND, 0);
if (error) {
- if (!first && error == EWOULDBLOCK)
- return (0);
eprintsoline(so, error);
return (error);
}
control = NULL;
- first = 0;
if (uiop->uio_resid > 0) {
/*
* Recheck for fatal errors. Fail write even though
@@ -3883,13 +4006,12 @@ sosend_svccmsg(struct sonode *so,
* with strwrite semantics and BSD sockets semantics.
*/
if (so->so_state & SS_CANTSENDMORE) {
- tsignal(curthread, SIGPIPE);
eprintsoline(so, error);
return (EPIPE);
}
if (so->so_error != 0) {
mutex_enter(&so->so_lock);
- error = sogeterr(so);
+ error = sogeterr(so, B_TRUE);
mutex_exit(&so->so_lock);
if (error != 0) {
eprintsoline(so, error);
@@ -3920,11 +4042,12 @@ sosend_dgram(struct sonode *so, struct sockaddr *name, socklen_t namelen,
void *src;
socklen_t srclen;
ssize_t len;
+ sotpi_info_t *sti = SOTOTPI(so);
ASSERT(name != NULL && namelen != 0);
len = uiop->uio_resid;
- if (len > so->so_tidu_size) {
+ if (len > sti->sti_tidu_size) {
error = EMSGSIZE;
goto done;
}
@@ -3934,11 +4057,11 @@ sosend_dgram(struct sonode *so, struct sockaddr *name, socklen_t namelen,
if (error != 0)
goto done;
- if (so->so_state & SS_DIRECT)
+ if (sti->sti_direct)
return (sodgram_direct(so, name, namelen, uiop, flags));
if (so->so_family == AF_UNIX) {
- if (so->so_state & SS_FADDR_NOXLATE) {
+ if (sti->sti_faddr_noxlate) {
/*
* Already have a transport internal address. Do not
* pass any (transport internal) source address.
@@ -3952,14 +4075,14 @@ sosend_dgram(struct sonode *so, struct sockaddr *name, socklen_t namelen,
* Pass the sockaddr_un source address as an option
* and translate the remote address.
*
- * Note that this code does not prevent so_laddr_sa
+ * Note that this code does not prevent sti_laddr_sa
* from changing while it is being used. Thus
* if an unbind+bind occurs concurrently with this
* send the peer might see a partially new and a
* partially old "from" address.
*/
- src = so->so_laddr_sa;
- srclen = (socklen_t)so->so_laddr_len;
+ src = sti->sti_laddr_sa;
+ srclen = (socklen_t)sti->sti_laddr_len;
dprintso(so, 1,
("sosend_dgram UNIX: srclen %d, src %p\n",
srclen, src));
@@ -4048,17 +4171,14 @@ done:
* Assumes caller has verified that SS_ISCONNECTED is set.
*/
int
-sosend_svc(struct sonode *so,
- struct uio *uiop,
- t_scalar_t prim,
- int more,
- int sflag)
+sosend_svc(struct sonode *so, struct uio *uiop, t_scalar_t prim, int more,
+ int sflag)
{
struct T_data_req tdr;
mblk_t *mp;
int error;
ssize_t iosize;
- int first = 1;
+ sotpi_info_t *sti = SOTOTPI(so);
dprintso(so, 1,
("sosend_svc: %p, resid %ld bytes, prim %d, sflag 0x%x\n",
@@ -4077,7 +4197,7 @@ sosend_svc(struct sonode *so,
* Error for transports with zero tidu_size.
*/
tdr.PRIM_type = prim;
- iosize = so->so_tidu_size;
+ iosize = sti->sti_tidu_size;
if (iosize <= 0)
return (EMSGSIZE);
if (uiop->uio_resid > iosize) {
@@ -4097,21 +4217,15 @@ sosend_svc(struct sonode *so,
* Caught a signal waiting for memory.
* Let send* return EINTR.
*/
- if (first)
- return (EINTR);
- else
- return (0);
+ return (EINTR);
}
error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
0, sflag | MSG_BAND, 0);
if (error) {
- if (!first && error == EWOULDBLOCK)
- return (0);
eprintsoline(so, error);
return (error);
}
- first = 0;
if (uiop->uio_resid > 0) {
/*
* Recheck for fatal errors. Fail write even though
@@ -4119,13 +4233,12 @@ sosend_svc(struct sonode *so,
* with strwrite semantics and BSD sockets semantics.
*/
if (so->so_state & SS_CANTSENDMORE) {
- tsignal(curthread, SIGPIPE);
eprintsoline(so, error);
return (EPIPE);
}
if (so->so_error != 0) {
mutex_enter(&so->so_lock);
- error = sogeterr(so);
+ error = sogeterr(so, B_TRUE);
mutex_exit(&so->so_lock);
if (error != 0) {
eprintsoline(so, error);
@@ -4145,7 +4258,8 @@ sosend_svc(struct sonode *so,
* after sending the message.
*/
static int
-sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
+sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
+ struct cred *cr)
{
int so_state;
int so_mode;
@@ -4154,22 +4268,28 @@ sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
t_uscalar_t namelen;
int dontroute;
int flags;
+ sotpi_info_t *sti = SOTOTPI(so);
dprintso(so, 1, ("sotpi_sendmsg(%p, %p, 0x%x) state %s, error %d\n",
(void *)so, (void *)msg, msg->msg_flags,
pr_state(so->so_state, so->so_mode), so->so_error));
+ if (so->so_version == SOV_STREAM) {
+ /* The imaginary "sockmod" has been popped - act as a stream */
+ so_update_attrs(so, SOMOD);
+ return (strwrite(SOTOV(so), uiop, cr));
+ }
+
mutex_enter(&so->so_lock);
so_state = so->so_state;
if (so_state & SS_CANTSENDMORE) {
mutex_exit(&so->so_lock);
- tsignal(curthread, SIGPIPE);
return (EPIPE);
}
if (so->so_error != 0) {
- error = sogeterr(so);
+ error = sogeterr(so, B_TRUE);
if (error != 0) {
mutex_exit(&so->so_lock);
return (error);
@@ -4194,15 +4314,15 @@ sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
namelen = 0;
} else {
/*
- * Note that this code does not prevent so_faddr_sa
+ * Note that this code does not prevent sti_faddr_sa
* from changing while it is being used. Thus
* if an "unconnect"+connect occurs concurrently with
* this send the datagram might be delivered to a
* garbled address.
*/
- ASSERT(so->so_faddr_sa);
- name = so->so_faddr_sa;
- namelen = (t_uscalar_t)so->so_faddr_len;
+ ASSERT(sti->sti_faddr_sa);
+ name = sti->sti_faddr_sa;
+ namelen = (t_uscalar_t)sti->sti_faddr_len;
}
} else {
if (!(so_state & SS_ISCONNECTED) &&
@@ -4227,7 +4347,7 @@ sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
if (!(so_state & SS_ISBOUND)) {
so_lock_single(so); /* Set SOLOCKED */
error = sotpi_bind(so, NULL, 0,
- _SOBIND_UNSPEC|_SOBIND_LOCK_HELD);
+ _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
so_unlock_single(so, SOLOCKED);
if (error) {
mutex_exit(&so->so_lock);
@@ -4243,20 +4363,20 @@ sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
* If sending to some other address discard the delayed
* error indication.
*/
- if (so->so_delayed_error) {
+ if (sti->sti_delayed_error) {
struct T_uderror_ind *tudi;
void *addr;
t_uscalar_t addrlen;
boolean_t match = B_FALSE;
- ASSERT(so->so_eaddr_mp);
- error = so->so_delayed_error;
- so->so_delayed_error = 0;
- tudi = (struct T_uderror_ind *)so->so_eaddr_mp->b_rptr;
+ ASSERT(sti->sti_eaddr_mp);
+ error = sti->sti_delayed_error;
+ sti->sti_delayed_error = 0;
+ tudi =
+ (struct T_uderror_ind *)sti->sti_eaddr_mp->b_rptr;
addrlen = tudi->DEST_length;
- addr = sogetoff(so->so_eaddr_mp,
- tudi->DEST_offset,
- addrlen, 1);
+ addr = sogetoff(sti->sti_eaddr_mp,
+ tudi->DEST_offset, addrlen, 1);
ASSERT(addr); /* Checked by strsock_proto */
switch (so->so_family) {
case AF_INET: {
@@ -4292,8 +4412,8 @@ sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
match = B_TRUE;
}
if (match) {
- freemsg(so->so_eaddr_mp);
- so->so_eaddr_mp = NULL;
+ freemsg(sti->sti_eaddr_mp);
+ sti->sti_eaddr_mp = NULL;
mutex_exit(&so->so_lock);
#ifdef DEBUG
dprintso(so, 0,
@@ -4303,8 +4423,8 @@ sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
#endif /* DEBUG */
return (error);
}
- freemsg(so->so_eaddr_mp);
- so->so_eaddr_mp = NULL;
+ freemsg(sti->sti_eaddr_mp);
+ sti->sti_eaddr_mp = NULL;
}
}
mutex_exit(&so->so_lock);
@@ -4316,7 +4436,7 @@ sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
val = 1;
error = sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
- &val, (t_uscalar_t)sizeof (val));
+ &val, (t_uscalar_t)sizeof (val), cr);
if (error)
return (error);
dontroute = 1;
@@ -4328,6 +4448,7 @@ sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
}
if (msg->msg_controllen != 0) {
if (!(so_mode & SM_CONNREQUIRED)) {
+ so_update_attrs(so, SOMOD);
error = sosend_dgramcmsg(so, name, namelen, uiop,
msg->msg_control, msg->msg_controllen, flags);
} else {
@@ -4336,6 +4457,7 @@ sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
error = EOPNOTSUPP;
goto done;
}
+ so_update_attrs(so, SOMOD);
error = sosend_svccmsg(so, uiop,
!(flags & MSG_EOR),
msg->msg_control, msg->msg_controllen,
@@ -4344,6 +4466,7 @@ sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
goto done;
}
+ so_update_attrs(so, SOMOD);
if (!(so_mode & SM_CONNREQUIRED)) {
/*
* If there is no SO_DONTROUTE to turn off return immediately
@@ -4368,20 +4491,25 @@ sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
} else {
if (so_mode & SM_BYTESTREAM) {
/* Byte stream transport - use write */
-
dprintso(so, 1, ("sotpi_sendmsg: write\n"));
+
+ /* Send M_DATA messages */
+ if ((sti->sti_nl7c_flags & NL7C_ENABLED) &&
+ (error = nl7c_data(so, uiop)) >= 0) {
+ /* NL7C consumed the data */
+ return (error);
+ }
/*
* If there is no SO_DONTROUTE to turn off,
- * SS_DIRECT is on, and there is no flow
+ * sti_direct is on, and there is no flow
* control, we can take the fast path.
*/
- if (!dontroute &&
- (so_state & SS_DIRECT) &&
+ if (!dontroute && sti->sti_direct != 0 &&
canputnext(SOTOV(so)->v_stream->sd_wrq)) {
return (sostream_direct(so, uiop,
- NULL, CRED()));
+ NULL, cr));
}
- error = strwrite(SOTOV(so), uiop, CRED());
+ error = strwrite(SOTOV(so), uiop, cr);
goto done;
}
prim = T_DATA_REQ;
@@ -4404,12 +4532,129 @@ done:
val = 0;
(void) sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
- &val, (t_uscalar_t)sizeof (val));
+ &val, (t_uscalar_t)sizeof (val), cr);
}
return (error);
}
/*
+ * kstrwritemp() has semantics very similar to those of strwrite().
+ * The main difference is that it obtains mblks from the caller and
+ * does not do the copy from user buffers to kernel buffers that
+ * strwrite() does.
+ *
+ * Currently, this routine is used by sendfile to send data allocated
+ * within the kernel without any copying. This interface does not use the
+ * synchronous stream interface as synch. stream interface implies
+ * copying.
+ *
+ * Arguments:
+ *	vp	vnode of the socket; it must have a stream attached
+ *		(asserted below).
+ *	mp	message to pass downstream.
+ *	fmode	file mode flags handed to strwaitq(); FNDELAY is masked
+ *		off when the stream has OLDNDELAY set.
+ *
+ * Returns 0 once mp has been handed to putnext(), or the result of
+ * sostream_direct() when the direct fast path is taken. Otherwise an
+ * errno value is returned:
+ *	- the error reported by strgeterr() for STWRERR/STRHUP/STPLEX,
+ *	  replaced by EPIPE when the stream is not STPLEX and SW_SIGPIPE
+ *	  is set in sd_wput_opt;
+ *	- ECOMM when sd_wputdatafunc returns NULL (mp is left for the
+ *	  caller to free);
+ *	- the error from strwaitq(), with ENOMEM mapped to EAGAIN.
+ *
+ * NOTE(review): if strwaitq() ever sets done without returning an error,
+ * the loop exits and 0 is returned although mp was never sent - confirm
+ * that this cannot happen for WRITEWAIT.
+ */
+int
+kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
+{
+ struct stdata *stp;
+ struct queue *wqp;
+ mblk_t *newmp;
+ char waitflag;
+ int tempmode;
+ int error = 0;
+ int done = 0;
+ struct sonode *so;
+ boolean_t direct;
+
+ ASSERT(vp->v_stream);
+ stp = vp->v_stream;
+
+ so = VTOSO(vp);
+ direct = _SOTOTPI(so)->sti_direct;
+
+ /*
+ * This is the sockfs direct fast path. canputnext() need
+ * not be accurate so we don't grab the sd_lock here. If
+ * we get flow-controlled, we grab sd_lock just before the
+ * do..while loop below to emulate what strwrite() does.
+ */
+ wqp = stp->sd_wrq;
+ if (canputnext(wqp) && direct &&
+ !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
+ return (sostream_direct(so, NULL, mp, CRED()));
+ } else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
+ /* Fast check of flags before acquiring the lock */
+ mutex_enter(&stp->sd_lock);
+ error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
+ mutex_exit(&stp->sd_lock);
+ if (error != 0) {
+ if (!(stp->sd_flag & STPLEX) &&
+ (stp->sd_wput_opt & SW_SIGPIPE)) {
+ error = EPIPE;
+ }
+ return (error);
+ }
+ }
+
+ waitflag = WRITEWAIT;
+ if (stp->sd_flag & OLDNDELAY)
+ tempmode = fmode & ~FNDELAY;
+ else
+ tempmode = fmode;
+
+ mutex_enter(&stp->sd_lock);
+ do {
+ if (canputnext(wqp)) {
+ mutex_exit(&stp->sd_lock);
+ if (stp->sd_wputdatafunc != NULL) {
+ newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
+ NULL, NULL, NULL);
+ if (newmp == NULL) {
+ /* The caller will free mp */
+ return (ECOMM);
+ }
+ mp = newmp;
+ }
+ putnext(wqp, mp);
+ return (0);
+ }
+ error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
+ &done);
+ } while (error == 0 && !done);
+
+ mutex_exit(&stp->sd_lock);
+ /*
+ * EAGAIN tells the application to try again. ENOMEM
+ * is returned only if the memory allocation size
+ * exceeds the physical limits of the system. ENOMEM
+ * can't be true here.
+ */
+ if (error == ENOMEM)
+ error = EAGAIN;
+ return (error);
+}
+
+/*
+ * Send the caller-supplied message *mpp on a TPI socket by handing it to
+ * kstrwritemp(), using fflag as the file mode.
+ *
+ * The checks are made in this order, without taking so_lock:
+ *	EAFNOSUPPORT	the family is neither AF_INET nor AF_INET6
+ *	EPIPE		SS_CANTSENDMORE is set
+ *	EOPNOTSUPP	the socket type is not SOCK_STREAM
+ *	ENOTCONN	SS_ISCONNECTED is not set
+ * Otherwise the result of kstrwritemp() is returned. On success *mpp is
+ * set to NULL; on any failure *mpp is left unchanged. msg and cr are
+ * not used.
+ */
+/* ARGSUSED */
+static int
+sotpi_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
+ struct cred *cr, mblk_t **mpp)
+{
+ int error;
+
+ if (so->so_family != AF_INET && so->so_family != AF_INET6)
+ return (EAFNOSUPPORT);
+
+ if (so->so_state & SS_CANTSENDMORE)
+ return (EPIPE);
+
+ if (so->so_type != SOCK_STREAM)
+ return (EOPNOTSUPP);
+
+ if ((so->so_state & SS_ISCONNECTED) == 0)
+ return (ENOTCONN);
+
+ error = kstrwritemp(so->so_vnode, *mpp, fflag);
+ if (error == 0)
+ *mpp = NULL;
+ return (error);
+}
+
+/*
* Sending data on a datagram socket.
* Assumes caller has verified that SS_ISBOUND etc. are set.
*/
@@ -4429,6 +4674,7 @@ sodgram_direct(struct sonode *so, struct sockaddr *name,
queue_t *udp_wq;
boolean_t connected;
mblk_t *mpdata = NULL;
+ sotpi_info_t *sti = SOTOTPI(so);
ASSERT(name != NULL && namelen != 0);
ASSERT(!(so->so_mode & SM_CONNREQUIRED));
@@ -4438,7 +4684,7 @@ sodgram_direct(struct sonode *so, struct sockaddr *name,
/* Caller checked for proper length */
len = uiop->uio_resid;
- ASSERT(len <= so->so_tidu_size);
+ ASSERT(len <= sti->sti_tidu_size);
/* Length and family checks have been done by caller */
ASSERT(name->sa_family == so->so_family);
@@ -4640,22 +4886,34 @@ slow_send:
}
/*
- * Update so_faddr by asking the transport (unless AF_UNIX).
+ * Update sti_faddr by asking the transport (unless AF_UNIX).
*/
+/* ARGSUSED */
int
-sotpi_getpeername(struct sonode *so)
+sotpi_getpeername(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
+ boolean_t accept, struct cred *cr)
{
struct strbuf strbuf;
int error = 0, res;
void *addr;
t_uscalar_t addrlen;
k_sigset_t smask;
+ sotpi_info_t *sti = SOTOTPI(so);
dprintso(so, 1, ("sotpi_getpeername(%p) %s\n",
(void *)so, pr_state(so->so_state, so->so_mode)));
+ ASSERT(*namelen > 0);
mutex_enter(&so->so_lock);
so_lock_single(so); /* Set SOLOCKED */
+
+ if (accept) {
+ bcopy(sti->sti_faddr_sa, name,
+ MIN(*namelen, sti->sti_faddr_len));
+ *namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
+ goto done;
+ }
+
if (!(so->so_state & SS_ISCONNECTED)) {
error = ENOTCONN;
goto done;
@@ -4668,27 +4926,39 @@ sotpi_getpeername(struct sonode *so)
}
goto done;
}
+
+ if (sti->sti_faddr_valid) {
+ bcopy(sti->sti_faddr_sa, name,
+ MIN(*namelen, sti->sti_faddr_len));
+ *namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
+ goto done;
+ }
+
#ifdef DEBUG
dprintso(so, 1, ("sotpi_getpeername (local): %s\n",
- pr_addr(so->so_family, so->so_faddr_sa,
- (t_uscalar_t)so->so_faddr_len)));
+ pr_addr(so->so_family, sti->sti_faddr_sa,
+ (t_uscalar_t)sti->sti_faddr_len)));
#endif /* DEBUG */
if (so->so_family == AF_UNIX) {
/* Transport has different name space - return local info */
+ if (sti->sti_faddr_noxlate)
+ *namelen = 0;
error = 0;
goto done;
}
- ASSERT(so->so_faddr_sa);
+ ASSERT(so->so_family != AF_UNIX && sti->sti_faddr_noxlate == 0);
+
+ ASSERT(sti->sti_faddr_sa);
/* Allocate local buffer to use with ioctl */
- addrlen = (t_uscalar_t)so->so_faddr_maxlen;
+ addrlen = (t_uscalar_t)sti->sti_faddr_maxlen;
mutex_exit(&so->so_lock);
addr = kmem_alloc(addrlen, KM_SLEEP);
/*
* Issue TI_GETPEERNAME with signals masked.
- * Put the result in so_faddr_sa so that getpeername works after
+ * Put the result in sti_faddr_sa so that getpeername works after
* a shutdown(output).
* If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
* back to the socket.
@@ -4699,16 +4969,16 @@ sotpi_getpeername(struct sonode *so)
sigintr(&smask, 0);
res = 0;
- ASSERT(CRED());
+ ASSERT(cr);
error = strioctl(SOTOV(so), TI_GETPEERNAME, (intptr_t)&strbuf,
- 0, K_TO_K, CRED(), &res);
+ 0, K_TO_K, cr, &res);
sigunintr(&smask);
mutex_enter(&so->so_lock);
/*
* If there is an error record the error in so_error put don't fail
* the getpeername. Instead fallback on the recorded
- * so->so_faddr_sa.
+ * sti->sti_faddr_sa.
*/
if (error) {
/*
@@ -4732,16 +5002,19 @@ sotpi_getpeername(struct sonode *so)
error = 0;
} else if (res == 0 && strbuf.len > 0 &&
(so->so_state & SS_ISCONNECTED)) {
- ASSERT(strbuf.len <= (int)so->so_faddr_maxlen);
- so->so_faddr_len = (socklen_t)strbuf.len;
- bcopy(addr, so->so_faddr_sa, so->so_faddr_len);
- so->so_state |= SS_FADDR_VALID;
+ ASSERT(strbuf.len <= (int)sti->sti_faddr_maxlen);
+ sti->sti_faddr_len = (socklen_t)strbuf.len;
+ bcopy(addr, sti->sti_faddr_sa, sti->sti_faddr_len);
+ sti->sti_faddr_valid = 1;
+
+ bcopy(addr, name, MIN(*namelen, sti->sti_faddr_len));
+ *namelen = sti->sti_faddr_len;
}
kmem_free(addr, addrlen);
#ifdef DEBUG
dprintso(so, 1, ("sotpi_getpeername (tp): %s\n",
- pr_addr(so->so_family, so->so_faddr_sa,
- (t_uscalar_t)so->so_faddr_len)));
+ pr_addr(so->so_family, sti->sti_faddr_sa,
+ (t_uscalar_t)sti->sti_faddr_len)));
#endif /* DEBUG */
done:
so_unlock_single(so, SOLOCKED);
@@ -4750,42 +5023,39 @@ done:
}
/*
- * Update so_laddr by asking the transport (unless AF_UNIX).
+ * Update sti_laddr by asking the transport (unless AF_UNIX).
*/
int
-sotpi_getsockname(struct sonode *so)
+sotpi_getsockname(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
+ struct cred *cr)
{
struct strbuf strbuf;
int error = 0, res;
void *addr;
t_uscalar_t addrlen;
k_sigset_t smask;
+ sotpi_info_t *sti = SOTOTPI(so);
dprintso(so, 1, ("sotpi_getsockname(%p) %s\n",
(void *)so, pr_state(so->so_state, so->so_mode)));
+ ASSERT(*namelen > 0);
mutex_enter(&so->so_lock);
so_lock_single(so); /* Set SOLOCKED */
- if (!(so->so_state & SS_ISBOUND) && so->so_family != AF_UNIX) {
- /* Return an all zero address except for the family */
- if (so->so_family == AF_INET)
- so->so_laddr_len = (socklen_t)sizeof (sin_t);
- else if (so->so_family == AF_INET6)
- so->so_laddr_len = (socklen_t)sizeof (sin6_t);
- ASSERT(so->so_laddr_len <= so->so_laddr_maxlen);
- bzero(so->so_laddr_sa, so->so_laddr_len);
- /*
- * Can not assume there is a sa_family for all
- * protocol families.
- */
- if (so->so_family == AF_INET || so->so_family == AF_INET6)
- so->so_laddr_sa->sa_family = so->so_family;
- }
+
#ifdef DEBUG
+
dprintso(so, 1, ("sotpi_getsockname (local): %s\n",
- pr_addr(so->so_family, so->so_laddr_sa,
- (t_uscalar_t)so->so_laddr_len)));
+ pr_addr(so->so_family, sti->sti_laddr_sa,
+ (t_uscalar_t)sti->sti_laddr_len)));
#endif /* DEBUG */
+ if (sti->sti_laddr_valid) {
+ bcopy(sti->sti_laddr_sa, name,
+ MIN(*namelen, sti->sti_laddr_len));
+ *namelen = sti->sti_laddr_len;
+ goto done;
+ }
+
if (so->so_family == AF_UNIX) {
/* Transport has different name space - return local info */
error = 0;
@@ -4796,14 +5066,15 @@ sotpi_getsockname(struct sonode *so)
error = 0;
goto done;
}
+
/* Allocate local buffer to use with ioctl */
- addrlen = (t_uscalar_t)so->so_laddr_maxlen;
+ addrlen = (t_uscalar_t)sti->sti_laddr_maxlen;
mutex_exit(&so->so_lock);
addr = kmem_alloc(addrlen, KM_SLEEP);
/*
* Issue TI_GETMYNAME with signals masked.
- * Put the result in so_laddr_sa so that getsockname works after
+ * Put the result in sti_laddr_sa so that getsockname works after
* a shutdown(output).
* If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
* back to the socket.
@@ -4814,16 +5085,16 @@ sotpi_getsockname(struct sonode *so)
sigintr(&smask, 0);
res = 0;
- ASSERT(CRED());
+ ASSERT(cr);
error = strioctl(SOTOV(so), TI_GETMYNAME, (intptr_t)&strbuf,
- 0, K_TO_K, CRED(), &res);
+ 0, K_TO_K, cr, &res);
sigunintr(&smask);
mutex_enter(&so->so_lock);
/*
* If there is an error record the error in so_error put don't fail
* the getsockname. Instead fallback on the recorded
- * so->so_laddr_sa.
+ * sti->sti_laddr_sa.
*/
if (error) {
/*
@@ -4844,16 +5115,19 @@ sotpi_getsockname(struct sonode *so)
error = 0;
} else if (res == 0 && strbuf.len > 0 &&
(so->so_state & SS_ISBOUND)) {
- ASSERT(strbuf.len <= (int)so->so_laddr_maxlen);
- so->so_laddr_len = (socklen_t)strbuf.len;
- bcopy(addr, so->so_laddr_sa, so->so_laddr_len);
- so->so_state |= SS_LADDR_VALID;
+ ASSERT(strbuf.len <= (int)sti->sti_laddr_maxlen);
+ sti->sti_laddr_len = (socklen_t)strbuf.len;
+ bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
+ sti->sti_laddr_valid = 1;
+
+ bcopy(addr, name, MIN(sti->sti_laddr_len, *namelen));
+ *namelen = sti->sti_laddr_len;
}
kmem_free(addr, addrlen);
#ifdef DEBUG
dprintso(so, 1, ("sotpi_getsockname (tp): %s\n",
- pr_addr(so->so_family, so->so_laddr_sa,
- (t_uscalar_t)so->so_laddr_len)));
+ pr_addr(so->so_family, sti->sti_laddr_sa,
+ (t_uscalar_t)sti->sti_laddr_len)));
#endif /* DEBUG */
done:
so_unlock_single(so, SOLOCKED);
@@ -4868,9 +5142,10 @@ done:
*
* On the return most *optlenp bytes are copied to optval.
*/
+/* ARGSUSED */
int
sotpi_getsockopt(struct sonode *so, int level, int option_name,
- void *optval, socklen_t *optlenp, int flags)
+ void *optval, socklen_t *optlenp, int flags, struct cred *cr)
{
struct T_optmgmt_req optmgmt_req;
struct T_optmgmt_ack *optmgmt_ack;
@@ -4882,6 +5157,8 @@ sotpi_getsockopt(struct sonode *so, int level, int option_name,
t_uscalar_t maxlen = *optlenp;
t_uscalar_t len;
uint32_t value;
+ struct timeval tmo_val; /* used for SO_RCVTIMEO, SO_SNDTIMEO */
+ struct so_snd_bufinfo snd_bufinfo; /* used for zero copy */
dprintso(so, 1, ("sotpi_getsockopt(%p, 0x%x, 0x%x, %p, %p) %s\n",
(void *)so, level, option_name, optval, (void *)optlenp,
@@ -4914,8 +5191,6 @@ sotpi_getsockopt(struct sonode *so, int level, int option_name,
#ifdef notyet
case SO_SNDLOWAT:
case SO_RCVLOWAT:
- case SO_SNDTIMEO:
- case SO_RCVTIMEO:
#endif /* notyet */
case SO_DOMAIN:
case SO_DGRAM_ERRIND:
@@ -4925,6 +5200,14 @@ sotpi_getsockopt(struct sonode *so, int level, int option_name,
goto done2;
}
break;
+ case SO_RCVTIMEO:
+ case SO_SNDTIMEO:
+ if (maxlen < (t_uscalar_t)sizeof (struct timeval)) {
+ error = EINVAL;
+ eprintsoline(so, error);
+ goto done2;
+ }
+ break;
case SO_LINGER:
if (maxlen < (t_uscalar_t)sizeof (struct linger)) {
error = EINVAL;
@@ -4932,6 +5215,14 @@ sotpi_getsockopt(struct sonode *so, int level, int option_name,
goto done2;
}
break;
+ case SO_SND_BUFINFO:
+ if (maxlen < (t_uscalar_t)
+ sizeof (struct so_snd_bufinfo)) {
+ error = EINVAL;
+ eprintsoline(so, error);
+ goto done2;
+ }
+ break;
}
len = (t_uscalar_t)sizeof (uint32_t); /* Default */
@@ -4943,7 +5234,7 @@ sotpi_getsockopt(struct sonode *so, int level, int option_name,
goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
case SO_ERROR:
- value = sogeterr(so);
+ value = sogeterr(so, B_TRUE);
option = &value;
goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
@@ -5072,15 +5363,33 @@ sotpi_getsockopt(struct sonode *so, int level, int option_name,
value = so->so_rcvlowat;
option = &value;
break;
+#endif /* notyet */
case SO_SNDTIMEO:
- value = so->so_sndtimeo;
- option = &value;
+ case SO_RCVTIMEO: {
+ clock_t val;
+ if (option_name == SO_RCVTIMEO)
+ val = drv_hztousec(so->so_rcvtimeo);
+ else
+ val = drv_hztousec(so->so_sndtimeo);
+ tmo_val.tv_sec = val / (1000 * 1000);
+ tmo_val.tv_usec = val % (1000 * 1000);
+ option = &tmo_val;
+ len = (t_uscalar_t)sizeof (struct timeval);
break;
- case SO_RCVTIMEO:
- value = so->so_rcvtimeo;
- option = &value;
+ }
+ case SO_SND_BUFINFO: {
+ snd_bufinfo.sbi_wroff =
+ (so->so_proto_props).sopp_wroff;
+ snd_bufinfo.sbi_maxblk =
+ (so->so_proto_props).sopp_maxblk;
+ snd_bufinfo.sbi_maxpsz =
+ (so->so_proto_props).sopp_maxpsz;
+ snd_bufinfo.sbi_tail =
+ (so->so_proto_props).sopp_tail;
+ option = &snd_bufinfo;
+ len = (t_uscalar_t)sizeof (struct so_snd_bufinfo);
break;
-#endif /* notyet */
+ }
}
}
@@ -5159,6 +5468,7 @@ done:
done2:
so_unlock_single(so, SOLOCKED);
mutex_exit(&so->so_lock);
+
return (error);
}
@@ -5168,9 +5478,10 @@ done2:
* SOL_SOCKET options will not fail just because the T_SVR4_OPTMGMT_REQ fails -
* setsockopt has to work even if the transport does not support the option.
*/
+/* ARGSUSED */
int
sotpi_setsockopt(struct sonode *so, int level, int option_name,
- const void *optval, t_uscalar_t optlen)
+ const void *optval, t_uscalar_t optlen, struct cred *cr)
{
struct T_optmgmt_req optmgmt_req;
struct opthdr oh;
@@ -5182,7 +5493,6 @@ sotpi_setsockopt(struct sonode *so, int level, int option_name,
(void *)so, level, option_name, optval, optlen,
pr_state(so->so_state, so->so_mode)));
-
/* X/Open requires this check */
if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
if (xnet_check_print)
@@ -5190,12 +5500,6 @@ sotpi_setsockopt(struct sonode *so, int level, int option_name,
return (EINVAL);
}
- /* Caller allocates aligned optval, or passes null */
- ASSERT(((uintptr_t)optval & (sizeof (t_scalar_t) - 1)) == 0);
- /* If optval is null optlen is 0, and vice-versa */
- ASSERT(optval != NULL || optlen == 0);
- ASSERT(optlen != 0 || optval == NULL);
-
mutex_enter(&so->so_lock);
so_lock_single(so); /* Set SOLOCKED */
mutex_exit(&so->so_lock);
@@ -5207,8 +5511,9 @@ sotpi_setsockopt(struct sonode *so, int level, int option_name,
*/
if ((level == SOL_SOCKET || level == IPPROTO_TCP) &&
(so->so_family == AF_INET || so->so_family == AF_INET6) &&
- (so->so_version == SOV_SOCKSTREAM) && (so->so_priv != NULL)) {
- tcp_t *tcp = so->so_priv;
+ (so->so_version == SOV_SOCKSTREAM) &&
+ (so->so_proto_handle != NULL)) {
+ tcp_t *tcp = (tcp_t *)so->so_proto_handle;
boolean_t onoff;
#define intvalue (*(int32_t *)optval)
@@ -5233,6 +5538,18 @@ sotpi_setsockopt(struct sonode *so, int level, int option_name,
onoff = intvalue != 0;
handled = B_TRUE;
break;
+ case SO_SNDTIMEO:
+ case SO_RCVTIMEO:
+ if (optlen !=
+ (t_uscalar_t)sizeof (struct timeval)) {
+ error = EINVAL;
+ eprintsoline(so, error);
+ mutex_enter(&so->so_lock);
+ goto done2;
+ }
+ ASSERT(optval);
+ handled = B_TRUE;
+ break;
case SO_LINGER:
if (optlen !=
(t_uscalar_t)sizeof (struct linger)) {
@@ -5373,7 +5690,7 @@ sotpi_setsockopt(struct sonode *so, int level, int option_name,
mutex_enter(&so->so_lock);
if (error) {
eprintsoline(so, error);
- goto done;
+ goto done2;
}
error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
(t_uscalar_t)sizeof (struct T_optmgmt_ack), &mp, 0);
@@ -5406,8 +5723,6 @@ done:
#ifdef notyet
case SO_SNDLOWAT:
case SO_RCVLOWAT:
- case SO_SNDTIMEO:
- case SO_RCVTIMEO:
#endif /* notyet */
case SO_DGRAM_ERRIND:
if (optlen != (t_uscalar_t)sizeof (int32_t)) {
@@ -5418,6 +5733,16 @@ done:
ASSERT(optval);
handled = B_TRUE;
break;
+ case SO_SNDTIMEO:
+ case SO_RCVTIMEO:
+ if (optlen != (t_uscalar_t)sizeof (struct timeval)) {
+ error = EINVAL;
+ eprintsoline(so, error);
+ goto done2;
+ }
+ ASSERT(optval);
+ handled = B_TRUE;
+ break;
case SO_LINGER:
if (optlen != (t_uscalar_t)sizeof (struct linger)) {
error = EINVAL;
@@ -5474,19 +5799,19 @@ done:
case SO_DGRAM_ERRIND:
if (intvalue != 0) {
dprintso(so, 1,
- ("sotpi_setsockopt: setting 0x%x\n",
+ ("socket_setsockopt: setting 0x%x\n",
option_name));
so->so_options |= option_name;
} else {
dprintso(so, 1,
- ("sotpi_setsockopt: clearing 0x%x\n",
+ ("socket_setsockopt: clearing 0x%x\n",
option_name));
so->so_options &= ~option_name;
}
break;
/*
* The following options are only returned by us when the
- * T_SVR4_OPTMGMT_REQ fails.
+ * transport layer fails.
* XXX XPG 4.2 applications retrieve SO_RCVBUF from sockfs
* since the transport might adjust the value and not
* return exactly what was set by the application.
@@ -5497,6 +5822,9 @@ done:
case SO_RCVBUF:
so->so_rcvbuf = intvalue;
break;
+ case SO_RCVPSH:
+ so->so_rcv_timer_interval = intvalue;
+ break;
#ifdef notyet
/*
* We do not implement the semantics of these options
@@ -5508,13 +5836,17 @@ done:
case SO_RCVLOWAT:
so->so_rcvlowat = intvalue;
break;
+#endif /* notyet */
case SO_SNDTIMEO:
- so->so_sndtimeo = intvalue;
- break;
- case SO_RCVTIMEO:
- so->so_rcvtimeo = intvalue;
+ case SO_RCVTIMEO: {
+ struct timeval *tl = (struct timeval *)optval;
+ clock_t val = tl->tv_sec * 1000 * 1000 + tl->tv_usec;
+ if (option_name == SO_RCVTIMEO)
+ so->so_rcvtimeo = drv_usectohz(val);
+ else
+ so->so_sndtimeo = drv_usectohz(val);
break;
-#endif /* notyet */
+ }
}
#undef intvalue
@@ -5529,8 +5861,1121 @@ done:
}
}
done2:
-ret:
so_unlock_single(so, SOLOCKED);
mutex_exit(&so->so_lock);
return (error);
}
+
+/*
+ * Close a TPI socket.
+ *
+ * With SOLOCKED set, this routine:
+ *	- calls nl7c_close() if NL7C was enabled, after clearing
+ *	  sti_nl7c_flags;
+ *	- if the vnode still has a stream: calls so_unix_close() for
+ *	  AF_UNIX, drops so_lock (SOLOCKED stays set), releases the
+ *	  AF_UNIX bound vnode, and for AF_INET/AF_INET6 clears the
+ *	  read/write putdata hooks and releases any kssl entry and
+ *	  context, then calls strclose() and clears v_stream before
+ *	  retaking so_lock;
+ *	- flushes any pending T_DISCON_IND via so_flush_discon_ind().
+ * After the lock is released, the driver hold is dropped for sockets
+ * with SOCLONE set.
+ *
+ * Returns the result of strclose(), or 0 if there was no stream to close.
+ */
+/* ARGSUSED */
+int
+sotpi_close(struct sonode *so, int flag, struct cred *cr)
+{
+ struct vnode *vp = SOTOV(so);
+ dev_t dev;
+ int error = 0;
+ sotpi_info_t *sti = SOTOTPI(so);
+
+ dprintso(so, 1, ("sotpi_close(%p, %x) %s\n",
+ (void *)vp, flag, pr_state(so->so_state, so->so_mode)));
+
+ dev = sti->sti_dev;
+
+ ASSERT(STREAMSTAB(getmajor(dev)));
+
+ mutex_enter(&so->so_lock);
+ so_lock_single(so); /* Set SOLOCKED */
+
+ /*
+ * Only call NL7C's close on last open reference.
+ */
+ if (sti->sti_nl7c_flags & NL7C_ENABLED) {
+ sti->sti_nl7c_flags = 0;
+ nl7c_close(so);
+ }
+
+ /*
+ * Only call the close routine when the last open reference through
+ * any [s, v]node goes away.
+ */
+ if (vp->v_stream != NULL) {
+ vnode_t *ux_vp;
+
+ if (so->so_family == AF_UNIX) {
+ /* Could avoid this when CANTSENDMORE for !dgram */
+ so_unix_close(so);
+ }
+
+ mutex_exit(&so->so_lock);
+ /*
+ * Disassemble the linkage from the AF_UNIX underlying file
+ * system vnode to this socket (by atomically clearing
+ * v_stream in vn_rele_stream) before strclose clears sd_vnode
+ * and frees the stream head.
+ */
+ if ((ux_vp = sti->sti_ux_bound_vp) != NULL) {
+ ASSERT(ux_vp->v_stream);
+ sti->sti_ux_bound_vp = NULL;
+ vn_rele_stream(ux_vp);
+ }
+ if (so->so_family == AF_INET || so->so_family == AF_INET6) {
+ strsetrwputdatahooks(SOTOV(so), NULL, NULL);
+ if (sti->sti_kssl_ent != NULL) {
+ kssl_release_ent(sti->sti_kssl_ent, so,
+ sti->sti_kssl_type);
+ sti->sti_kssl_ent = NULL;
+ }
+ if (sti->sti_kssl_ctx != NULL) {
+ kssl_release_ctx(sti->sti_kssl_ctx);
+ sti->sti_kssl_ctx = NULL;
+ }
+ sti->sti_kssl_type = KSSL_NO_PROXY;
+ }
+ error = strclose(vp, flag, cr);
+ vp->v_stream = NULL;
+ mutex_enter(&so->so_lock);
+ }
+
+ /*
+ * Flush the T_DISCON_IND on sti_discon_ind_mp.
+ */
+ so_flush_discon_ind(so);
+
+ so_unlock_single(so, SOLOCKED);
+ mutex_exit(&so->so_lock);
+
+ /*
+ * Needed for STREAMs.
+ * Decrement the device driver's reference count for streams
+ * opened via the clone dip. The driver was held in clone_open().
+ * The absence of clone_close() forces this asymmetry.
+ */
+ if (so->so_flag & SOCLONE)
+ ddi_rele_driver(getmajor(dev));
+
+ return (error);
+}
+
+/*
+ * ioctl entry point for TPI sockets.
+ *
+ * Dispatch happens in two stages:
+ *  1. _I_INSERT/_I_REMOVE are rejected with EOPNOTSUPP. The module-list
+ *     ioctls (I_FIND, I_LIST, I_LOOK, I_POP, I_PUSH) are handed to
+ *     socktpi_plumbioctl() under sti_plumb_lock. Any other command on a
+ *     socket whose version is SOV_STREAM goes straight to strioctl().
+ *  2. For all other sockets the socket-specific commands (FIONBIO,
+ *     FIOASYNC, SIOCSPGRP/FIOSETOWN, SIOCGPGRP/FIOGETOWN, SIOCATMARK,
+ *     _I_GETPEERCRED) are handled here, a fixed set of STREAMS ioctls is
+ *     rejected with EOPNOTSUPP, and everything else is passed to
+ *     strioctl() - except that STREAMS (STR) ioctls on a SOV_SOCKBSD
+ *     socket are rejected with EOPNOTSUPP.
+ *
+ * Returns 0 on success or an errno value; EFAULT is returned when
+ * so_copyin()/so_copyout() of the argument fails.
+ */
+static int
+sotpi_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
+ struct cred *cr, int32_t *rvalp)
+{
+ struct vnode *vp = SOTOV(so);
+ sotpi_info_t *sti = SOTOTPI(so);
+ int error = 0;
+
+ dprintso(so, 0, ("sotpi_ioctl: cmd 0x%x, arg 0x%lx, state %s\n",
+ cmd, arg, pr_state(so->so_state, so->so_mode)));
+
+ switch (cmd) {
+ case _I_INSERT:
+ case _I_REMOVE:
+ /*
+ * Since there's no compelling reason to support these ioctls
+ * on sockets, and doing so would increase the complexity
+ * markedly, prevent it.
+ */
+ return (EOPNOTSUPP);
+
+ case I_FIND:
+ case I_LIST:
+ case I_LOOK:
+ case I_POP:
+ case I_PUSH:
+ /*
+ * To prevent races and inconsistencies between the actual
+ * state of the stream and the state according to the sonode,
+ * we serialize all operations which modify or operate on the
+ * list of modules on the socket's stream.
+ */
+ mutex_enter(&sti->sti_plumb_lock);
+ error = socktpi_plumbioctl(vp, cmd, arg, mode, cr, rvalp);
+ mutex_exit(&sti->sti_plumb_lock);
+ return (error);
+
+ default:
+ if (so->so_version != SOV_STREAM)
+ break;
+
+ /*
+ * The imaginary "sockmod" has been popped; act as a stream.
+ */
+ return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
+ }
+
+ ASSERT(so->so_version != SOV_STREAM);
+
+ /*
+ * Process socket-specific ioctls.
+ */
+ switch (cmd) {
+ case FIONBIO: {
+ int32_t value;
+
+ if (so_copyin((void *)arg, &value, sizeof (int32_t),
+ (mode & (int)FKIOCTL)))
+ return (EFAULT);
+
+ mutex_enter(&so->so_lock);
+ if (value) {
+ so->so_state |= SS_NDELAY;
+ } else {
+ so->so_state &= ~SS_NDELAY;
+ }
+ mutex_exit(&so->so_lock);
+ return (0);
+ }
+
+ case FIOASYNC: {
+ int32_t value;
+
+ if (so_copyin((void *)arg, &value, sizeof (int32_t),
+ (mode & (int)FKIOCTL)))
+ return (EFAULT);
+
+ mutex_enter(&so->so_lock);
+ /*
+ * SS_ASYNC flag not already set correctly?
+ * (!value != !(so->so_state & SS_ASYNC))
+ * but some engineers find that too hard to read.
+ */
+ if (value == 0 && (so->so_state & SS_ASYNC) != 0 ||
+ value != 0 && (so->so_state & SS_ASYNC) == 0)
+ error = so_flip_async(so, vp, mode, cr);
+ mutex_exit(&so->so_lock);
+ return (error);
+ }
+
+ case SIOCSPGRP:
+ case FIOSETOWN: {
+ pid_t pgrp;
+
+ if (so_copyin((void *)arg, &pgrp, sizeof (pid_t),
+ (mode & (int)FKIOCTL)))
+ return (EFAULT);
+
+ mutex_enter(&so->so_lock);
+ dprintso(so, 1, ("setown: new %d old %d\n", pgrp, so->so_pgrp));
+ /* Any change? */
+ if (pgrp != so->so_pgrp)
+ error = so_set_siggrp(so, vp, pgrp, mode, cr);
+ mutex_exit(&so->so_lock);
+ return (error);
+ }
+ case SIOCGPGRP:
+ case FIOGETOWN:
+ if (so_copyout(&so->so_pgrp, (void *)arg,
+ sizeof (pid_t), (mode & (int)FKIOCTL)))
+ return (EFAULT);
+ return (0);
+
+ case SIOCATMARK: {
+ int retval;
+ uint_t so_state;
+
+ /*
+ * strwaitmark has a finite timeout after which it
+ * returns -1 if the mark state is undetermined.
+ * In order to avoid any race between the mark state
+ * in sockfs and the mark state in the stream head this
+ * routine loops until the mark state can be determined
+ * (or the urgent data indication has been removed by some
+ * other thread).
+ */
+ do {
+ mutex_enter(&so->so_lock);
+ so_state = so->so_state;
+ mutex_exit(&so->so_lock);
+ if (so_state & SS_RCVATMARK) {
+ retval = 1;
+ } else if (!(so_state & SS_OOBPEND)) {
+ /*
+ * No SIGURG has been generated -- there is no
+ * pending or present urgent data. Thus can't
+ * possibly be at the mark.
+ */
+ retval = 0;
+ } else {
+ /*
+ * Have the stream head wait until there is
+ * either some messages on the read queue, or
+ * STRATMARK or STRNOTATMARK gets set. The
+ * STRNOTATMARK flag is used so that the
+ * transport can send up a MSGNOTMARKNEXT
+ * M_DATA to indicate that it is not
+ * at the mark and additional data is not about
+ * to be sent upstream.
+ *
+ * If the mark state is undetermined this will
+ * return -1 and we will loop rechecking the
+ * socket state.
+ */
+ retval = strwaitmark(vp);
+ }
+ } while (retval == -1);
+
+ if (so_copyout(&retval, (void *)arg, sizeof (int),
+ (mode & (int)FKIOCTL)))
+ return (EFAULT);
+ return (0);
+ }
+
+ case I_FDINSERT:
+ case I_SENDFD:
+ case I_RECVFD:
+ case I_ATMARK:
+ case _SIOCSOCKFALLBACK:
+ /*
+ * These ioctls do not apply to sockets. I_FDINSERT can be
+ * used to send M_PROTO messages without modifying the socket
+ * state. I_SENDFD/RECVFD should not be used for socket file
+ * descriptor passing since they assume a twisted stream.
+ * SIOCATMARK must be used instead of I_ATMARK.
+ *
+ * _SIOCSOCKFALLBACK from an application should never be
+ * processed. It is only generated by socktpi_open() or
+ * in response to I_POP or I_PUSH.
+ */
+#ifdef DEBUG
+ zcmn_err(getzoneid(), CE_WARN,
+ "Unsupported STREAMS ioctl 0x%x on socket. "
+ "Pid = %d\n", cmd, curproc->p_pid);
+#endif /* DEBUG */
+ return (EOPNOTSUPP);
+
+ case _I_GETPEERCRED:
+ /* Only honored when FKIOCTL is set in mode; arg is then used as a k_peercred_t pointer. */
+ if ((mode & FKIOCTL) == 0)
+ return (EINVAL);
+
+ mutex_enter(&so->so_lock);
+ if ((so->so_mode & SM_CONNREQUIRED) == 0) {
+ error = ENOTSUP;
+ } else if ((so->so_state & SS_ISCONNECTED) == 0) {
+ error = ENOTCONN;
+ } else if (so->so_peercred != NULL) {
+ k_peercred_t *kp = (k_peercred_t *)arg;
+ kp->pc_cr = so->so_peercred;
+ kp->pc_cpid = so->so_cpid;
+ crhold(so->so_peercred);
+ } else {
+ error = EINVAL;
+ }
+ mutex_exit(&so->so_lock);
+ return (error);
+
+ default:
+ /*
+ * Do the higher-order bits of the ioctl cmd indicate
+ * that it is an I_* streams ioctl?
+ */
+ if ((cmd & 0xffffff00U) == STR &&
+ so->so_version == SOV_SOCKBSD) {
+#ifdef DEBUG
+ zcmn_err(getzoneid(), CE_WARN,
+ "Unsupported STREAMS ioctl 0x%x on socket. "
+ "Pid = %d\n", cmd, curproc->p_pid);
+#endif /* DEBUG */
+ return (EOPNOTSUPP);
+ }
+ return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
+ }
+}
+
+/*
+ * Handle plumbing-related ioctls.
+ */
+static int
+socktpi_plumbioctl(struct vnode *vp, int cmd, intptr_t arg, int mode,
+ struct cred *cr, int32_t *rvalp)
+{
+ static const char sockmod_name[] = "sockmod";
+ struct sonode *so = VTOSO(vp);
+ char mname[FMNAMESZ + 1];
+ int error;
+ sotpi_info_t *sti = SOTOTPI(so);
+
+ ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));
+
+ if (so->so_version == SOV_SOCKBSD)
+ return (EOPNOTSUPP);
+
+ if (so->so_version == SOV_STREAM) {
+ /*
+ * The imaginary "sockmod" has been popped - act as a stream.
+ * If this is a push of sockmod then change back to a socket.
+ */
+ if (cmd == I_PUSH) {
+ error = ((mode & FKIOCTL) ? copystr : copyinstr)(
+ (void *)arg, mname, sizeof (mname), NULL);
+
+ if (error == 0 && strcmp(mname, sockmod_name) == 0) {
+ dprintso(so, 0, ("socktpi_ioctl: going to "
+ "socket version\n"));
+ so_stream2sock(so);
+ return (0);
+ }
+ }
+ return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
+ }
+
+ switch (cmd) {
+ case I_PUSH:
+ if (sti->sti_direct) {
+ mutex_enter(&so->so_lock);
+ so_lock_single(so);
+ mutex_exit(&so->so_lock);
+
+ error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
+ CRED(), rvalp);
+
+ mutex_enter(&so->so_lock);
+ if (error == 0)
+ sti->sti_direct = 0;
+ so_unlock_single(so, SOLOCKED);
+ mutex_exit(&so->so_lock);
+
+ if (error != 0)
+ return (error);
+ }
+
+ error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
+ if (error == 0)
+ sti->sti_pushcnt++;
+ return (error);
+
+ case I_POP:
+ if (sti->sti_pushcnt == 0) {
+ /* Emulate sockmod being popped */
+ dprintso(so, 0,
+ ("socktpi_ioctl: going to STREAMS version\n"));
+ return (so_sock2stream(so));
+ }
+
+ error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
+ if (error == 0)
+ sti->sti_pushcnt--;
+ return (error);
+
+ case I_LIST: {
+ struct str_mlist *kmlistp, *umlistp;
+ struct str_list kstrlist;
+ ssize_t kstrlistsize;
+ int i, nmods;
+
+ STRUCT_DECL(str_list, ustrlist);
+ STRUCT_INIT(ustrlist, mode);
+
+ if (arg == NULL) {
+ error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
+ if (error == 0)
+ (*rvalp)++; /* Add one for sockmod */
+ return (error);
+ }
+
+ error = so_copyin((void *)arg, STRUCT_BUF(ustrlist),
+ STRUCT_SIZE(ustrlist), mode & FKIOCTL);
+ if (error != 0)
+ return (error);
+
+ nmods = STRUCT_FGET(ustrlist, sl_nmods);
+ if (nmods <= 0)
+ return (EINVAL);
+ /*
+ * Ceiling nmods at nstrpush to prevent someone from
+ * maliciously consuming lots of kernel memory.
+ */
+ nmods = MIN(nmods, nstrpush);
+
+ kstrlistsize = (nmods + 1) * sizeof (struct str_mlist);
+ kstrlist.sl_nmods = nmods;
+ kstrlist.sl_modlist = kmem_zalloc(kstrlistsize, KM_SLEEP);
+
+ error = strioctl(vp, cmd, (intptr_t)&kstrlist, mode, K_TO_K,
+ cr, rvalp);
+ if (error != 0)
+ goto done;
+
+ /*
+ * Considering the module list as a 0-based array of sl_nmods
+ * modules, sockmod should conceptually exist at slot
+ * sti_pushcnt. Insert sockmod at this location by sliding all
+ * of the module names after so_pushcnt over by one. We know
+ * that there will be room to do this since we allocated
+ * sl_modlist with an additional slot.
+ */
+ for (i = kstrlist.sl_nmods; i > sti->sti_pushcnt; i--)
+ kstrlist.sl_modlist[i] = kstrlist.sl_modlist[i - 1];
+
+ (void) strcpy(kstrlist.sl_modlist[i].l_name, sockmod_name);
+ kstrlist.sl_nmods++;
+
+ /*
+ * Copy all of the entries out to ustrlist.
+ */
+ kmlistp = kstrlist.sl_modlist;
+ umlistp = STRUCT_FGETP(ustrlist, sl_modlist);
+ for (i = 0; i < nmods && i < kstrlist.sl_nmods; i++) {
+ error = so_copyout(kmlistp++, umlistp++,
+ sizeof (struct str_mlist), mode & FKIOCTL);
+ if (error != 0)
+ goto done;
+ }
+
+ error = so_copyout(&i, (void *)arg, sizeof (int32_t),
+ mode & FKIOCTL);
+ if (error == 0)
+ *rvalp = 0;
+ done:
+ kmem_free(kstrlist.sl_modlist, kstrlistsize);
+ return (error);
+ }
+ case I_LOOK:
+ if (sti->sti_pushcnt == 0) {
+ return (so_copyout(sockmod_name, (void *)arg,
+ sizeof (sockmod_name), mode & FKIOCTL));
+ }
+ return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
+
+ case I_FIND:
+ error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
+ if (error && error != EINVAL)
+ return (error);
+
+ /* if not found and string was sockmod return 1 */
+ if (*rvalp == 0 || error == EINVAL) {
+ error = ((mode & FKIOCTL) ? copystr : copyinstr)(
+ (void *)arg, mname, sizeof (mname), NULL);
+ if (error == ENAMETOOLONG)
+ error = EINVAL;
+
+ if (error == 0 && strcmp(mname, sockmod_name) == 0)
+ *rvalp = 1;
+ }
+ return (error);
+
+ default:
+ panic("socktpi_plumbioctl: unknown ioctl %d", cmd);
+ break;
+ }
+
+ return (0);
+}
+
+/*
+ * Wrapper around the streams poll routine that implements socket poll
+ * semantics.
+ * The sockfs never calls pollwakeup itself - the stream head takes care
+ * of all pollwakeups. Since sockfs never holds so_lock when calling the
+ * stream head there can never be a deadlock due to holding so_lock across
+ * pollwakeup and acquiring so_lock in this routine.
+ *
+ * However, since the performance of VOP_POLL is critical we avoid
+ * acquiring so_lock here. This is based on two assumptions:
+ * - The poll implementation holds locks to serialize the VOP_POLL call
+ * and a pollwakeup for the same pollhead. This ensures that should
+ * e.g. so_state change during a sotpi_poll call the pollwakeup
+ * (which strsock_* and strrput conspire to issue) is issued after
+ * the state change. Thus the pollwakeup will block until VOP_POLL has
+ * returned and then wake up poll and have it call VOP_POLL again.
+ * - The reading of so_state without holding so_lock does not result in
+ * stale data that is older than the latest state change that has dropped
+ * so_lock. This is ensured by the mutex_exit issuing the appropriate
+ * memory barrier to force the data into the coherency domain.
+ */
+static int
+sotpi_poll(
+ struct sonode *so,
+ short events,
+ int anyyet,
+ short *reventsp,
+ struct pollhead **phpp)
+{
+ /* The caller's original event mask; 'events' itself is edited below. */
+ short origevents = events;
+ struct vnode *vp = SOTOV(so);
+ int error;
+ int so_state = so->so_state; /* snapshot */
+ sotpi_info_t *sti = SOTOTPI(so);
+
+ dprintso(so, 0, ("socktpi_poll(%p): state %s err %d\n",
+ (void *)vp, pr_state(so_state, so->so_mode), so->so_error));
+
+ ASSERT(vp->v_type == VSOCK);
+ ASSERT(vp->v_stream != NULL);
+
+ if (so->so_version == SOV_STREAM) {
+ /* The imaginary "sockmod" has been popped - act as a stream */
+ return (strpoll(vp->v_stream, events, anyyet,
+ reventsp, phpp));
+ }
+
+ if (!(so_state & SS_ISCONNECTED) &&
+ (so->so_mode & SM_CONNREQUIRED)) {
+ /* Not connected yet - turn off write side events */
+ events &= ~(POLLOUT|POLLWRBAND);
+ }
+ /*
+ * Check for errors without calling strpoll if the caller wants them.
+ * In sockets the errors are represented as input/output events
+ * and there is no need to ask the stream head for this information.
+ * Note that the original (unedited) event mask is used here, so a
+ * pending error is reported as POLLOUT even on an unconnected socket.
+ */
+ if (so->so_error != 0 &&
+ ((POLLIN|POLLRDNORM|POLLOUT) & origevents) != 0) {
+ *reventsp = (POLLIN|POLLRDNORM|POLLOUT) & origevents;
+ return (0);
+ }
+ /*
+ * Ignore M_PROTO only messages such as the T_EXDATA_IND messages.
+ * These message with only an M_PROTO/M_PCPROTO part and no M_DATA
+ * will not trigger a POLLIN event with POLLRDDATA set.
+ * The handling of urgent data (causing POLLRDBAND) is done by
+ * inspecting SS_OOBPEND below.
+ */
+ events |= POLLRDDATA;
+
+ /*
+ * After shutdown(output) a stream head write error is set.
+ * However, we should not return output events.
+ */
+ events |= POLLNOERR;
+ error = strpoll(vp->v_stream, events, anyyet,
+ reventsp, phpp);
+ if (error)
+ return (error);
+
+ /* POLLNOERR was requested above, so POLLERR is not expected back. */
+ ASSERT(!(*reventsp & POLLERR));
+
+ /*
+ * Notes on T_CONN_IND handling for sockets.
+ *
+ * If strpoll() returned without events, SR_POLLIN is guaranteed
+ * to be set, ensuring any subsequent strrput() runs pollwakeup().
+ *
+ * Since the so_lock is not held, soqueueconnind() may have run
+ * and a T_CONN_IND may be waiting. We now check for any queued
+ * T_CONN_IND msgs on sti_conn_ind_head and set appropriate events
+ * to ensure poll returns.
+ *
+ * However:
+ * If the T_CONN_IND hasn't arrived by the time strpoll() returns,
+ * when strrput() does run for an arriving M_PROTO with T_CONN_IND
+ * the following actions will occur; taken together they ensure the
+ * syscall will return.
+ *
+ * 1. If a socket, soqueueconnind() will queue the T_CONN_IND but if
+ * the accept() was run on a non-blocking socket sowaitconnind()
+ * may have already returned EWOULDBLOCK, so not be waiting to
+ * process the message. Additionally sotpi_poll() has probably
+ * proceeded past the sti_conn_ind_head check below.
+ * 2. strrput() runs pollwakeup()->pollnotify()->cv_signal() to wake
+ * this thread, however that could occur before poll_common()
+ * has entered cv_wait.
+ * 3. pollnotify() sets T_POLLWAKE, while holding the pc_lock.
+ *
+ * Before proceeding to cv_wait() in poll_common() for an event,
+ * poll_common() atomically checks for T_POLLWAKE under the pc_lock,
+ * and if set, re-calls strpoll() to ensure the late arriving
+ * T_CONN_IND is recognized, and pollsys() returns.
+ */
+
+ if (sti->sti_conn_ind_head != NULL)
+ *reventsp |= (POLLIN|POLLRDNORM) & events;
+
+ /*
+ * Urgent data pending: report POLLRDBAND if it was asked for. This
+ * reads the live so_state rather than the snapshot taken on entry.
+ */
+ if (so->so_state & SS_OOBPEND)
+ *reventsp |= POLLRDBAND & events;
+
+ /* Data parked on sti_nl7c_rcv_mp is reported as readable too. */
+ if (sti->sti_nl7c_rcv_mp != NULL) {
+ *reventsp |= (POLLIN|POLLRDNORM) & events;
+ }
+ /*
+ * On an NL7C-enabled socket, record that poll reported input.
+ * NOTE(review): presumably NL7C_POLLIN is consumed by the NL7C
+ * receive path - confirm against the nl7c code.
+ */
+ if ((sti->sti_nl7c_flags & NL7C_ENABLED) &&
+ ((POLLIN|POLLRDNORM) & *reventsp)) {
+ sti->sti_nl7c_flags |= NL7C_POLLIN;
+ }
+
+ return (0);
+}
+
+/*
+ * kmem cache constructor for TPI sonodes (sotpi_sonode_t).
+ *
+ * Runs the generic sonode constructor on the buffer, then constructs the
+ * embedded sotpi_info_t and points so_priv at it.
+ *
+ * Returns 0 on success.  On failure the error from the failing constructor
+ * is returned and anything already constructed has been torn down again;
+ * the buffer is not touched after that.
+ */
+/*ARGSUSED*/
+static int
+socktpi_constructor(void *buf, void *cdrarg, int kmflags)
+{
+	sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
+	int error;
+
+	error = sonode_constructor(buf, cdrarg, kmflags);
+	if (error != 0)
+		return (error);
+
+	error = i_sotpi_info_constructor(&st->st_info);
+	if (error != 0) {
+		/*
+		 * Undo the sonode construction and bail out right away;
+		 * so_priv must not be written into a destructed object.
+		 */
+		sonode_destructor(buf, cdrarg);
+		return (error);
+	}
+
+	st->st_sonode.so_priv = &st->st_info;
+
+	return (0);
+}
+
+/*
+ * kmem cache destructor for TPI sonodes: detaches the embedded TPI info
+ * from so_priv, destroys it, then runs the generic sonode destructor.
+ */
+/*ARGSUSED1*/
+static void
+socktpi_destructor(void *buf, void *cdrarg)
+{
+	sotpi_sonode_t *st = buf;
+	sotpi_info_t *infop = &st->st_info;
+
+	/* so_priv is expected to still point at the embedded info. */
+	ASSERT(st->st_sonode.so_priv == &st->st_info);
+	st->st_sonode.so_priv = NULL;
+
+	i_sotpi_info_destructor(infop);
+	sonode_destructor(buf, cdrarg);
+}
+
+/*
+ * Constructor for the AF_UNIX sonode cache.  Does everything
+ * socktpi_constructor() does and, on success, links the new sonode at
+ * the head of the global socklist under socklist.sl_lock.
+ *
+ * Returns the result of socktpi_constructor().
+ */
+static int
+socktpi_unix_constructor(void *buf, void *cdrarg, int kmflags)
+{
+	struct sonode *so;
+	struct sonode *oldhead;
+	sotpi_info_t *sti;
+	int retval;
+
+	retval = socktpi_constructor(buf, cdrarg, kmflags);
+	if (retval != 0)
+		return (retval);
+
+	so = (struct sonode *)buf;
+	sti = SOTOTPI(so);
+
+	mutex_enter(&socklist.sl_lock);
+
+	/* Push onto the front of the doubly linked list. */
+	oldhead = socklist.sl_list;
+	sti->sti_next_so = oldhead;
+	sti->sti_prev_so = NULL;
+	if (oldhead != NULL)
+		SOTOTPI(oldhead)->sti_prev_so = so;
+	socklist.sl_list = so;
+
+	mutex_exit(&socklist.sl_lock);
+
+	return (retval);
+}
+
+/*
+ * Destructor for the AF_UNIX sonode cache.  Unlinks the sonode from the
+ * global socklist under socklist.sl_lock, then hands the buffer to
+ * socktpi_destructor().
+ */
+static void
+socktpi_unix_destructor(void *buf, void *cdrarg)
+{
+	struct sonode *so = (struct sonode *)buf;
+	sotpi_info_t *sti = SOTOTPI(so);
+	struct sonode *next;
+	struct sonode *prev;
+
+	mutex_enter(&socklist.sl_lock);
+
+	next = sti->sti_next_so;
+	prev = sti->sti_prev_so;
+
+	if (next != NULL)
+		SOTOTPI(next)->sti_prev_so = prev;
+
+	/* No predecessor means this node was the list head. */
+	if (prev == NULL)
+		socklist.sl_list = next;
+	else
+		SOTOTPI(prev)->sti_next_so = next;
+
+	mutex_exit(&socklist.sl_lock);
+
+	socktpi_destructor(buf, cdrarg);
+}
+
+/*
+ * Create the two kmem caches that back TPI sonodes.  Always returns 0.
+ */
+int
+socktpi_init(void)
+{
+	/* Both caches hand out objects of the same size. */
+	const size_t nodesz = sizeof (struct sotpi_sonode);
+
+	/* General purpose TPI sonode cache. */
+	socktpi_cache = kmem_cache_create("socktpi_cache", nodesz, 0,
+	    socktpi_constructor, socktpi_destructor, NULL, NULL, NULL, 0);
+
+	/*
+	 * AF_UNIX gets a cache of its own; its constructor and destructor
+	 * keep the sonodes on socklist so netstat(1m) can track them.
+	 */
+	socktpi_unix_cache = kmem_cache_create("socktpi_unix_cache", nodesz, 0,
+	    socktpi_unix_constructor, socktpi_unix_destructor, NULL, NULL,
+	    NULL, 0);
+
+	return (0);
+}
+
+/*
+ * Given a non-TPI sonode, allocate and prep it to be ready for TPI.
+ *
+ * Caller must still update state and mode using sotpi_update_state().
+ *
+ * Returns the STREAM queue that the protocol should use.
+ */
+queue_t *
+sotpi_convert_sonode(struct sonode *so, struct sockparams *newsp,
+ boolean_t *direct, struct cred *cr)
+{
+ sotpi_info_t *sti;
+ /* Saved before being overwritten below; later kept in sti_orig_sp. */
+ struct sockparams *origsp = so->so_sockparams;
+ sock_lower_handle_t handle = so->so_proto_handle;
+ uint_t old_state = so->so_state;
+ struct stdata *stp;
+ struct vnode *vp;
+ queue_t *q;
+
+ *direct = B_FALSE;
+ so->so_sockparams = newsp;
+ /*
+ * Allocate and initialize fields required by TPI.
+ * The allocation uses KM_SLEEP, so its return value is ignored.
+ */
+ (void) sotpi_info_create(so, KM_SLEEP);
+ sotpi_info_init(so);
+
+ if (sotpi_init(so, NULL, cr, SO_FALLBACK) != 0) {
+ /*
+ * Undo the TPI setup and put so_state back.
+ * NOTE(review): so_sockparams is left pointing at newsp on
+ * this path - confirm that the caller restores it.
+ */
+ sotpi_info_fini(so);
+ sotpi_info_destroy(so);
+ so->so_state = old_state;
+ return (NULL);
+ }
+ ASSERT(handle == so->so_proto_handle);
+ sti = SOTOTPI(so);
+ /* Tell the caller whether sotpi_init() left sti_direct set. */
+ if (sti->sti_direct != 0)
+ *direct = B_TRUE;
+
+ /*
+ * Keep the original sp around so we can properly dispose of the
+ * sonode when the socket is being closed.
+ */
+ sti->sti_orig_sp = origsp;
+
+ so_basic_strinit(so); /* skips the T_CAPABILITY_REQ */
+ so_alloc_addr(so, so->so_max_addr_len);
+
+ /*
+ * If the application has done a SIOCSPGRP, make sure the
+ * STREAM head is aware. This needs to take place before
+ * the protocol starts sending up messages. Otherwise we
+ * might fail to generate SIGPOLL.
+ *
+ * It is possible that the application will receive duplicate
+ * signals if some were already generated for either data or
+ * connection indications.
+ */
+ if (so->so_pgrp != 0) {
+ mutex_enter(&so->so_lock);
+ /* If the events cannot be set, forget the process group. */
+ if (so_set_events(so, so->so_vnode, cr) != 0)
+ so->so_pgrp = 0;
+ mutex_exit(&so->so_lock);
+ }
+
+ /*
+ * Determine which queue to use.
+ */
+ vp = SOTOV(so);
+ stp = vp->v_stream;
+ ASSERT(stp != NULL);
+ q = stp->sd_wrq->q_next;
+
+ /*
+ * Skip any modules that may have been auto pushed when the device
+ * was opened: walk to the last write queue in the chain and return
+ * its read-side partner.
+ */
+ while (q->q_next != NULL)
+ q = q->q_next;
+ q = _RD(q);
+
+ return (q);
+}
+
+/*
+ * Bring the TPI side of a sonode up to date after a conversion.
+ *
+ * so		the sonode being updated
+ * tcap		capability ack, handed to so_proc_tcapability_ack()
+ * laddr/laddrlen	local address to cache; skipped when laddrlen is 0
+ * faddr/faddrlen	peer address to cache; skipped when faddrlen is 0
+ * opts		option bits OR-ed into so_options
+ *
+ * The cached local address is marked valid only if the socket is bound
+ * (SS_ISBOUND), the peer address only if it is connected
+ * (SS_ISCONNECTED).
+ */
+void
+sotpi_update_state(struct sonode *so, struct T_capability_ack *tcap,
+    struct sockaddr *laddr, socklen_t laddrlen, struct sockaddr *faddr,
+    socklen_t faddrlen, short opts)
+{
+	sotpi_info_t *sti = SOTOTPI(so);
+
+	so_proc_tcapability_ack(so, tcap);
+
+	so->so_options |= opts;
+
+	/*
+	 * Determine whether the foreign and local address are valid.
+	 *
+	 * sti_laddr_valid and sti_faddr_valid are one-bit fields, so the
+	 * masked state must be reduced to 0/1 before it is stored;
+	 * assigning the raw mask would truncate to 0 for any flag that
+	 * is not bit 0.
+	 */
+	if (laddrlen != 0) {
+		ASSERT(laddrlen <= sti->sti_laddr_maxlen);
+		sti->sti_laddr_len = laddrlen;
+		bcopy(laddr, sti->sti_laddr_sa, laddrlen);
+		sti->sti_laddr_valid = ((so->so_state & SS_ISBOUND) != 0);
+	}
+
+	if (faddrlen != 0) {
+		ASSERT(faddrlen <= sti->sti_faddr_maxlen);
+		sti->sti_faddr_len = faddrlen;
+		bcopy(faddr, sti->sti_faddr_sa, faddrlen);
+		sti->sti_faddr_valid =
+		    ((so->so_state & SS_ISCONNECTED) != 0);
+	}
+}
+
+/*
+ * Allocate enough space to cache the local and foreign addresses.
+ */
+void
+so_alloc_addr(struct sonode *so, t_uscalar_t maxlen)
+{
+	sotpi_info_t *sti = SOTOTPI(so);
+	t_uscalar_t slotlen;
+	caddr_t base;
+
+	ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
+	ASSERT(sti->sti_laddr_len == 0 && sti->sti_faddr_len == 0);
+
+	/* Both addresses get a slot of the same, rounded-up, size. */
+	slotlen = P2ROUNDUP(maxlen, KMEM_ALIGN);
+	sti->sti_faddr_maxlen = slotlen;
+	sti->sti_laddr_maxlen = slotlen;
+	so->so_max_addr_len = slotlen;
+
+	/*
+	 * One allocation holds both slots: the local address first and
+	 * the peer address directly behind it.
+	 */
+	base = kmem_alloc(slotlen * 2, KM_SLEEP);
+	sti->sti_laddr_sa = (struct sockaddr *)base;
+	sti->sti_faddr_sa = (struct sockaddr *)(base + slotlen);
+
+	if (so->so_family != AF_UNIX)
+		return;
+
+	/*
+	 * Initialize AF_UNIX related fields.
+	 */
+	bzero(&sti->sti_ux_laddr, sizeof (sti->sti_ux_laddr));
+	bzero(&sti->sti_ux_faddr, sizeof (sti->sti_ux_faddr));
+}
+
+
+/*
+ * Return the TPI info hanging off so_priv, or NULL for a NULL sonode.
+ * Asserts that the info exists and carries the expected magic number.
+ */
+sotpi_info_t *
+sotpi_sototpi(struct sonode *so)
+{
+	sotpi_info_t *sti = NULL;
+
+	if (so != NULL) {
+		sti = (sotpi_info_t *)so->so_priv;
+
+		ASSERT(sti != NULL);
+		ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
+	}
+
+	return (sti);
+}
+
+/*
+ * Put a sotpi_info_t into its constructed state: magic set, message and
+ * address pointers cleared, NL7C state cleared, and the plumb lock and
+ * ack condition variable initialized.  Always returns 0.
+ */
+static int
+i_sotpi_info_constructor(sotpi_info_t *sti)
+{
+	sti->sti_magic = SOTPI_INFO_MAGIC;
+
+	/* Synchronization objects. */
+	mutex_init(&sti->sti_plumb_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&sti->sti_ack_cv, NULL, CV_DEFAULT, NULL);
+
+	/* No messages held. */
+	sti->sti_ack_mp = NULL;
+	sti->sti_discon_ind_mp = NULL;
+	sti->sti_unbind_mp = NULL;
+	sti->sti_conn_ind_head = sti->sti_conn_ind_tail = NULL;
+
+	/* No bound vnode and no cached addresses. */
+	sti->sti_ux_bound_vp = NULL;
+	sti->sti_laddr_sa = NULL;
+	sti->sti_faddr_sa = NULL;
+
+	/* NL7C state. */
+	sti->sti_nl7c_flags = 0;
+	sti->sti_nl7c_uri = NULL;
+	sti->sti_nl7c_rcv_mp = NULL;
+
+	return (0);
+}
+
+/*
+ * Tear down a sotpi_info_t.  The structure is expected to be back in its
+ * constructed state (nothing held, NL7C state clear); this is asserted
+ * before the lock and condition variable are destroyed.
+ */
+static void
+i_sotpi_info_destructor(sotpi_info_t *sti)
+{
+	/* Must still be a valid, fully released info structure. */
+	ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
+	ASSERT(sti->sti_ack_mp == NULL);
+	ASSERT(sti->sti_discon_ind_mp == NULL);
+	ASSERT(sti->sti_ux_bound_vp == NULL);
+	ASSERT(sti->sti_unbind_mp == NULL);
+
+	/* No connection indications left queued. */
+	ASSERT(sti->sti_conn_ind_head == NULL);
+	ASSERT(sti->sti_conn_ind_tail == NULL);
+
+	/* Cached address buffer already released. */
+	ASSERT(sti->sti_laddr_sa == NULL);
+	ASSERT(sti->sti_faddr_sa == NULL);
+
+	/* NL7C state already reset. */
+	ASSERT(sti->sti_nl7c_flags == 0);
+	ASSERT(sti->sti_nl7c_uri == NULL);
+	ASSERT(sti->sti_nl7c_rcv_mp == NULL);
+
+	cv_destroy(&sti->sti_ack_cv);
+	mutex_destroy(&sti->sti_plumb_lock);
+}
+
+/*
+ * Creates and attaches TPI information to the given sonode
+ */
+static boolean_t
+sotpi_info_create(struct sonode *so, int kmflags)
+{
+	sotpi_info_t *sti;
+
+	/* The sonode must not already carry private data. */
+	ASSERT(so->so_priv == NULL);
+
+	sti = kmem_zalloc(sizeof (sotpi_info_t), kmflags);
+	if (sti == NULL)
+		return (B_FALSE);
+
+	if (i_sotpi_info_constructor(sti) == 0) {
+		so->so_priv = (void *)sti;
+		return (B_TRUE);
+	}
+
+	/* Construction failed; give the memory back. */
+	kmem_free(sti, sizeof (sotpi_info_t));
+	return (B_FALSE);
+}
+
+/*
+ * Initializes the TPI information.
+ */
+static void
+sotpi_info_init(struct sonode *so)
+{
+	sotpi_info_t *sti = SOTOTPI(so);
+	struct vnode *vp = SOTOV(so);
+	time_t now;
+
+	/* These are expected to be clear already. */
+	ASSERT(sti->sti_ack_mp == NULL);
+	ASSERT(sti->sti_ux_bound_vp == NULL);
+	ASSERT(sti->sti_unbind_mp == NULL);
+	ASSERT(sti->sti_conn_ind_head == NULL);
+	ASSERT(sti->sti_conn_ind_tail == NULL);
+
+	/* The device comes from the socket parameters' device vnode. */
+	sti->sti_dev = so->so_sockparams->sp_sdev_info.sd_vnode->v_rdev;
+	vp->v_rdev = sti->sti_dev;
+
+	sti->sti_orig_sp = NULL;
+	sti->sti_provinfo = NULL;
+	sti->sti_pushcnt = 0;
+	sti->sti_direct = 0;
+
+	/* All three timestamps start out as "now". */
+	now = gethrestime_sec();
+	sti->sti_atime = sti->sti_mtime = sti->sti_ctime = now;
+
+	/* No delayed error and no urgent data outstanding. */
+	sti->sti_eaddr_mp = NULL;
+	sti->sti_delayed_error = 0;
+	sti->sti_oobcnt = 0;
+	sti->sti_oobsigcnt = 0;
+
+	/* No cached addresses yet. */
+	ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
+	sti->sti_laddr_sa = NULL;
+	sti->sti_faddr_sa = NULL;
+	sti->sti_laddr_maxlen = sti->sti_faddr_maxlen = 0;
+	sti->sti_laddr_len = sti->sti_faddr_len = 0;
+	sti->sti_laddr_valid = 0;
+	sti->sti_faddr_valid = 0;
+	sti->sti_faddr_noxlate = 0;
+
+	/* Initialize the kernel SSL proxy fields */
+	sti->sti_kssl_type = KSSL_NO_PROXY;
+	sti->sti_kssl_ent = NULL;
+	sti->sti_kssl_ctx = NULL;
+}
+
+/*
+ * Given a sonode, grab the TPI info and free any data.
+ */
+static void
+sotpi_info_fini(struct sonode *so)
+{
+	sotpi_info_t *sti = SOTOTPI(so);
+	mblk_t *mp;
+	mblk_t *nextmp;
+
+	ASSERT(sti->sti_discon_ind_mp == NULL);
+
+	/* Free every T_CONN_IND still on the b_next list. */
+	if (sti->sti_conn_ind_head != NULL) {
+		for (mp = sti->sti_conn_ind_head; mp != NULL; mp = nextmp) {
+			nextmp = mp->b_next;
+			mp->b_next = NULL;
+			freemsg(mp);
+		}
+		sti->sti_conn_ind_head = NULL;
+		sti->sti_conn_ind_tail = NULL;
+	}
+
+	/*
+	 * Release the cached address buffer under so_lock so that
+	 * sockfs_snapshot() can safely indirect the address pointers.
+	 * It also uses so_count as a validity test.
+	 */
+	mutex_enter(&so->so_lock);
+
+	if (sti->sti_laddr_sa != NULL) {
+		/* Both addresses live in a single allocation. */
+		ASSERT((caddr_t)sti->sti_faddr_sa ==
+		    (caddr_t)sti->sti_laddr_sa + sti->sti_laddr_maxlen);
+		ASSERT(sti->sti_faddr_maxlen == sti->sti_laddr_maxlen);
+		sti->sti_laddr_valid = 0;
+		sti->sti_faddr_valid = 0;
+		kmem_free(sti->sti_laddr_sa, sti->sti_laddr_maxlen * 2);
+		sti->sti_laddr_sa = NULL;
+		sti->sti_laddr_len = sti->sti_laddr_maxlen = 0;
+		sti->sti_faddr_sa = NULL;
+		sti->sti_faddr_len = sti->sti_faddr_maxlen = 0;
+	}
+
+	mutex_exit(&so->so_lock);
+
+	/* Delayed error message, if any. */
+	if (sti->sti_eaddr_mp != NULL) {
+		freemsg(sti->sti_eaddr_mp);
+		sti->sti_eaddr_mp = NULL;
+		sti->sti_delayed_error = 0;
+	}
+
+	/* Pending TPI ack, if any. */
+	if (sti->sti_ack_mp != NULL) {
+		freemsg(sti->sti_ack_mp);
+		sti->sti_ack_mp = NULL;
+	}
+
+	/* NL7C state: detach the message before freeing it. */
+	if (sti->sti_nl7c_rcv_mp != NULL) {
+		mp = sti->sti_nl7c_rcv_mp;
+		sti->sti_nl7c_rcv_mp = NULL;
+		freemsg(mp);
+	}
+	sti->sti_nl7c_rcv_rval = 0;
+	if (sti->sti_nl7c_uri != NULL) {
+		nl7c_urifree(so);
+		/* urifree() cleared nl7c_uri */
+	}
+	if (sti->sti_nl7c_flags != 0)
+		sti->sti_nl7c_flags = 0;
+
+	ASSERT(sti->sti_ux_bound_vp == NULL);
+
+	/* Preallocated T_UNBIND_REQ, if any. */
+	if (sti->sti_unbind_mp != NULL) {
+		freemsg(sti->sti_unbind_mp);
+		sti->sti_unbind_mp = NULL;
+	}
+}
+
+/*
+ * Destroys the TPI information attached to a sonode.
+ */
+static void
+sotpi_info_destroy(struct sonode *so)
+{
+	sotpi_info_t *infop = SOTOTPI(so);
+
+	/* Destruct first, then release the memory. */
+	i_sotpi_info_destructor(infop);
+	kmem_free(infop, sizeof (sotpi_info_t));
+
+	/* The sonode no longer has TPI information attached. */
+	so->so_priv = NULL;
+}
+
+/*
+ * Create the global sotpi socket module entry. It will never be freed.
+ *
+ * Returns the newly allocated entry.  Both allocations use KM_SLEEP.
+ */
+smod_info_t *
+sotpi_smod_create(void)
+{
+	smod_info_t *smodp;
+
+	smodp = kmem_zalloc(sizeof (*smodp), KM_SLEEP);
+	/*
+	 * The name buffer needs room for the terminating NUL that
+	 * strcpy() writes, hence strlen() + 1.
+	 */
+	smodp->smod_name = kmem_zalloc(strlen(SOTPI_SMOD_NAME) + 1,
+	    KM_SLEEP);
+	(void) strcpy(smodp->smod_name, SOTPI_SMOD_NAME);
+	/*
+	 * Initialize the refcnt to 1 so it will never be freed.
+	 */
+	smodp->smod_refcnt = 1;
+	smodp->smod_uc_version = SOCK_UC_VERSION;
+	smodp->smod_dc_version = SOCK_DC_VERSION;
+	smodp->smod_sock_create_func = &sotpi_create;
+	smodp->smod_sock_destroy_func = &sotpi_destroy;
+	return (smodp);
+}
diff --git a/usr/src/uts/common/fs/sockfs/socktpi.h b/usr/src/uts/common/fs/sockfs/socktpi.h
new file mode 100644
index 0000000000..4c1a5de268
--- /dev/null
+++ b/usr/src/uts/common/fs/sockfs/socktpi.h
@@ -0,0 +1,282 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SOCKFS_SOCKTPI_H
+#define _SOCKFS_SOCKTPI_H
+
+#include <inet/kssl/ksslapi.h>
+#include <sys/sodirect.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Internal representation used for addresses.
+ */
+struct soaddr {
+ struct sockaddr *soa_sa; /* Actual address */
+ t_uscalar_t soa_len; /* Length in bytes of the address held */
+ t_uscalar_t soa_maxlen; /* Allocated length */
+};
+/* Maximum size address for transports that have ADDR_size == 1 */
+#define SOA_DEFSIZE 128
+
+struct sonode;
+
+/*
+ * TPI Sockets
+ * ======================
+ *
+ * A TPI socket can be created by the TPI socket module, or as a
+ * result of fallback. In either case, the TPI related information is
+ * stored in a sotpi_info_t. Sockets that are TPI based from the
+ * beginning will use a sotpi_sonode_t, but in the fallback case the
+ * sotpi_info_t will be allocated when needed. However, the so_priv
+ * field in the sonode will always point to the sotpi_info_t, and the
+ * structure should only be accessed via so_priv. Use SOTOTPI().
+ *
+ * A TPI socket always corresponds to a VCHR stream representing the
+ * transport provider (e.g. /dev/tcp). This information is retrieved
+ * from the kernel socket configuration table and accessible via
+ * so_sockparams->sp_sdev_info. sockfs uses this to perform
+ * VOP_ACCESS checks before allowing an open of the transport
+ * provider.
+ *
+ * AF_UNIX Sockets
+ * -------------------------
+ *
+ * When an AF_UNIX socket is bound to a pathname the sockfs creates a
+ * VSOCK vnode in the underlying file system. However, the vnodeops
+ * etc in this VNODE remain those of the underlying file system.
+ * Sockfs uses the v_stream pointer in the underlying file system
+ * VSOCK node to find the sonode bound to the pathname. The bound
+ * pathname vnode is accessed through sti_ux_bound_vp.
+ *
+ * Out of Band Data Handling
+ * -------------------------
+ *
+ * The counts (sti_oobcnt and sti_oobsigcnt) track the number of
+ * urgent indications that are (logically) queued on the stream head
+ * read queue. The urgent data is queued on the stream head
+ * as follows.
+ *
+ * In the normal case the SIGURG is not generated until
+ * the T_EXDATA_IND arrives at the stream head. However, transports
+ * that have an early indication that urgent data is pending
+ * (e.g. TCP receiving a "new" urgent pointer value) can send up
+ * an M_PCPROTO/SIGURG message to generate the signal early.
+ *
+ * The mark is indicated by either:
+ * - a T_EXDATA_IND (with no M_DATA b_cont) with MSGMARK set.
+ * When this message is consumed by sorecvmsg the socket layer
+ * sets SS_RCVATMARK until data has been consumed past the mark.
+ * - a message with MSGMARKNEXT set (indicating that the
+ * first byte of the next message constitutes the mark). When
+ * the last byte of the MSGMARKNEXT message is consumed in
+ * the stream head the stream head sets STRATMARK. This flag
+ * is cleared when at least one byte is read. (Note that
+ * the MSGMARKNEXT messages can be of zero length when there
+ * is no previous data to which the marknext can be attached.)
+ *
+ * While the T_EXDATA_IND method is the common case which is used
+ * with all TPI transports, the MSGMARKNEXT method is needed to
+ * indicate the mark when e.g. the TCP urgent byte has not been
+ * received yet but the TCP urgent pointer has made TCP generate
+ * the M_PCSIG/SIGURG.
+ *
+ * The signal (the M_PCSIG carrying the SIGURG) and the mark
+ * indication can not be delivered as a single message, since
+ * the signal should be delivered as high priority and any mark
+ * indication must flow with the data. This implies that immediately
+ * when the SIGURG has been delivered if the stream head queue is
+ * empty it is impossible to determine if this will be the position
+ * of the mark. This race condition is resolved by using MSGNOTMARKNEXT
+ * messages and the STRNOTATMARK flag in the stream head. The
+ * SIOCATMARK code calls the stream head to wait for either a
+ * non-empty queue or one of the STR*ATMARK flags being set.
+ * This implies that any transport that is sending M_PCSIG(SIGURG)
+ * should send the appropriate MSGNOTMARKNEXT message (which can be
+ * zero length) after sending an M_PCSIG to prevent SIOCATMARK
+ * from sleeping unnecessarily.
+ */
+
+#define SOTPI_INFO_MAGIC 0x12345678
+
+/*
+ * Information used by TPI/STREAMS sockets
+ */
+typedef struct sotpi_info {
+ /*
+ * These fields are initialized once.
+ */
+ uint32_t sti_magic; /* always set to SOTPI_INFO_MAGIC */
+ dev_t sti_dev; /* device the sonode represents */
+
+ struct sockparams *sti_orig_sp; /* in case of fallback; the orig sp */
+
+ kmutex_t sti_plumb_lock; /* serializes plumbs, and the related */
+ /* sti_pushcnt */
+ short sti_pushcnt; /* Number of modules above "sockmod" */
+
+ kcondvar_t sti_ack_cv; /* wait for TPI acks */
+
+ uint8_t
+ sti_laddr_valid : 1, /* sti_laddr valid for user */
+ sti_faddr_valid : 1, /* sti_faddr valid for user */
+ sti_faddr_noxlate : 1, /* No xlation of faddr for AF_UNIX */
+
+ sti_direct : 1, /* transport is directly below */
+
+ sti_pad_to_bit7 : 4; /* unused; fills out the byte */
+
+ mblk_t *sti_ack_mp; /* TPI ack received from below */
+ mblk_t *sti_unbind_mp; /* Preallocated T_UNBIND_REQ message */
+
+ time_t sti_atime; /* time of last access */
+ time_t sti_mtime; /* time of last modification */
+ time_t sti_ctime; /* time of last attributes change */
+
+ ushort_t sti_delayed_error; /* From T_uderror_ind */
+ mblk_t *sti_eaddr_mp; /* for sti_delayed_error */
+ /* put here for delayed processing */
+
+ mblk_t *sti_conn_ind_head; /* b_next list of T_CONN_IND */
+ mblk_t *sti_conn_ind_tail;
+
+ uint_t sti_oobsigcnt; /* Number of SIGURG generated */
+ uint_t sti_oobcnt; /* Number of T_EXDATA_IND queued */
+
+ /* From T_info_ack */
+ t_uscalar_t sti_tsdu_size;
+ t_uscalar_t sti_etsdu_size;
+ t_scalar_t sti_addr_size;
+ t_uscalar_t sti_opt_size;
+ t_uscalar_t sti_tidu_size;
+ t_scalar_t sti_serv_type;
+
+ /* From T_capability_ack */
+ t_uscalar_t sti_acceptor_id;
+
+ /* Internal provider information */
+ struct tpi_provinfo *sti_provinfo;
+
+ /*
+ * The local and remote addresses have multiple purposes
+ * but one of the key reasons for their existence and careful
+ * tracking in sockfs is to support getsockname and getpeername
+ * when the transport does not handle the TI_GET*NAME ioctls
+ * and caching when it does (signalled by the sti_laddr_valid
+ * and sti_faddr_valid bits above).
+ * When all transports support the new TPI (with T_ADDR_REQ)
+ * we can revisit this code.
+ *
+ * The other usage of sti_faddr is to keep the "connected to"
+ * address for datagram sockets.
+ *
+ * Finally, for AF_UNIX both local and remote addresses are used
+ * to record the sockaddr_un since we use a separate namespace
+ * in the loopback transport.
+ */
+ struct soaddr sti_laddr; /* Local address */
+ struct soaddr sti_faddr; /* Peer address */
+#define sti_laddr_sa sti_laddr.soa_sa
+#define sti_faddr_sa sti_faddr.soa_sa
+#define sti_laddr_len sti_laddr.soa_len
+#define sti_faddr_len sti_faddr.soa_len
+#define sti_laddr_maxlen sti_laddr.soa_maxlen
+#define sti_faddr_maxlen sti_faddr.soa_maxlen
+
+ /*
+ * For AF_UNIX sockets:
+ *
+ * sti_ux_laddr/faddr records the internal addresses used with the
+ * transport. sti_ux_bound_vp and v_stream->sd_vnode form the
+ * cross-linkage between the underlying fs vnode corresponding
+ * to the bound sockaddr_un and the socket node.
+ */
+ struct so_ux_addr sti_ux_laddr; /* laddr bound with the transport */
+ struct so_ux_addr sti_ux_faddr; /* temporary peer address */
+ struct vnode *sti_ux_bound_vp; /* bound AF_UNIX file system vnode */
+ struct sonode *sti_next_so; /* next sonode on socklist */
+ struct sonode *sti_prev_so; /* previous sonode on socklist */
+ mblk_t *sti_discon_ind_mp; /* T_DISCON_IND received from below */
+
+ /*
+ * For NL7C sockets:
+ *
+ * sti_nl7c_flags the NL7C state of URL processing.
+ *
+ * sti_nl7c_rcv_mp mblk_t chain of already received data to be
+ * passed up to the app after NL7C gives up on
+ * a socket.
+ *
+ * sti_nl7c_rcv_rval returned rval for last mblk_t from above.
+ *
+ * sti_nl7c_uri the URI currently being processed.
+ *
+ * sti_nl7c_rtime URI request gethrestime_sec().
+ *
+ * sti_nl7c_addr pointer returned by nl7c_addr_lookup().
+ */
+ uint64_t sti_nl7c_flags;
+ mblk_t *sti_nl7c_rcv_mp;
+ int64_t sti_nl7c_rcv_rval;
+ void *sti_nl7c_uri;
+ time_t sti_nl7c_rtime;
+ void *sti_nl7c_addr;
+
+ /* For sockets acting as an in-kernel SSL proxy */
+ kssl_endpt_type_t sti_kssl_type; /* is proxy/is proxied/none */
+ kssl_ent_t sti_kssl_ent; /* SSL config entry */
+ kssl_ctx_t sti_kssl_ctx; /* SSL session context */
+} sotpi_info_t;
+
+struct T_capability_ack;
+
+extern sonodeops_t sotpi_sonodeops;
+
+extern int socktpi_init(void);
+extern queue_t *sotpi_convert_sonode(struct sonode *, struct sockparams *,
+ boolean_t *, struct cred *);
+extern void sotpi_update_state(struct sonode *, struct T_capability_ack *,
+ struct sockaddr *, socklen_t, struct sockaddr *, socklen_t,
+ short);
+
+extern sotpi_info_t *sotpi_sototpi(struct sonode *);
+#ifdef DEBUG
+#define SOTOTPI(so) (sotpi_sototpi(so))
+#else
+#define SOTOTPI(so) ((sotpi_info_t *)(so)->so_priv)
+#endif
+
+/* for consumers outside sockfs */
+#define _SOTOTPI(so) ((sotpi_info_t *)(so)->so_priv)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SOCKFS_SOCKTPI_H */
diff --git a/usr/src/uts/common/fs/sockfs/socktpi_impl.h b/usr/src/uts/common/fs/sockfs/socktpi_impl.h
new file mode 100644
index 0000000000..aa0b04bf1c
--- /dev/null
+++ b/usr/src/uts/common/fs/sockfs/socktpi_impl.h
@@ -0,0 +1,99 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SOCKFS_SOCKTPI_IMPL_H
+#define _SOCKFS_SOCKTPI_IMPL_H
+
+#include <sys/socketvar.h>
+#include <fs/sockfs/socktpi.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * so_priv will always be set to &st_info
+ */
+typedef struct sotpi_sonode {
+ /* Kept first: the cache buffer is also cast to a struct sonode *. */
+ struct sonode st_sonode;
+ /* Embedded TPI information; st_sonode.so_priv points here. */
+ struct sotpi_info st_info;
+} sotpi_sonode_t;
+
+extern void so_proc_tcapability_ack(struct sonode *,
+ struct T_capability_ack *);
+extern void so_basic_strinit(struct sonode *);
+extern void so_alloc_addr(struct sonode *, t_uscalar_t);
+extern int so_set_events(struct sonode *, vnode_t *, cred_t *);
+extern int so_sock2stream(struct sonode *);
+extern void so_stream2sock(struct sonode *);
+
+extern int so_strinit(struct sonode *, struct sonode *);
+extern void so_update_attrs(struct sonode *, int);
+extern int sogetrderr(vnode_t *, int, int *);
+extern int sogetwrerr(vnode_t *, int, int *);
+extern int so_addr_verify(struct sonode *, const struct sockaddr *,
+ socklen_t);
+extern int so_ux_addr_xlate(struct sonode *, struct sockaddr *,
+ socklen_t, int, void **, socklen_t *);
+extern void so_unix_close(struct sonode *);
+
+extern int sowaitprim(struct sonode *, t_scalar_t, t_scalar_t,
+ t_uscalar_t, mblk_t **, clock_t);
+extern int sowaitokack(struct sonode *, t_scalar_t);
+extern int sowaitack(struct sonode *, mblk_t **, clock_t);
+extern void soqueueack(struct sonode *, mblk_t *);
+extern int sowaitconnind(struct sonode *, int, mblk_t **);
+extern void soqueueconnind(struct sonode *, mblk_t *);
+extern int soflushconnind(struct sonode *, t_scalar_t);
+extern void so_drain_discon_ind(struct sonode *);
+extern void so_flush_discon_ind(struct sonode *);
+
+extern mblk_t *soallocproto(size_t, int);
+extern mblk_t *soallocproto1(const void *, ssize_t, ssize_t, int);
+extern void soappendmsg(mblk_t *, const void *, ssize_t);
+extern mblk_t *soallocproto2(const void *, ssize_t, const void *, ssize_t,
+ ssize_t, int);
+extern mblk_t *soallocproto3(const void *, ssize_t, const void *, ssize_t,
+ const void *, ssize_t, ssize_t, int);
+
+extern int so_set_asyncsigs(vnode_t *, pid_t, int, int, cred_t *);
+extern int so_flip_async(struct sonode *, vnode_t *, int, cred_t *);
+extern int so_set_siggrp(struct sonode *, vnode_t *, pid_t, int, cred_t *);
+
+extern void so_installhooks(struct sonode *);
+
+extern int kstrwritemp(struct vnode *, mblk_t *, ushort_t);
+extern int sostream_direct(struct sonode *, struct uio *,
+ mblk_t *, cred_t *);
+extern int sosend_dgram(struct sonode *, struct sockaddr *,
+ socklen_t, struct uio *, int);
+extern int sosend_svc(struct sonode *, struct uio *, t_scalar_t, int, int);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SOCKFS_SOCKTPI_IMPL_H */
diff --git a/usr/src/uts/common/fs/sockfs/sockvnops.c b/usr/src/uts/common/fs/sockfs/sockvnops.c
deleted file mode 100644
index e9195c5e11..0000000000
--- a/usr/src/uts/common/fs/sockfs/sockvnops.c
+++ /dev/null
@@ -1,1438 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-
-/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
- */
-
-#pragma ident "%Z%%M% %I% %E% SMI"
-
-#include <sys/types.h>
-#include <sys/thread.h>
-#include <sys/t_lock.h>
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/bitmap.h>
-#include <sys/buf.h>
-#include <sys/cmn_err.h>
-#include <sys/conf.h>
-#include <sys/debug.h>
-#include <sys/errno.h>
-#include <sys/time.h>
-#include <sys/fcntl.h>
-#include <sys/flock.h>
-#include <sys/file.h>
-#include <sys/kmem.h>
-#include <sys/mman.h>
-#include <sys/open.h>
-#include <sys/swap.h>
-#include <sys/sysmacros.h>
-#include <sys/uio.h>
-#include <sys/vfs.h>
-#include <sys/vfs_opreg.h>
-#include <sys/vnode.h>
-#include <sys/poll.h>
-#include <sys/stropts.h>
-#include <sys/stream.h>
-#include <sys/strsubr.h>
-#include <sys/strsun.h>
-#include <sys/suntpi.h>
-#include <sys/ioctl.h>
-#include <sys/sockio.h>
-#include <sys/filio.h>
-#include <sys/stat.h>
-#include <sys/proc.h>
-#include <sys/user.h>
-#include <sys/session.h>
-#include <sys/vmsystm.h>
-#include <sys/vtrace.h>
-#include <sys/policy.h>
-
-#include <sys/socket.h>
-#include <sys/socketvar.h>
-#include <netinet/in.h>
-#include <sys/un.h>
-
-#define _SUN_TPI_VERSION 2
-#include <sys/tihdr.h>
-
-#include <vm/seg.h>
-#include <vm/seg_map.h>
-#include <vm/page.h>
-#include <vm/pvn.h>
-#include <vm/seg_dev.h>
-#include <vm/seg_vn.h>
-
-#include <fs/fs_subr.h>
-
-#include <sys/esunddi.h>
-#include <sys/autoconf.h>
-
-#include <fs/sockfs/nl7c.h>
-#include <fs/sockfs/nl7curi.h>
-
-#include <inet/udp_impl.h>
-#include <inet/tcp_impl.h>
-
-#include <inet/kssl/ksslapi.h>
-
-static int socktpi_close(struct vnode *, int, int, offset_t, struct cred *,
- caller_context_t *);
-static int socktpi_read(struct vnode *, struct uio *, int, struct cred *,
- caller_context_t *);
-static int socktpi_write(struct vnode *, struct uio *, int, struct cred *,
- caller_context_t *);
-static int socktpi_plumbioctl(struct vnode *, int, intptr_t, int, struct cred *,
- int32_t *);
-static void socktpi_inactive(struct vnode *, struct cred *, caller_context_t *);
-static int socktpi_poll(struct vnode *, short, int, short *,
- struct pollhead **, caller_context_t *);
-
-struct vnodeops *socktpi_vnodeops; /* NOTE(review): presumably built from the template below at init time -- confirm */
-
-const fs_operation_def_t socktpi_vnodeops_template[] = { /* VOPNAME_* / handler pairs */
- VOPNAME_OPEN, { .vop_open = socktpi_open },
- VOPNAME_CLOSE, { .vop_close = socktpi_close },
- VOPNAME_READ, { .vop_read = socktpi_read },
- VOPNAME_WRITE, { .vop_write = socktpi_write },
- VOPNAME_IOCTL, { .vop_ioctl = socktpi_ioctl },
- VOPNAME_SETFL, { .vop_setfl = socktpi_setfl },
- VOPNAME_GETATTR, { .vop_getattr = socktpi_getattr },
- VOPNAME_SETATTR, { .vop_setattr = socktpi_setattr },
- VOPNAME_ACCESS, { .vop_access = socktpi_access },
- VOPNAME_FSYNC, { .vop_fsync = socktpi_fsync }, /* always EINVAL */
- VOPNAME_INACTIVE, { .vop_inactive = socktpi_inactive },
- VOPNAME_FID, { .vop_fid = socktpi_fid }, /* always EINVAL */
- VOPNAME_SEEK, { .vop_seek = socktpi_seek }, /* always ESPIPE */
- VOPNAME_POLL, { .vop_poll = socktpi_poll },
- VOPNAME_DISPOSE, { .error = fs_error }, /* the one entry routed to fs_error */
- NULL, NULL /* end-of-table marker */
-};
-
-/*
- * Do direct function call to the transport layer below; this would
- * also allow the transport to utilize read-side synchronous stream
- * interface if necessary. This is a /etc/system tunable that must
- * not be modified on a running system. By default this is enabled
- * for performance reasons and may be disabled for debugging purposes.
- *
- * socktpi_open() tests this flag for SS_DIRECT sockets: when it is
- * B_FALSE the open clears SS_DIRECT and, unless SO_ACCEPTOR is set in
- * the open flags, issues a _SIOCSOCKFALLBACK ioctl down the stream.
- */
-boolean_t socktpi_direct = B_TRUE;
-
-/*
- * Open routine used by socket() call. Note that vn_open checks for
- * VSOCK and fails the open (and VOP_OPEN is fs_nosys). The VSOCK check is
- * needed since VSOCK type vnodes exist in various underlying filesystems as
- * a result of an AF_UNIX bind to a pathname.
- *
- * Sockets assume that the driver will clone (either itself
- * or by using the clone driver) i.e. a socket() call will always
- * result in a new vnode being created. This routine single-threads
- * open/closes for a given vnode which is probably not needed.
- *
- * The open reference count (so_count) is incremented up front, with the
- * zone id recorded on the first open, and SOLOCKED is held across
- * stropen(). On success so_dev and v_rdev are set to the device
- * stropen() returned, so_provinfo is filled in by tpi_findprov(), and
- * for SS_DIRECT sockets the direct-call state is validated.
- *
- * Returns 0 on success. ENOTTY is returned if the stream head is marked
- * STRISTTY; a failing _SIOCSOCKFALLBACK or stropen() error is returned
- * as is. Paths that fail after a stream head exists undo the open via
- * socktpi_close(); otherwise so_count is simply decremented again.
- */
-int
-socktpi_open(struct vnode **vpp, int flag, struct cred *cr,
- caller_context_t *ct)
-{
- major_t maj;
- dev_t newdev;
- struct vnode *vp = *vpp;
- struct sonode *so;
- int error = 0;
- struct stdata *stp;
-
- dprint(1, ("socktpi_open()\n"));
- flag &= ~FCREAT; /* paranoia */
-
- so = VTOSO(vp);
-
- mutex_enter(&so->so_lock);
- so->so_count++; /* one more open reference */
- ASSERT(so->so_count != 0); /* wraparound */
- if (so->so_count == 1)
- so->so_zoneid = getzoneid();
- mutex_exit(&so->so_lock);
-
- ASSERT(vp->v_type == VSOCK);
-
- newdev = vp->v_rdev;
- maj = getmajor(newdev);
- ASSERT(STREAMSTAB(maj));
-
- mutex_enter(&so->so_lock);
- so_lock_single(so); /* Set SOLOCKED */
- mutex_exit(&so->so_lock);
-
- error = stropen(vp, &newdev, flag, cr);
-
- stp = vp->v_stream; /* may be NULL when stropen() failed; checked in the else branch */
- if (error == 0) {
- if (so->so_flag & SOCLONE)
- ASSERT(newdev != vp->v_rdev);
- mutex_enter(&so->so_lock);
- so->so_dev = newdev;
- vp->v_rdev = newdev;
- so_unlock_single(so, SOLOCKED);
- mutex_exit(&so->so_lock);
-
- if (stp->sd_flag & STRISTTY) {
- /*
- * this is a post SVR4 tty driver - a socket can not
- * be a controlling terminal. Fail the open.
- */
- (void) socktpi_close(vp, flag, 1, (offset_t)0, cr, ct);
- return (ENOTTY); /* XXX */
- }
-
- ASSERT(stp->sd_wrq != NULL);
- so->so_provinfo = tpi_findprov(stp->sd_wrq);
-
- /*
- * If caller is interested in doing direct function call
- * interface to/from transport module, probe the module
- * directly beneath the streamhead to see if it qualifies.
- *
- * We turn off the direct interface when qualifications fail.
- * In the acceptor case, we simply turn off the SS_DIRECT
- * flag on the socket. We do the fallback after the accept
- * has completed, before the new socket is returned to the
- * application.
- */
- if (so->so_state & SS_DIRECT) {
- queue_t *tq = stp->sd_wrq->q_next;
-
- /*
- * SS_DIRECT is currently supported and tested
- * only for tcp/udp; this is the main reason to
- * have the following assertions.
- */
- ASSERT(so->so_family == AF_INET ||
- so->so_family == AF_INET6);
- ASSERT(so->so_protocol == IPPROTO_UDP ||
- so->so_protocol == IPPROTO_TCP ||
- so->so_protocol == IPPROTO_IP);
- ASSERT(so->so_type == SOCK_DGRAM ||
- so->so_type == SOCK_STREAM);
-
- /*
- * Abort direct call interface if the module directly
- * underneath the stream head is not defined with the
- * _D_DIRECT flag. This could happen in the tcp or
- * udp case, when some other module is autopushed
- * above it, or for some reasons the expected module
- * isn't purely D_MP (which is the main requirement).
- *
- * Else, SS_DIRECT is valid. If the read-side Q has
- * _QSODIRECT set and uioasync is enabled then
- * set SS_SODIRECT to enable sodirect.
- */
- if (!socktpi_direct || !(tq->q_flag & _QDIRECT) ||
- !(_OTHERQ(tq)->q_flag & _QDIRECT)) {
- int rval;
-
- /* Continue on without direct calls */
- so->so_state &= ~SS_DIRECT;
- if (!(flag & SO_ACCEPTOR)) { /* acceptors fall back later, see above */
- if ((error = strioctl(vp,
- _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
- CRED(), &rval)) != 0) {
- (void) socktpi_close(vp, flag,
- 1, (offset_t)0, cr, ct);
- return (error);
- }
- }
- } else if ((_OTHERQ(tq)->q_flag & _QSODIRECT) &&
- uioasync.enabled) {
- /* Enable sodirect */
- so->so_state |= SS_SODIRECT;
- }
- }
- } else {
- /*
- * While the same socket can not be reopened (unlike specfs)
- * the stream head sets STREOPENFAIL when the autopush fails.
- */
- if ((stp != NULL) &&
- (stp->sd_flag & STREOPENFAIL)) {
- /*
- * Open failed part way through.
- */
- mutex_enter(&stp->sd_lock);
- stp->sd_flag &= ~STREOPENFAIL;
- mutex_exit(&stp->sd_lock);
-
- mutex_enter(&so->so_lock);
- so_unlock_single(so, SOLOCKED);
- mutex_exit(&so->so_lock);
- (void) socktpi_close(vp, flag, 1,
- (offset_t)0, cr, ct);
- return (error);
- /*NOTREACHED*/
- }
- ASSERT(stp == NULL); /* failed stropen() left no stream head behind */
- mutex_enter(&so->so_lock);
- so_unlock_single(so, SOLOCKED);
- ASSERT(so->so_count > 0);
- so->so_count--; /* one less open reference */
- mutex_exit(&so->so_lock);
- }
- TRACE_4(TR_FAC_SOCKFS, TR_SOCKFS_OPEN,
- "sockfs open:maj %d vp %p so %p error %d", maj,
- vp, so, error);
- return (error);
-}
-
-/*ARGSUSED2*/
-static int /* 0, or the error returned by strclose() */
-socktpi_close( /* vop_close handler: drops one open reference on the socket */
- struct vnode *vp,
- int flag, /* passed through to strclose() */
- int count, /* > 1: return 0 right after cleanlocks/cleanshares/strclean */
- offset_t offset,
- struct cred *cr,
- caller_context_t *ct)
-{
- struct sonode *so;
- dev_t dev;
- int error = 0;
-
- so = VTOSO(vp);
-
- dprintso(so, 1, ("socktpi_close(%p, %x, %d) %s\n",
- (void *)vp, flag, count, pr_state(so->so_state, so->so_mode)));
-
- cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
- cleanshares(vp, ttoproc(curthread)->p_pid);
- if (vp->v_stream)
- strclean(vp);
- if (count > 1)
- return (0);
-
- dev = so->so_dev; /* saved for the ddi_rele_driver() call at the end */
-
- ASSERT(vp->v_type == VSOCK);
- ASSERT(STREAMSTAB(getmajor(dev)));
-
- mutex_enter(&so->so_lock);
- so_lock_single(so); /* Set SOLOCKED */
- ASSERT(so->so_count > 0);
- so->so_count--; /* one fewer open reference */
-
- /*
- * Only call NL7C's close on last open reference.
- */
- if (so->so_count == 0 && (so->so_nl7c_flags & NL7C_ENABLED)) {
- so->so_nl7c_flags = 0;
- nl7c_close(so);
- }
-
- /*
- * Only call the close routine when the last open reference through
- * any [s, v]node goes away.
- */
- if (so->so_count == 0 && vp->v_stream != NULL) {
- vnode_t *ux_vp;
-
- if (so->so_family == AF_UNIX) {
- /* Could avoid this when CANTSENDMORE for !dgram */
- so_unix_close(so);
- }
-
- mutex_exit(&so->so_lock); /* so_lock dropped across strclose(); SOLOCKED stays set */
- /*
- * Disassemble the linkage from the AF_UNIX underlying file
- * system vnode to this socket (by atomically clearing
- * v_stream in vn_rele_stream) before strclose clears sd_vnode
- * and frees the stream head.
- */
- if ((ux_vp = so->so_ux_bound_vp) != NULL) {
- ASSERT(ux_vp->v_stream);
- so->so_ux_bound_vp = NULL;
- vn_rele_stream(ux_vp);
- }
- if (so->so_family == AF_INET || so->so_family == AF_INET6) {
- strsetrwputdatahooks(SOTOV(so), NULL, NULL);
- if (so->so_kssl_ent != NULL) {
- kssl_release_ent(so->so_kssl_ent, so,
- so->so_kssl_type);
- so->so_kssl_ent = NULL;
- }
- if (so->so_kssl_ctx != NULL) {
- kssl_release_ctx(so->so_kssl_ctx);
- so->so_kssl_ctx = NULL;
- }
- so->so_kssl_type = KSSL_NO_PROXY;
- }
- error = strclose(vp, flag, cr);
- vp->v_stream = NULL;
- mutex_enter(&so->so_lock); /* re-taken for the common exit path below */
- }
-
- /*
- * Flush the T_DISCON_IND on so_discon_ind_mp.
- */
- if (so->so_count == 0)
- so_flush_discon_ind(so);
-
- so_unlock_single(so, SOLOCKED);
- mutex_exit(&so->so_lock);
-
- /*
- * Decrement the device driver's reference count for streams
- * opened via the clone dip. The driver was held in clone_open().
- * The absence of clone_close() forces this asymmetry.
- */
- if (so->so_flag & SOCLONE)
- ddi_rele_driver(getmajor(dev));
-
- return (error);
-}
-
-/*
- * vop_read handler for sockets.
- *
- * Marks the socket as accessed (SOACC), sets UIO_COPY_CACHED on the uio
- * and then either reads straight from the stream head (when the socket
- * is in SOV_STREAM mode) or receives through sotpi_recvmsg() with an
- * nmsghdr that carries no name, no control data and no flags.
- *
- * Returns whatever strread() or sotpi_recvmsg() returns.
- */
-/*ARGSUSED2*/
-static int
-socktpi_read(struct vnode *vp, struct uio *uiop, int ioflag, struct cred *cr,
-    caller_context_t *ct)
-{
-	struct sonode *so = VTOSO(vp);
-
-	dprintso(so, 1, ("socktpi_read(%p) %s\n",
-	    (void *)so, pr_state(so->so_state, so->so_mode)));
-
-	ASSERT(vp->v_type == VSOCK);
-	so_update_attrs(so, SOACC);
-
-	uiop->uio_extflg |= UIO_COPY_CACHED;
-
-	if (so->so_version != SOV_STREAM) {
-		struct nmsghdr msg;
-
-		/* Socket semantics: an otherwise empty message header. */
-		msg.msg_namelen = 0;
-		msg.msg_controllen = 0;
-		msg.msg_flags = 0;
-		return (sotpi_recvmsg(so, &msg, uiop));
-	}
-
-	/* The imaginary "sockmod" has been popped - act as a stream */
-	return (strread(vp, uiop, cr));
-}
-
-/* ARGSUSED2 */
-static int /* 0 or an errno value (EPIPE, ENOTCONN, EDESTADDRREQ, or from the send path) */
-socktpi_write( /* vop_write handler for sockets */
- struct vnode *vp,
- struct uio *uiop,
- int ioflag,
- struct cred *cr,
- caller_context_t *ct)
-{
- struct sonode *so = VTOSO(vp);
- int so_state;
- int so_mode;
- int error;
-
- dprintso(so, 1, ("socktpi_write(%p) %s\n",
- (void *)so, pr_state(so->so_state, so->so_mode)));
-
- ASSERT(vp->v_type == VSOCK);
-
- if (so->so_family == AF_UNIX)
- uiop->uio_extflg |= UIO_COPY_CACHED;
- else
- uiop->uio_extflg &= ~UIO_COPY_CACHED;
- if (so->so_version == SOV_STREAM) {
- /* The imaginary "sockmod" has been popped - act as a stream */
- so_update_attrs(so, SOMOD);
- return (strwrite(vp, uiop, cr));
- }
- /* State checks */
- so_state = so->so_state; /* snapshot taken without so_lock */
- so_mode = so->so_mode;
- if (so_state & SS_CANTSENDMORE) {
- tsignal(curthread, SIGPIPE);
- return (EPIPE);
- }
-
- if (so->so_error != 0) { /* unlocked peek; sogeterr() is called under so_lock */
- mutex_enter(&so->so_lock);
- error = sogeterr(so);
- if (error != 0) {
- mutex_exit(&so->so_lock);
- return (error);
- }
- mutex_exit(&so->so_lock);
- }
-
- if ((so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
- (SS_ISCONNECTED|SS_ISBOUND)) {
- if (so_mode & SM_CONNREQUIRED)
- return (ENOTCONN);
- else
- return (EDESTADDRREQ);
- }
-
- if (!(so_mode & SM_CONNREQUIRED)) {
- /*
- * Note that this code does not prevent so_faddr_sa
- * from changing while it is being used. Thus
- * if an "unconnect"+connect occurs concurrently with
- * this write the datagram might be delivered to a
- * garbled address.
- */
- so_update_attrs(so, SOMOD);
- return (sosend_dgram(so, so->so_faddr_sa,
- (t_uscalar_t)so->so_faddr_len, uiop, 0));
- }
- so_update_attrs(so, SOMOD);
-
- if (so_mode & SM_BYTESTREAM) {
- /* Send M_DATA messages */
- if ((so->so_nl7c_flags & NL7C_ENABLED) &&
- (error = nl7c_data(so, uiop)) >= 0) { /* negative result: fall through to the normal send */
- /* NL7C consumed the data */
- return (error);
- }
- if ((so_state & SS_DIRECT) &&
- canputnext(vp->v_stream->sd_wrq)) {
- return (sostream_direct(so, uiop, NULL, cr));
- }
- return (strwrite(vp, uiop, cr));
- } else {
- /* Send T_DATA_REQ messages without MORE_flag set */
- return (sosend_svc(so, uiop, T_DATA_REQ, 0, 0));
- }
-}
-
-/*
- * Copy size bytes from "from" to "to". When fromkernel is non-zero the
- * copy is a plain bcopy() and 0 is returned; otherwise the copy is done
- * with xcopyin() and its return value is passed back.
- */
-int
-so_copyin(const void *from, void *to, size_t size, int fromkernel)
-{
-	if (!fromkernel)
-		return (xcopyin(from, to, size));
-
-	bcopy(from, to, size);
-	return (0);
-}
-
-/*
- * Copy size bytes from "from" to "to". When tokernel is non-zero the
- * copy is a plain bcopy() and 0 is returned; otherwise the copy is done
- * with xcopyout() and its return value is passed back.
- */
-int
-so_copyout(const void *from, void *to, size_t size, int tokernel)
-{
-	if (!tokernel)
-		return (xcopyout(from, to, size));
-
-	bcopy(from, to, size);
-	return (0);
-}
-
-/*ARGSUSED6*/
-int /* 0 or an errno value */
-socktpi_ioctl(struct vnode *vp, int cmd, intptr_t arg, int mode,
- struct cred *cr, int32_t *rvalp, caller_context_t *ct)
-{
- struct sonode *so = VTOSO(vp);
- int error = 0;
-
- ASSERT(vp->v_type == VSOCK);
- dprintso(so, 0, ("socktpi_ioctl: cmd 0x%x, arg 0x%lx, state %s\n",
- cmd, arg, pr_state(so->so_state, so->so_mode)));
-
- switch (cmd) { /* first pass: plumbing ioctls and the SOV_STREAM pass-through */
- case _I_INSERT:
- case _I_REMOVE:
- /*
- * Since there's no compelling reason to support these ioctls
- * on sockets, and doing so would increase the complexity
- * markedly, prevent it.
- */
- return (EOPNOTSUPP);
-
- case I_FIND:
- case I_LIST:
- case I_LOOK:
- case I_POP:
- case I_PUSH:
- /*
- * To prevent races and inconsistencies between the actual
- * state of the stream and the state according to the sonode,
- * we serialize all operations which modify or operate on the
- * list of modules on the socket's stream.
- */
- mutex_enter(&so->so_plumb_lock);
- error = socktpi_plumbioctl(vp, cmd, arg, mode, cr, rvalp);
- mutex_exit(&so->so_plumb_lock);
- return (error);
-
- default:
- if (so->so_version != SOV_STREAM)
- break;
-
- /*
- * The imaginary "sockmod" has been popped; act as a stream.
- */
- return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
- }
-
- ASSERT(so->so_version != SOV_STREAM);
-
- /*
- * Process socket-specific ioctls.
- */
- switch (cmd) {
- case FIONBIO: {
- int32_t value;
-
- if (so_copyin((void *)arg, &value, sizeof (int32_t),
- (mode & (int)FKIOCTL))) /* FKIOCTL set: so_copyin() uses bcopy() */
- return (EFAULT);
-
- mutex_enter(&so->so_lock);
- if (value) {
- so->so_state |= SS_NDELAY;
- } else {
- so->so_state &= ~SS_NDELAY;
- }
- mutex_exit(&so->so_lock);
- return (0);
- }
-
- case FIOASYNC: {
- int32_t value;
-
- if (so_copyin((void *)arg, &value, sizeof (int32_t),
- (mode & (int)FKIOCTL)))
- return (EFAULT);
-
- mutex_enter(&so->so_lock);
- /*
- * SS_ASYNC flag not already set correctly?
- * (!value != !(so->so_state & SS_ASYNC))
- * but some engineers find that too hard to read.
- */
- if (value == 0 && (so->so_state & SS_ASYNC) != 0 ||
- value != 0 && (so->so_state & SS_ASYNC) == 0)
- error = so_flip_async(so, vp, mode, cr);
- mutex_exit(&so->so_lock);
- return (error);
- }
-
- case SIOCSPGRP:
- case FIOSETOWN: {
- pid_t pgrp;
-
- if (so_copyin((void *)arg, &pgrp, sizeof (pid_t),
- (mode & (int)FKIOCTL)))
- return (EFAULT);
-
- mutex_enter(&so->so_lock);
- dprintso(so, 1, ("setown: new %d old %d\n", pgrp, so->so_pgrp));
- /* Any change? */
- if (pgrp != so->so_pgrp)
- error = so_set_siggrp(so, vp, pgrp, mode, cr);
- mutex_exit(&so->so_lock);
- return (error);
- }
- case SIOCGPGRP:
- case FIOGETOWN:
- if (so_copyout(&so->so_pgrp, (void *)arg, /* NOTE(review): so_pgrp is read without so_lock, unlike the setter above */
- sizeof (pid_t), (mode & (int)FKIOCTL)))
- return (EFAULT);
- return (0);
-
- case SIOCATMARK: {
- int retval;
- uint_t so_state;
-
- /*
- * strwaitmark has a finite timeout after which it
- * returns -1 if the mark state is undetermined.
- * In order to avoid any race between the mark state
- * in sockfs and the mark state in the stream head this
- * routine loops until the mark state can be determined
- * (or the urgent data indication has been removed by some
- * other thread).
- */
- do {
- mutex_enter(&so->so_lock);
- so_state = so->so_state;
- mutex_exit(&so->so_lock);
- if (so_state & SS_RCVATMARK) {
- retval = 1;
- } else if (!(so_state & SS_OOBPEND)) {
- /*
- * No SIGURG has been generated -- there is no
- * pending or present urgent data. Thus can't
- * possibly be at the mark.
- */
- retval = 0;
- } else {
- /*
- * Have the stream head wait until there is
- * either some messages on the read queue, or
- * STRATMARK or STRNOTATMARK gets set. The
- * STRNOTATMARK flag is used so that the
- * transport can send up a MSGNOTMARKNEXT
- * M_DATA to indicate that it is not
- * at the mark and additional data is not about
- * to be sent upstream.
- *
- * If the mark state is undetermined this will
- * return -1 and we will loop rechecking the
- * socket state.
- */
- retval = strwaitmark(vp);
- }
- } while (retval == -1);
-
- if (so_copyout(&retval, (void *)arg, sizeof (int),
- (mode & (int)FKIOCTL)))
- return (EFAULT);
- return (0);
- }
-
- case I_FDINSERT:
- case I_SENDFD:
- case I_RECVFD:
- case I_ATMARK:
- case _SIOCSOCKFALLBACK:
- /*
- * These ioctls do not apply to sockets. I_FDINSERT can be
- * used to send M_PROTO messages without modifying the socket
- * state. I_SENDFD/RECVFD should not be used for socket file
- * descriptor passing since they assume a twisted stream.
- * SIOCATMARK must be used instead of I_ATMARK.
- *
- * _SIOCSOCKFALLBACK from an application should never be
- * processed. It is only generated by socktpi_open() or
- * in response to I_POP or I_PUSH.
- */
-#ifdef DEBUG
- zcmn_err(getzoneid(), CE_WARN,
- "Unsupported STREAMS ioctl 0x%x on socket. "
- "Pid = %d\n", cmd, curproc->p_pid);
-#endif /* DEBUG */
- return (EOPNOTSUPP);
-
- case _I_GETPEERCRED:
- if ((mode & FKIOCTL) == 0) /* refused unless FKIOCTL is set in mode */
- return (EINVAL);
-
- mutex_enter(&so->so_lock);
- if ((so->so_mode & SM_CONNREQUIRED) == 0) {
- error = ENOTSUP;
- } else if ((so->so_state & SS_ISCONNECTED) == 0) {
- error = ENOTCONN;
- } else if (so->so_peercred != NULL) {
- k_peercred_t *kp = (k_peercred_t *)arg;
- kp->pc_cr = so->so_peercred;
- kp->pc_cpid = so->so_cpid;
- crhold(so->so_peercred); /* extra hold for the cred handed out -- presumably released by the caller; confirm */
- } else {
- error = EINVAL;
- }
- mutex_exit(&so->so_lock);
- return (error);
-
- default:
- /*
- * Do the higher-order bits of the ioctl cmd indicate
- * that it is an I_* streams ioctl?
- */
- if ((cmd & 0xffffff00U) == STR &&
- so->so_version == SOV_SOCKBSD) {
-#ifdef DEBUG
- zcmn_err(getzoneid(), CE_WARN,
- "Unsupported STREAMS ioctl 0x%x on socket. "
- "Pid = %d\n", cmd, curproc->p_pid);
-#endif /* DEBUG */
- return (EOPNOTSUPP);
- }
- return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
- }
-}
-
-/*
- * Handle plumbing-related ioctls (I_PUSH, I_POP, I_LIST, I_LOOK and
- * I_FIND) for socktpi_ioctl(). While the socket is in socket mode an
- * imaginary "sockmod" module is presented to the caller at position
- * so_pushcnt of the stream's module list.
- *
- * The caller must hold so_plumb_lock (asserted below).
- *
- * vp, cmd, arg, mode, cr and rvalp are the values socktpi_ioctl() was
- * called with; when FKIOCTL is set in mode, arg is copied with
- * bcopy()/copystr() instead of the user-copy routines.
- *
- * Returns 0 or an errno value. SOV_SOCKBSD sockets always get
- * EOPNOTSUPP. Any cmd other than the five listed above panics.
- */
-static int
-socktpi_plumbioctl(struct vnode *vp, int cmd, intptr_t arg, int mode,
-    struct cred *cr, int32_t *rvalp)
-{
-	static const char sockmod_name[] = "sockmod";
-	struct sonode *so = VTOSO(vp);
-	char mname[FMNAMESZ + 1];
-	int error;
-
-	ASSERT(MUTEX_HELD(&so->so_plumb_lock));
-
-	if (so->so_version == SOV_SOCKBSD)
-		return (EOPNOTSUPP);
-
-	if (so->so_version == SOV_STREAM) {
-		/*
-		 * The imaginary "sockmod" has been popped - act as a stream.
-		 * If this is a push of sockmod then change back to a socket.
-		 */
-		if (cmd == I_PUSH) {
-			error = ((mode & FKIOCTL) ? copystr : copyinstr)(
-			    (void *)arg, mname, sizeof (mname), NULL);
-
-			if (error == 0 && strcmp(mname, sockmod_name) == 0) {
-				dprintso(so, 0, ("socktpi_ioctl: going to "
-				    "socket version\n"));
-				so_stream2sock(so);
-				return (0);
-			}
-			/*
-			 * A failed name copy is not reported here; the
-			 * request is simply handed to strioctl() below.
-			 */
-		}
-		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
-	}
-
-	switch (cmd) {
-	case I_PUSH:
-		/*
-		 * A direct-call socket is switched back with
-		 * _SIOCSOCKFALLBACK before the push is attempted.
-		 */
-		if (so->so_state & SS_DIRECT) {
-			mutex_enter(&so->so_lock);
-			so_lock_single(so);
-			mutex_exit(&so->so_lock);
-
-			error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
-			    CRED(), rvalp);
-
-			mutex_enter(&so->so_lock);
-			if (error == 0)
-				so->so_state &= ~SS_DIRECT;
-			so_unlock_single(so, SOLOCKED);
-			mutex_exit(&so->so_lock);
-
-			if (error != 0)
-				return (error);
-		}
-
-		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
-		if (error == 0)
-			so->so_pushcnt++;
-		return (error);
-
-	case I_POP:
-		if (so->so_pushcnt == 0) {
-			/* Emulate sockmod being popped */
-			dprintso(so, 0,
-			    ("socktpi_ioctl: going to STREAMS version\n"));
-			return (so_sock2stream(so));
-		}
-
-		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
-		if (error == 0)
-			so->so_pushcnt--;
-		return (error);
-
-	case I_LIST: {
-		struct str_mlist *kmlistp, *umlistp;
-		struct str_list kstrlist;
-		ssize_t kstrlistsize;
-		int i, nmods;
-
-		STRUCT_DECL(str_list, ustrlist);
-		STRUCT_INIT(ustrlist, mode);
-
-		/*
-		 * arg is an intptr_t, so test it against integer zero
-		 * rather than the pointer constant NULL.
-		 */
-		if (arg == 0) {
-			error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
-			if (error == 0)
-				(*rvalp)++;	/* Add one for sockmod */
-			return (error);
-		}
-
-		error = so_copyin((void *)arg, STRUCT_BUF(ustrlist),
-		    STRUCT_SIZE(ustrlist), mode & FKIOCTL);
-		if (error != 0)
-			return (error);
-
-		nmods = STRUCT_FGET(ustrlist, sl_nmods);
-		if (nmods <= 0)
-			return (EINVAL);
-		/*
-		 * Ceiling nmods at nstrpush to prevent someone from
-		 * maliciously consuming lots of kernel memory.
-		 */
-		nmods = MIN(nmods, nstrpush);
-
-		/* One extra slot so that sockmod can be inserted below. */
-		kstrlistsize = (nmods + 1) * sizeof (struct str_mlist);
-		kstrlist.sl_nmods = nmods;
-		kstrlist.sl_modlist = kmem_zalloc(kstrlistsize, KM_SLEEP);
-
-		error = strioctl(vp, cmd, (intptr_t)&kstrlist, mode, K_TO_K,
-		    cr, rvalp);
-		if (error != 0)
-			goto done;
-
-		/*
-		 * Considering the module list as a 0-based array of sl_nmods
-		 * modules, sockmod should conceptually exist at slot
-		 * so_pushcnt. Insert sockmod at this location by sliding all
-		 * of the module names after so_pushcnt over by one. We know
-		 * that there will be room to do this since we allocated
-		 * sl_modlist with an additional slot.
-		 */
-		for (i = kstrlist.sl_nmods; i > so->so_pushcnt; i--)
-			kstrlist.sl_modlist[i] = kstrlist.sl_modlist[i - 1];
-
-		(void) strcpy(kstrlist.sl_modlist[i].l_name, sockmod_name);
-		kstrlist.sl_nmods++;
-
-		/*
-		 * Copy all of the entries out to ustrlist.
-		 */
-		kmlistp = kstrlist.sl_modlist;
-		umlistp = STRUCT_FGETP(ustrlist, sl_modlist);
-		for (i = 0; i < nmods && i < kstrlist.sl_nmods; i++) {
-			error = so_copyout(kmlistp++, umlistp++,
-			    sizeof (struct str_mlist), mode & FKIOCTL);
-			if (error != 0)
-				goto done;
-		}
-
-		/* i is now the number of entries actually copied out. */
-		error = so_copyout(&i, (void *)arg, sizeof (int32_t),
-		    mode & FKIOCTL);
-		if (error == 0)
-			*rvalp = 0;
-	done:
-		kmem_free(kstrlist.sl_modlist, kstrlistsize);
-		return (error);
-	}
-	case I_LOOK:
-		/* With nothing pushed, sockmod is the topmost module. */
-		if (so->so_pushcnt == 0) {
-			return (so_copyout(sockmod_name, (void *)arg,
-			    sizeof (sockmod_name), mode & FKIOCTL));
-		}
-		return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
-
-	case I_FIND:
-		error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
-		if (error && error != EINVAL)
-			return (error);
-
-		/*
-		 * if not found and string was sockmod return 1
-		 *
-		 * The error is tested first so that *rvalp is only
-		 * consulted when strioctl() succeeded.
-		 */
-		if (error == EINVAL || *rvalp == 0) {
-			error = ((mode & FKIOCTL) ? copystr : copyinstr)(
-			    (void *)arg, mname, sizeof (mname), NULL);
-			if (error == ENAMETOOLONG)
-				error = EINVAL;
-
-			if (error == 0 && strcmp(mname, sockmod_name) == 0)
-				*rvalp = 1;
-		}
-		return (error);
-
-	default:
-		panic("socktpi_plumbioctl: unknown ioctl %d", cmd);
-		break;
-	}
-
-	return (0);
-}
-
-/*
- * vop_setfl handler. Every flag combination is accepted.
- *
- * FNDELAY and FNONBLOCK are mirrored into so_state (SS_NDELAY and
- * SS_NONBLOCK) so that they can be inherited from listener to acceptor.
- * In addition, for BSD fcntl(F_SETFL) FASYNC compatibility, SS_ASYNC is
- * brought in line with the presence or absence of FASYNC in nflags,
- * unless the socket is in SOV_STREAM mode.
- *
- * Returns 0, or the error from so_flip_async().
- */
-/* ARGSUSED */
-int
-socktpi_setfl(vnode_t *vp, int oflags, int nflags, cred_t *cr,
-    caller_context_t *ct)
-{
-	struct sonode *so = VTOSO(vp);
-	int error = 0;
-
-	dprintso(so, 0, ("socktpi_setfl: oflags 0x%x, nflags 0x%x, state %s\n",
-	    oflags, nflags, pr_state(so->so_state, so->so_mode)));
-
-	mutex_enter(&so->so_lock);
-	if ((nflags & FNDELAY) == 0)
-		so->so_state &= ~SS_NDELAY;
-	else
-		so->so_state |= SS_NDELAY;
-	if ((nflags & FNONBLOCK) == 0)
-		so->so_state &= ~SS_NONBLOCK;
-	else
-		so->so_state |= SS_NONBLOCK;
-	mutex_exit(&so->so_lock);
-
-	/* The FASYNC handling works on the sonode behind the stream head. */
-	so = VTOSO(vp->v_stream->sd_vnode);
-	if (so->so_version == SOV_STREAM)
-		return (error);
-
-	mutex_enter(&so->so_lock);
-	/* Flip only when the requested and the current setting disagree. */
-	if (((nflags & FASYNC) != 0) != ((so->so_state & SS_ASYNC) != 0))
-		error = so_flip_async(so, SOTOV(so), 0, CRED());
-	mutex_exit(&so->so_lock);
-
-	return (error);
-}
-
-/*
- * Get the made up attributes for the vnode.
- * 4.3BSD returns the current time for all the timestamps.
- * 4.4BSD returns 0 for all the timestamps.
- * Here we report the access, modification and change times recorded in
- * the sonode.
- *
- * Just like in BSD there is no effect on the underlying file system node
- * bound to an AF_UNIX pathname.
- *
- * When sockmod has been popped this will act just like a stream. Since
- * a socket is always a clone there is no need to inspect the attributes
- * of the "realvp".
- *
- * Always returns 0.
- */
-/* ARGSUSED */
-int
-socktpi_getattr(struct vnode *vp, struct vattr *vap, int flags,
-    struct cred *cr, caller_context_t *ct)
-{
-	static int sonode_shift = 0;
-	struct sonode *so;
-
-	/*
-	 * Work out, once, how far a sonode pointer can be shifted down
-	 * while still keeping it unique. Used for va_nodeid below.
-	 */
-	if (sonode_shift == 0)
-		sonode_shift = highbit(sizeof (struct sonode));
-	ASSERT(sonode_shift > 0);
-
-	so = VTOSO(vp);
-
-	if (so->so_version != SOV_STREAM) {
-		vap->va_type = vp->v_type;
-		vap->va_mode = S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|
-		    S_IROTH|S_IWOTH;
-	} else {
-		/*
-		 * The imaginary "sockmod" has been popped - act
-		 * as a stream
-		 */
-		vap->va_type = VCHR;
-		vap->va_mode = 0;
-	}
-	vap->va_uid = vap->va_gid = 0;
-	vap->va_fsid = so->so_fsid;
-
-	/*
-	 * If the va_nodeid is > MAX_USHORT, then i386 stats might fail.
-	 * So we shift down the sonode pointer to try and get the most
-	 * uniqueness into 16-bits.
-	 */
-	vap->va_nodeid = ((ino_t)so >> sonode_shift) & 0xFFFF;
-	vap->va_nlink = 0;
-	vap->va_size = 0;
-
-	/*
-	 * We need to zero out the va_rdev to avoid some fstats getting
-	 * EOVERFLOW. This also mimics SunOS 4.x and BSD behavior.
-	 */
-	vap->va_rdev = (dev_t)0;
-	vap->va_blksize = MAXBSIZE;
-	vap->va_nblocks = btod(vap->va_size);
-
-	/* Only whole seconds are kept in the sonode. */
-	vap->va_atime.tv_nsec = 0;
-	vap->va_mtime.tv_nsec = 0;
-	vap->va_ctime.tv_nsec = 0;
-
-	mutex_enter(&so->so_lock);
-	vap->va_atime.tv_sec = so->so_atime;
-	vap->va_mtime.tv_sec = so->so_mtime;
-	vap->va_ctime.tv_sec = so->so_ctime;
-	mutex_exit(&so->so_lock);
-
-	vap->va_seq = 0;
-
-	return (0);
-}
-
-/*
- * Set attributes.
- * Just like in BSD there is no effect on the underlying file system node
- * bound to an AF_UNIX pathname.
- *
- * When sockmod has been popped this will act just like a stream. Since
- * a socket is always a clone there is no need to modify the attributes
- * of the "realvp".
- *
- * Only AT_ATIME and AT_MTIME in va_mask are acted upon; a modification
- * time change also stamps so_ctime with the current time. Always
- * returns 0.
- */
-/* ARGSUSED */
-int
-socktpi_setattr(struct vnode *vp, struct vattr *vap, int flags,
-    struct cred *cr, caller_context_t *ct)
-{
-	struct sonode *so = VTOSO(vp);
-
-	mutex_enter(&so->so_lock);
-	if (vap->va_mask & AT_MTIME) {
-		so->so_mtime = vap->va_mtime.tv_sec;
-		so->so_ctime = gethrestime_sec();
-	}
-	if (vap->va_mask & AT_ATIME)
-		so->so_atime = vap->va_atime.tv_sec;
-	mutex_exit(&so->so_lock);
-
-	return (0);
-}
-
-/*
- * vop_access handler. When the sonode has an so_accessvp the check is
- * delegated to VOP_ACCESS() on that vnode; otherwise all access is
- * allowed and 0 is returned.
- */
-int
-socktpi_access(struct vnode *vp, int mode, int flags, struct cred *cr,
-    caller_context_t *ct)
-{
-	struct vnode *accessvp = VTOSO(vp)->so_accessvp;
-
-	if (accessvp == NULL)
-		return (0);	/* Allow all access. */
-
-	return (VOP_ACCESS(accessvp, mode, flags, cr, ct));
-}
-
-/*
- * vop_fsync handler: unconditionally fails with EINVAL.
- *
- * 4.3BSD and 4.4BSD fail an fsync on a socket the same way. Doing so
- * keeps compatibility and avoids giving an application the impression
- * that the data has actually been "synced" to the other end of the
- * connection.
- */
-/* ARGSUSED */
-int
-socktpi_fsync(
-	struct vnode *vp,
-	int syncflag,
-	struct cred *cr,
-	caller_context_t *ct)
-{
-	return (EINVAL);
-}
-
-/*
- * vop_inactive handler. Gives up one v_count reference under v_lock
- * and, if that was the last one, frees the sonode with sockfree().
- * Panics if v_count is already below one on entry.
- */
-/* ARGSUSED */
-static void
-socktpi_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ct)
-{
-	struct sonode *so = VTOSO(vp);
-	int still_held;
-
-	mutex_enter(&vp->v_lock);
-	if (vp->v_count < 1)
-		cmn_err(CE_PANIC, "socktpi_inactive: Bad v_count");
-
-	/* Drop the temporary hold by vn_rele now */
-	vp->v_count--;
-	still_held = (vp->v_count != 0);
-	mutex_exit(&vp->v_lock);
-
-	/* Someone else still references the vnode: nothing to free. */
-	if (still_held)
-		return;
-
-	/* We are the sole owner of so now */
-	ASSERT(!vn_has_cached_data(vp));
-	sockfree(so);
-}
-
-/*
- * vop_fid handler: unconditionally fails with EINVAL.
- */
-/* ARGSUSED */
-int
-socktpi_fid(
-	struct vnode *vp,
-	struct fid *fidp,
-	caller_context_t *ct)
-{
-	return (EINVAL);
-}
-
-/*
- * vop_seek handler: sockets are not seekable, so every request fails
- * with ESPIPE.
- * (and there is a bug to fix STREAMS to make them fail this as well).
- */
-/*ARGSUSED*/
-int
-socktpi_seek(
-	struct vnode *vp,
-	offset_t ooff,
-	offset_t *noffp,
-	caller_context_t *ct)
-{
-	return (ESPIPE);
-}
-
-/*
- * Wrapper around the streams poll routine that implements socket poll
- * semantics.
- * Sockfs never calls pollwakeup() itself - the stream head takes care
- * of all pollwakeups. Since sockfs never holds so_lock when calling the
- * stream head there can never be a deadlock due to holding so_lock across
- * pollwakeup and acquiring so_lock in this routine.
- *
- * However, since the performance of VOP_POLL is critical we avoid
- * acquiring so_lock here. This is based on the following assumptions:
- * - The poll implementation holds locks to serialize the VOP_POLL call
- * and a pollwakeup for the same pollhead. This ensures that should
- * so_state etc change during a socktpi_poll() call, the pollwakeup()
- * (which strsock_* and strrput() conspire to issue) is issued after
- * the state change. Thus the pollwakeup will block until VOP_POLL has
- * returned, and then wake up poll and have it call VOP_POLL again.
- *
- * - The reading of so_state without holding so_lock does not result in
- * stale data (older than the latest state change that has dropped
- * so_lock). This is ensured as mutex_exit() issues the appropriate
- * memory barrier to force the data into the coherency domain.
- *
- * - Whilst so_state may change during the VOP_POLL call, (SS_HASCONNIND
- * may have been set by an arriving connection), the above two factors
- * guarantee validity of SS_ISCONNECTED/SM_CONNREQUIRED in the entry
- * time snapshot. In order to capture the arrival of a connection while
- * VOP_POLL was in progress, we then check real so_state, (so->so_state)
- * for SS_HASCONNIND and set appropriate events to ensure poll_common()
- * will not sleep.
- *
- * Summary of the processing below:
- * - If so_version is SOV_STREAM the call is handed straight to strpoll().
- * - POLLOUT and POLLWRBAND are removed from the request while a
- * SM_CONNREQUIRED socket is not SS_ISCONNECTED in the snapshot.
- * - If so_error is set and the caller asked for any of POLLIN,
- * POLLRDNORM or POLLOUT, those requested events are returned at once
- * without calling strpoll().
- * - Otherwise POLLRDDATA and POLLNOERR are added to the request,
- * strpoll() is called, and SS_HASCONNIND, SS_OOBPEND and a non-NULL
- * so_nl7c_rcv_mp add further read events to *reventsp.
- * - NL7C_POLLIN is set in so_nl7c_flags when NL7C is enabled and a
- * POLLIN/POLLRDNORM event is being returned.
- *
- * Returns 0 with *reventsp filled in, or the error returned by strpoll().
- * The ct argument is not used.
- */
-/*ARGSUSED5*/
-static int
-socktpi_poll(
- struct vnode *vp,
- short events,
- int anyyet,
- short *reventsp,
- struct pollhead **phpp,
- caller_context_t *ct)
-{
- short origevents = events;
- struct sonode *so = VTOSO(vp);
- int error;
- int so_state = so->so_state; /* snapshot */
-
- dprintso(so, 0, ("socktpi_poll(%p): state %s err %d\n",
- (void *)vp, pr_state(so_state, so->so_mode), so->so_error));
-
- ASSERT(vp->v_type == VSOCK);
- ASSERT(vp->v_stream != NULL);
-
- if (so->so_version == SOV_STREAM) {
- /* The imaginary "sockmod" has been popped - act as a stream */
- return (strpoll(vp->v_stream, events, anyyet,
- reventsp, phpp));
- }
-
- if (!(so_state & SS_ISCONNECTED) &&
- (so->so_mode & SM_CONNREQUIRED)) {
- /* Not connected yet - turn off write side events */
- events &= ~(POLLOUT|POLLWRBAND);
- }
- /*
- * Check for errors without calling strpoll if the caller wants them.
- * In sockets the errors are represented as input/output events
- * and there is no need to ask the stream head for this information.
- */
- if (so->so_error != 0 &&
- ((POLLIN|POLLRDNORM|POLLOUT) & origevents) != 0) {
- *reventsp = (POLLIN|POLLRDNORM|POLLOUT) & origevents;
- return (0);
- }
- /*
- * Ignore M_PROTO only messages such as the T_EXDATA_IND messages.
- * These messages with only an M_PROTO/M_PCPROTO part and no M_DATA
- * will not trigger a POLLIN event with POLLRDDATA set.
- * The handling of urgent data (causing POLLRDBAND) is done by
- * inspecting SS_OOBPEND below.
- */
- events |= POLLRDDATA;
-
- /*
- * After shutdown(output) a stream head write error is set.
- * However, we should not return output events.
- */
- events |= POLLNOERR;
- error = strpoll(vp->v_stream, events, anyyet,
- reventsp, phpp);
- if (error)
- return (error);
-
- ASSERT(!(*reventsp & POLLERR));
-
- /*
- * Notes on T_CONN_IND handling for sockets.
- *
- * If strpoll() returned without events, SR_POLLIN is guaranteed
- * to be set, ensuring any subsequent strrput() runs pollwakeup().
- *
- * Since the so_lock is not held, soqueueconnind() may have run
- * and a T_CONN_IND may be waiting. We now check for SS_HASCONNIND
- * in the current so_state and set appropriate events to ensure poll
- * returns.
- *
- * However:
- * If the T_CONN_IND hasn't arrived by the time strpoll() returns,
- * when strrput() does run for an arriving M_PROTO with T_CONN_IND
- * the following actions will occur; taken together they ensure the
- * syscall will return.
- *
- * 1. If a socket, soqueueconnind() will set SS_HASCONNIND but if
- * the accept() was run on a non-blocking socket sowaitconnind()
- * may have already returned EWOULDBLOCK, so not be waiting to
- * process the message. Additionally socktpi_poll() has probably
- * proceeded past the SS_HASCONNIND check below.
- * 2. strrput() runs pollwakeup()->pollnotify()->cv_signal() to wake
- * this thread, however that could occur before poll_common()
- * has entered cv_wait.
- * 3. pollnotify() sets T_POLLWAKE, while holding the pc_lock.
- *
- * Before proceeding to cv_wait() in poll_common() for an event,
- * poll_common() atomically checks for T_POLLWAKE under the pc_lock,
- * and if set, re-calls strpoll() to ensure the late arriving
- * T_CONN_IND is recognized, and pollsys() returns.
- */
- if (so->so_state & (SS_HASCONNIND|SS_OOBPEND)) {
- if (so->so_state & SS_HASCONNIND)
- *reventsp |= (POLLIN|POLLRDNORM) & events;
- if (so->so_state & SS_OOBPEND)
- *reventsp |= POLLRDBAND & events;
- }
-
- if (so->so_nl7c_rcv_mp != NULL) {
- *reventsp |= (POLLIN|POLLRDNORM) & events;
- }
- if ((so->so_nl7c_flags & NL7C_ENABLED) &&
- ((POLLIN|POLLRDNORM) & *reventsp)) {
- so->so_nl7c_flags |= NL7C_POLLIN;
- }
-
- return (0);
-}
-
-/*
- * Wrapper for getmsg. If the socket has been converted to a stream
- * pass the request to the stream head.
- *
- * Returns ENOSTR when the vnode has no stream, or when the socket has not
- * been converted (so_version is not SOV_STREAM); otherwise returns whatever
- * strgetmsg() returns for the real socket vnode.
- */
-int
-sock_getmsg(
- struct vnode *vp,
- struct strbuf *mctl,
- struct strbuf *mdata,
- uchar_t *prip,
- int *flagsp,
- int fmode,
- rval_t *rvp
-)
-{
- struct sonode *so;
-
- ASSERT(vp->v_type == VSOCK);
- /*
- * Use the stream head to find the real socket vnode.
- * This is needed when namefs sits above sockfs. Some
- * sockets (like SCTP) are not streams.
- */
- if (!vp->v_stream) {
- return (ENOSTR);
- }
- ASSERT(vp->v_stream->sd_vnode);
- vp = vp->v_stream->sd_vnode;
- ASSERT(vn_matchops(vp, socktpi_vnodeops));
- so = VTOSO(vp);
-
- dprintso(so, 1, ("sock_getmsg(%p) %s\n",
- (void *)so, pr_state(so->so_state, so->so_mode)));
-
- if (so->so_version == SOV_STREAM) {
- /* The imaginary "sockmod" has been popped - act as a stream */
- return (strgetmsg(vp, mctl, mdata, prip, flagsp, fmode, rvp));
- }
- eprintsoline(so, ENOSTR);
- return (ENOSTR);
-}
-
-/*
- * Wrapper for putmsg. If the socket has been converted to a stream
- * pass the request to the stream head.
- *
- * Note that while a regular socket (SOV_SOCKSTREAM) does support the
- * streams ioctl set it does not support putmsg and getmsg.
- * Allowing putmsg would prevent sockfs from tracking the state of
- * the socket/transport and would also invalidate the locking in sockfs.
- *
- * Returns ENOSTR when the vnode has no stream, or when the socket has not
- * been converted (so_version is not SOV_STREAM); otherwise returns whatever
- * strputmsg() returns for the real socket vnode.
- */
-int
-sock_putmsg(
- struct vnode *vp,
- struct strbuf *mctl,
- struct strbuf *mdata,
- uchar_t pri,
- int flag,
- int fmode
-)
-{
- struct sonode *so;
-
- ASSERT(vp->v_type == VSOCK);
- /*
- * Use the stream head to find the real socket vnode.
- * This is needed when namefs sits above sockfs.
- */
- if (!vp->v_stream) {
- return (ENOSTR);
- }
- ASSERT(vp->v_stream->sd_vnode);
- vp = vp->v_stream->sd_vnode;
- ASSERT(vn_matchops(vp, socktpi_vnodeops));
- so = VTOSO(vp);
-
- dprintso(so, 1, ("sock_putmsg(%p) %s\n",
- (void *)so, pr_state(so->so_state, so->so_mode)));
-
- if (so->so_version == SOV_STREAM) {
- /* The imaginary "sockmod" has been popped - act as a stream */
- return (strputmsg(vp, mctl, mdata, pri, flag, fmode));
- }
- eprintsoline(so, ENOSTR);
- return (ENOSTR);
-}
-
-/*
- * Special function called only from f_getfl().
- * Returns FASYNC if the SS_ASYNC flag is set on a socket, else 0.
- * No locks are acquired here, so it is safe to use while uf_lock is held.
- * This exists solely for BSD fcntl() FASYNC compatibility.
- *
- * A socket whose so_version is SOV_STREAM always yields 0, regardless of
- * SS_ASYNC.
- *
- * NOTE(review): vp->v_stream is dereferenced without a NULL check, whereas
- * sock_getmsg()/sock_putmsg() return ENOSTR when v_stream is NULL ("some
- * sockets (like SCTP) are not streams") - confirm such vnodes cannot reach
- * this function.
- */
-int
-sock_getfasync(vnode_t *vp)
-{
- struct sonode *so;
-
- ASSERT(vp->v_type == VSOCK);
- so = VTOSO(vp->v_stream->sd_vnode);
- if (so->so_version == SOV_STREAM || !(so->so_state & SS_ASYNC))
- return (0);
- return (FASYNC);
-}
diff --git a/usr/src/uts/common/inet/inetddi.c b/usr/src/uts/common/inet/inetddi.c
index 48a9e3aa2e..6b0cd5839a 100644
--- a/usr/src/uts/common/inet/inetddi.c
+++ b/usr/src/uts/common/inet/inetddi.c
@@ -23,7 +23,6 @@
* Use is subject to license terms.
*/
-
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/stream.h>
@@ -57,12 +56,23 @@
*
* Drivers that need to masquerade as IP should set INET_DEVMTFLAGS to
* IP_DEVMTFLAGS and set INET_DEVSTRTAB to ipinfo.
+ *
+ * The symbols that all socket modules must define are:
+ *
+ * INET_SOCKDESC The one-line description for this socket module
+ * INET_SOCK_PROTO_CREATE_FUNC The function used to create PCBs
+ *
+ * In addition, socket modules that can be converted to TPI must define:
+ *
+ * INET_SOCK_PROTO_FB_FUNC The function used to fallback to TPI
*/
#if !defined(INET_NAME)
#error inetddi.c: INET_NAME is not defined!
-#elif !defined(INET_DEVDESC) && !defined(INET_MODDESC)
-#error inetddi.c: at least one of INET_DEVDESC or INET_MODDESC must be defined!
+#elif !defined(INET_DEVDESC) && !defined(INET_MODDESC) && \
+ !defined(INET_SOCKDESC)
+#error inetddi.c: at least one of INET_DEVDESC or INET_MODDESC or \
+INET_SOCKDESC must be defined!
#elif defined(INET_DEVDESC) && !defined(INET_DEVSTRTAB)
#error inetddi.c: INET_DEVDESC is defined but INET_DEVSTRTAB is not!
#elif defined(INET_DEVDESC) && !defined(INET_DEVMTFLAGS)
@@ -73,6 +83,11 @@
#error inetddi.c: INET_MODDESC is defined but INET_MODSTRTAB is not!
#elif defined(INET_MODDESC) && !defined(INET_MODMTFLAGS)
#error inetddi.c: INET_MODDESC is defined but INET_MODMTFLAGS is not!
+#elif defined(INET_SOCKDESC) && !defined(SOCKMOD_VERSION)
+#error inetddi.c: INET_SOCKDESC is defined but SOCKMOD_VERSION is not!
+#elif defined(INET_SOCKDESC) && !defined(INET_SOCK_PROTO_CREATE_FUNC)
+#error inetddi.c: INET_SOCKDESC is defined but INET_SOCK_PROTO_CREATE_FUNC \
+is not!
#endif
#ifdef INET_DEVDESC
@@ -192,8 +207,39 @@ static struct modlstrmod modlstrmod = {
INET_MODDESC,
&fsw
};
+
#endif /* INET_MODDESC */
+/*
+ * Socket module linkage.  Only built when the including module defines
+ * INET_SOCKDESC.
+ */
+#ifdef INET_SOCKDESC
+
+#ifdef INET_SOCK_PROTO_FB_FUNC
+/*
+ * Private registration data, present only when the module supplies
+ * INET_SOCK_PROTO_FB_FUNC (the function used to fall back to TPI).
+ * The first two members are left NULL here.
+ */
+static __smod_priv_t smodpriv = {
+ NULL,
+ NULL,
+ INET_SOCK_PROTO_FB_FUNC
+};
+#endif /* INET_SOCK_PROTO_FB_FUNC */
+
+/*
+ * Socket module registration record.  The last member points at smodpriv
+ * when a fallback function is defined and is NULL otherwise.
+ */
+static struct smod_reg_s smodreg = {
+ SOCKMOD_VERSION,
+ INET_NAME,
+ SOCK_UC_VERSION,
+ SOCK_DC_VERSION,
+ INET_SOCK_PROTO_CREATE_FUNC,
+#ifdef INET_SOCK_PROTO_FB_FUNC
+ &smodpriv
+#else
+ NULL
+#endif /* INET_SOCK_PROTO_FB_FUNC */
+};
+
+/*
+ * Linkage element that ties the description string and smodreg to
+ * mod_sockmodops; it is listed in modlinkage when INET_SOCKDESC is defined.
+ */
+static struct modlsockmod modlsockmod = {
+ &mod_sockmodops,
+ INET_SOCKDESC,
+ &smodreg
+};
+#endif /* INET_SOCKDESC */
+
static struct modlinkage modlinkage = {
MODREV_1,
#ifdef INET_DEVDESC
@@ -202,5 +248,8 @@ static struct modlinkage modlinkage = {
#ifdef INET_MODDESC
&modlstrmod,
#endif
+#ifdef INET_SOCKDESC
+ &modlsockmod,
+#endif
NULL
};
diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h
index c7ccff8a14..323c8fd0de 100644
--- a/usr/src/uts/common/inet/ip.h
+++ b/usr/src/uts/common/inet/ip.h
@@ -2771,7 +2771,7 @@ typedef struct ip_pktinfo {
#define ILL_LOOKUP_FAILED 1 /* Used as error code */
#define IPIF_LOOKUP_FAILED 2 /* Used as error code */
-#define ILL_CAN_LOOKUP(ill) \
+#define ILL_CAN_LOOKUP(ill) \
(!((ill)->ill_state_flags & (ILL_CONDEMNED | ILL_CHANGING)) || \
IAM_WRITER_ILL(ill))
@@ -2781,7 +2781,7 @@ typedef struct ip_pktinfo {
#define ILL_CAN_LOOKUP_WALKER(ill) \
(!((ill)->ill_state_flags & ILL_CONDEMNED))
-#define IPIF_CAN_LOOKUP(ipif) \
+#define IPIF_CAN_LOOKUP(ipif) \
(!((ipif)->ipif_state_flags & (IPIF_CONDEMNED | IPIF_CHANGING)) || \
IAM_WRITER_IPIF(ipif))
@@ -3166,11 +3166,15 @@ extern void icmp_unreachable(queue_t *, mblk_t *, uint8_t, zoneid_t,
ip_stack_t *);
extern mblk_t *ip_add_info(mblk_t *, ill_t *, uint_t, zoneid_t, ip_stack_t *);
extern mblk_t *ip_bind_v4(queue_t *, mblk_t *, conn_t *);
-extern int ip_bind_connected(conn_t *, mblk_t *, ipaddr_t *, uint16_t,
- ipaddr_t, uint16_t, boolean_t, boolean_t, boolean_t, boolean_t);
-extern boolean_t ip_bind_ipsec_policy_set(conn_t *, mblk_t *);
-extern int ip_bind_laddr(conn_t *, mblk_t *, ipaddr_t, uint16_t,
- boolean_t, boolean_t, boolean_t);
+extern boolean_t ip_bind_ipsec_policy_set(conn_t *, mblk_t *);
+extern int ip_bind_laddr_v4(conn_t *, mblk_t **, uint8_t, ipaddr_t,
+ uint16_t, boolean_t);
+extern int ip_proto_bind_laddr_v4(conn_t *, mblk_t **, uint8_t, ipaddr_t,
+ uint16_t, boolean_t);
+extern int ip_proto_bind_connected_v4(conn_t *, mblk_t **,
+ uint8_t, ipaddr_t *, uint16_t, ipaddr_t, uint16_t, boolean_t, boolean_t);
+extern int ip_bind_connected_v4(conn_t *, mblk_t **, uint8_t, ipaddr_t *,
+ uint16_t, ipaddr_t, uint16_t, boolean_t, boolean_t);
extern uint_t ip_cksum(mblk_t *, int, uint32_t);
extern int ip_close(queue_t *, int);
extern uint16_t ip_csum_hdr(ipha_t *);
@@ -3308,7 +3312,7 @@ extern boolean_t ip_md_hcksum_attr(struct multidata_s *, struct pdesc_s *,
uint32_t, uint32_t, uint32_t, uint32_t);
extern boolean_t ip_md_zcopy_attr(struct multidata_s *, struct pdesc_s *,
uint_t);
-extern mblk_t *ip_unbind(queue_t *, mblk_t *);
+extern void ip_unbind(conn_t *connp);
extern phyint_t *phyint_lookup_group(char *, boolean_t, ip_stack_t *);
extern phyint_t *phyint_lookup_group_ifindex(uint_t, ip_stack_t *);
@@ -3577,7 +3581,6 @@ extern void ip_squeue_quiesce_ring(ill_t *, ill_rx_ring_t *);
extern void ip_squeue_restart_ring(ill_t *, ill_rx_ring_t *);
extern void ip_squeue_clean_all(ill_t *);
-extern void ip_resume_tcp_bind(void *, mblk_t *, void *);
extern void tcp_wput(queue_t *, mblk_t *);
extern int ip_fill_mtuinfo(struct in6_addr *, in_port_t,
@@ -3635,6 +3638,8 @@ typedef void (*ipsq_func_t)(ipsq_t *, queue_t *, mblk_t *, void *);
#define SQTAG_IP_INPUT_RX_RING 39
#define SQTAG_SQUEUE_CHANGE 40
#define SQTAG_CONNECT_FINISH 41
+#define SQTAG_SYNCHRONOUS_OP 42
+#define SQTAG_TCP_SHUTDOWN_OUTPUT 43
#define NOT_OVER_IP(ip_wq) \
(ip_wq->q_next != NULL || \
@@ -3643,6 +3648,7 @@ typedef void (*ipsq_func_t)(ipsq_t *, queue_t *, mblk_t *, void *);
IP_MOD_NAME) != 0 || \
ip_wq->q_qinfo->qi_minfo->mi_idnum != IP_MOD_ID)
+/* Evaluates to the conn_flow_cntrld field of connp; argument is parenthesized so any pointer expression is safe. */
+#define PROTO_FLOW_CNTRLD(connp) ((connp)->conn_flow_cntrld)
#endif /* _KERNEL */
#ifdef __cplusplus
diff --git a/usr/src/uts/common/inet/ip/icmp.c b/usr/src/uts/common/inet/ip/icmp.c
index 90cc6a51d5..c728a687d4 100644
--- a/usr/src/uts/common/inet/ip/icmp.c
+++ b/usr/src/uts/common/inet/ip/icmp.c
@@ -43,7 +43,9 @@
#include <sys/zone.h>
#include <sys/time.h>
+#include <sys/sockio.h>
#include <sys/socket.h>
+#include <sys/socketvar.h>
#include <sys/isa_defs.h>
#include <sys/suntpi.h>
#include <sys/xti_inet.h>
@@ -58,7 +60,7 @@
#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip6.h>
-#include <inet/mi.h>
+#include <inet/proto_set.h>
#include <inet/nd.h>
#include <inet/optcom.h>
#include <inet/snmpcom.h>
@@ -78,6 +80,7 @@
#include <inet/ip_if.h>
#include <inet/ip_impl.h>
+#include <sys/disp.h>
/*
* Synchronization notes:
@@ -99,41 +102,34 @@
*/
static void icmp_addr_req(queue_t *q, mblk_t *mp);
-static void icmp_bind(queue_t *q, mblk_t *mp);
-static void icmp_bind_proto(queue_t *q);
-static void icmp_bind_result(conn_t *, mblk_t *);
-static void icmp_bind_ack(conn_t *, mblk_t *mp);
-static void icmp_bind_error(conn_t *, mblk_t *mp);
+static void icmp_tpi_bind(queue_t *q, mblk_t *mp);
+static int icmp_bind_proto(conn_t *connp);
static int icmp_build_hdrs(icmp_t *icmp);
static void icmp_capability_req(queue_t *q, mblk_t *mp);
-static int icmp_close(queue_t *q);
-static void icmp_connect(queue_t *q, mblk_t *mp);
-static void icmp_disconnect(queue_t *q, mblk_t *mp);
+static int icmp_close(queue_t *q, int flags);
+static void icmp_tpi_connect(queue_t *q, mblk_t *mp);
+static void icmp_tpi_disconnect(queue_t *q, mblk_t *mp);
static void icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
int sys_error);
static void icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
t_scalar_t t_error, int sys_error);
-static void icmp_icmp_error(queue_t *q, mblk_t *mp);
-static void icmp_icmp_error_ipv6(queue_t *q, mblk_t *mp);
+static void icmp_icmp_error(conn_t *connp, mblk_t *mp);
+static void icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp);
static void icmp_info_req(queue_t *q, mblk_t *mp);
static void icmp_input(void *, mblk_t *, void *);
-static mblk_t *icmp_ip_bind_mp(icmp_t *icmp, t_scalar_t bind_prim,
- t_scalar_t addr_length, in_port_t);
-static int icmp_open(queue_t *q, dev_t *devp, int flag, int sflag,
- cred_t *credp, boolean_t isv6);
+static conn_t *icmp_open(int family, cred_t *credp, int *err, int flags);
static int icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag,
cred_t *credp);
static int icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag,
cred_t *credp);
-static void icmp_output(queue_t *q, mblk_t *mp);
static int icmp_unitdata_opt_process(queue_t *q, mblk_t *mp,
int *errorp, void *thisdg_attrs);
static boolean_t icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name);
-int icmp_opt_set(queue_t *q, uint_t optset_context,
+int icmp_opt_set(conn_t *connp, uint_t optset_context,
int level, int name, uint_t inlen,
uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
- void *thisdg_attrs, cred_t *cr, mblk_t *mblk);
-int icmp_opt_get(queue_t *q, int level, int name,
+ void *thisdg_attrs, cred_t *cr);
+int icmp_opt_get(conn_t *connp, int level, int name,
uchar_t *ptr);
static int icmp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
static boolean_t icmp_param_register(IDP *ndp, icmpparam_t *icmppa, int cnt);
@@ -144,10 +140,13 @@ static int icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
static int icmp_status_report(queue_t *q, mblk_t *mp, caddr_t cp,
cred_t *cr);
static void icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err);
-static void icmp_unbind(queue_t *q, mblk_t *mp);
+static void icmp_tpi_unbind(queue_t *q, mblk_t *mp);
static void icmp_wput(queue_t *q, mblk_t *mp);
-static void icmp_wput_ipv6(queue_t *q, mblk_t *mp, sin6_t *sin6,
- t_scalar_t tudr_optlen);
+static void icmp_wput_fallback(queue_t *q, mblk_t *mp);
+static int raw_ip_send_data_v6(queue_t *q, conn_t *connp, mblk_t *mp,
+ sin6_t *sin6, ip6_pkt_t *ipp);
+static int raw_ip_send_data_v4(queue_t *q, conn_t *connp, mblk_t *mp,
+ ipaddr_t v4dst, ip4_pkt_t *pktinfop);
static void icmp_wput_other(queue_t *q, mblk_t *mp);
static void icmp_wput_iocdata(queue_t *q, mblk_t *mp);
static void icmp_wput_restricted(queue_t *q, mblk_t *mp);
@@ -158,7 +157,16 @@ static void rawip_stack_fini(netstackid_t stackid, void *arg);
static void *rawip_kstat_init(netstackid_t stackid);
static void rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp);
static int rawip_kstat_update(kstat_t *kp, int rw);
+static void rawip_stack_shutdown(netstackid_t stackid, void *arg);
+static int rawip_do_getsockname(icmp_t *icmp, struct sockaddr *sa,
+ uint_t *salenp);
+static int rawip_do_getpeername(icmp_t *icmp, struct sockaddr *sa,
+ uint_t *salenp);
+int rawip_getsockname(sock_lower_handle_t, struct sockaddr *,
+ socklen_t *, cred_t *);
+int rawip_getpeername(sock_lower_handle_t, struct sockaddr *,
+ socklen_t *, cred_t *);
static struct module_info icmp_mod_info = {
5707, "icmp", 1, INFPSZ, 512, 128
@@ -177,7 +185,12 @@ static struct qinit icmprinitv6 = {
};
static struct qinit icmpwinit = {
- (pfi_t)icmp_wput, (pfi_t)ip_wsrv, NULL, NULL, NULL, &icmp_mod_info
+ (pfi_t)icmp_wput, NULL, NULL, NULL, NULL, &icmp_mod_info
+};
+
+/* ICMP entry point during fallback */
+static struct qinit icmp_fallback_sock_winit = {
+ (pfi_t)icmp_wput_fallback, NULL, NULL, NULL, NULL, &icmp_mod_info
};
/* For AF_INET aka /dev/icmp */
@@ -233,6 +246,11 @@ static icmpparam_t icmp_param_arr[] = {
#define is_recv_hiwat is_param_arr[6].icmp_param_value
#define is_max_buf is_param_arr[7].icmp_param_value
+static int rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len);
+static int rawip_do_connect(conn_t *connp, const struct sockaddr *sa,
+ socklen_t len);
+static void rawip_post_ip_bind_connect(icmp_t *icmp, mblk_t *ire_mp, int error);
+
/*
* This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message
* passed to icmp_wput.
@@ -241,14 +259,17 @@ static icmpparam_t icmp_param_arr[] = {
* message is returned by ip_bind_v4/v6.
*/
static void
-icmp_bind(queue_t *q, mblk_t *mp)
+icmp_tpi_bind(queue_t *q, mblk_t *mp)
{
+ int error;
+ struct sockaddr *sa;
+ struct T_bind_req *tbr;
+ socklen_t len;
sin_t *sin;
sin6_t *sin6;
- mblk_t *mp1;
- struct T_bind_req *tbr;
- icmp_t *icmp;
+ icmp_t *icmp;
conn_t *connp = Q_TO_CONN(q);
+ mblk_t *mp1;
icmp = connp->conn_icmp;
if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
@@ -258,12 +279,14 @@ icmp_bind(queue_t *q, mblk_t *mp)
icmp_err_ack(q, mp, TPROTO, 0);
return;
}
+
if (icmp->icmp_state != TS_UNBND) {
(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
"icmp_bind: bad state, %d", icmp->icmp_state);
icmp_err_ack(q, mp, TOUTSTATE, 0);
return;
}
+
/*
* Reallocate the message to make sure we have enough room for an
* address and the protocol type.
@@ -274,9 +297,13 @@ icmp_bind(queue_t *q, mblk_t *mp)
return;
}
mp = mp1;
+
+ /* Reset the message type in preparation for shipping it back. */
+ DB_TYPE(mp) = M_PCPROTO;
tbr = (struct T_bind_req *)mp->b_rptr;
- switch (tbr->ADDR_length) {
- case 0: /* Generic request */
+ len = tbr->ADDR_length;
+ switch (len) {
+ case 0: /* request for a generic port */
tbr->ADDR_offset = sizeof (struct T_bind_req);
if (icmp->icmp_family == AF_INET) {
tbr->ADDR_length = sizeof (sin_t);
@@ -284,6 +311,8 @@ icmp_bind(queue_t *q, mblk_t *mp)
*sin = sin_null;
sin->sin_family = AF_INET;
mp->b_wptr = (uchar_t *)&sin[1];
+ sa = (struct sockaddr *)sin;
+ len = sizeof (sin_t);
} else {
ASSERT(icmp->icmp_family == AF_INET6);
tbr->ADDR_length = sizeof (sin6_t);
@@ -291,39 +320,21 @@ icmp_bind(queue_t *q, mblk_t *mp)
*sin6 = sin6_null;
sin6->sin6_family = AF_INET6;
mp->b_wptr = (uchar_t *)&sin6[1];
+ sa = (struct sockaddr *)sin6;
+ len = sizeof (sin6_t);
}
break;
- case sizeof (sin_t): /* Complete IP address */
- sin = (sin_t *)mi_offset_param(mp, tbr->ADDR_offset,
+
+ case sizeof (sin_t): /* Complete IPv4 address */
+ sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
sizeof (sin_t));
- if (sin == NULL || !OK_32PTR((char *)sin)) {
- icmp_err_ack(q, mp, TSYSERR, EINVAL);
- return;
- }
- if (icmp->icmp_family != AF_INET ||
- sin->sin_family != AF_INET) {
- icmp_err_ack(q, mp, TSYSERR, EAFNOSUPPORT);
- return;
- }
break;
- case sizeof (sin6_t): /* Complete IP address */
- sin6 = (sin6_t *)mi_offset_param(mp, tbr->ADDR_offset,
- sizeof (sin6_t));
- if (sin6 == NULL || !OK_32PTR((char *)sin6)) {
- icmp_err_ack(q, mp, TSYSERR, EINVAL);
- return;
- }
- if (icmp->icmp_family != AF_INET6 ||
- sin6->sin6_family != AF_INET6) {
- icmp_err_ack(q, mp, TSYSERR, EAFNOSUPPORT);
- return;
- }
- /* No support for mapped addresses on raw sockets */
- if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
- icmp_err_ack(q, mp, TSYSERR, EADDRNOTAVAIL);
- return;
- }
+
+ case sizeof (sin6_t): /* Complete IPv6 address */
+ sa = (struct sockaddr *)mi_offset_param(mp,
+ tbr->ADDR_offset, sizeof (sin6_t));
break;
+
default:
(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
"icmp_bind: bad ADDR_length %d", tbr->ADDR_length);
@@ -331,6 +342,37 @@ icmp_bind(queue_t *q, mblk_t *mp)
return;
}
+ error = rawip_do_bind(connp, sa, len);
+done:
+ ASSERT(mp->b_cont == NULL);
+ if (error != 0) {
+ if (error > 0) {
+ icmp_err_ack(q, mp, TSYSERR, error);
+ } else {
+ icmp_err_ack(q, mp, -error, 0);
+ }
+ } else {
+ tbr->PRIM_type = T_BIND_ACK;
+ qreply(q, mp);
+ }
+}
+
+static int
+rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len)
+{
+ sin_t *sin;
+ sin6_t *sin6;
+ icmp_t *icmp;
+ int error = 0;
+ mblk_t *ire_mp;
+
+
+ icmp = connp->conn_icmp;
+
+ if (sa == NULL || !OK_32PTR((char *)sa)) {
+ return (EINVAL);
+ }
+
/*
* The state must be TS_UNBND. TPI mandates that users must send
* TPI primitives only 1 at a time and wait for the response before
@@ -338,24 +380,53 @@ icmp_bind(queue_t *q, mblk_t *mp)
*/
rw_enter(&icmp->icmp_rwlock, RW_WRITER);
if (icmp->icmp_state != TS_UNBND || icmp->icmp_pending_op != -1) {
- rw_exit(&icmp->icmp_rwlock);
- (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
- "icmp_bind: bad state, %d", icmp->icmp_state);
- icmp_err_ack(q, mp, TOUTSTATE, 0);
- return;
+ error = -TOUTSTATE;
+ goto done;
+ }
+
+ ASSERT(len != 0);
+ switch (len) {
+ case sizeof (sin_t): /* Complete IPv4 address */
+ sin = (sin_t *)sa;
+ if (sin->sin_family != AF_INET ||
+ icmp->icmp_family != AF_INET) {
+ /* TSYSERR, EAFNOSUPPORT */
+ error = EAFNOSUPPORT;
+ goto done;
+ }
+ break;
+ case sizeof (sin6_t): /* Complete IPv6 address */
+ sin6 = (sin6_t *)sa;
+ if (sin6->sin6_family != AF_INET6 ||
+ icmp->icmp_family != AF_INET6) {
+ /* TSYSERR, EAFNOSUPPORT */
+ error = EAFNOSUPPORT;
+ goto done;
+ }
+ /* No support for mapped addresses on raw sockets */
+ if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
+ /* TSYSERR, EADDRNOTAVAIL */
+ error = EADDRNOTAVAIL;
+ goto done;
+ }
+ break;
+
+ default:
+ /* TBADADDR */
+ error = EADDRNOTAVAIL;
+ goto done;
}
- icmp->icmp_pending_op = tbr->PRIM_type;
+ icmp->icmp_pending_op = T_BIND_REQ;
+ icmp->icmp_state = TS_IDLE;
/*
* Copy the source address into our icmp structure. This address
* may still be zero; if so, ip will fill in the correct address
* each time an outbound packet is passed to it.
* If we are binding to a broadcast or multicast address then
- * icmp_bind_ack will clear the source address when it receives
- * the T_BIND_ACK.
+ * rawip_post_ip_bind_connect will clear the source address.
*/
- icmp->icmp_state = TS_IDLE;
if (icmp->icmp_family == AF_INET) {
ASSERT(sin != NULL);
@@ -378,147 +449,136 @@ icmp_bind(queue_t *q, mblk_t *mp)
error = icmp_build_hdrs(icmp);
if (error != 0) {
icmp->icmp_pending_op = -1;
- rw_exit(&icmp->icmp_rwlock);
- icmp_err_ack(q, mp, TSYSERR, error);
- return;
+ /*
+ * TSYSERR
+ */
+ goto done;
}
}
- /*
- * Place protocol type in the O_T_BIND_REQ/T_BIND_REQ following
- * the address.
- */
- *mp->b_wptr++ = icmp->icmp_proto;
+
+ ire_mp = NULL;
if (!(V6_OR_V4_INADDR_ANY(icmp->icmp_v6src))) {
/*
- * Append a request for an IRE if src not 0 (INADDR_ANY)
+ * request an IRE if src not 0 (INADDR_ANY)
*/
- mp->b_cont = allocb(sizeof (ire_t), BPRI_HI);
- if (!mp->b_cont) {
+ ire_mp = allocb(sizeof (ire_t), BPRI_HI);
+ if (ire_mp == NULL) {
icmp->icmp_pending_op = -1;
- rw_exit(&icmp->icmp_rwlock);
- icmp_err_ack(q, mp, TSYSERR, ENOMEM);
- return;
+ error = ENOMEM;
+ goto done;
}
- mp->b_cont->b_wptr += sizeof (ire_t);
- mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE;
+ DB_TYPE(ire_mp) = IRE_DB_REQ_TYPE;
}
+done:
rw_exit(&icmp->icmp_rwlock);
+ if (error != 0)
+ return (error);
- /* Pass the O_T_BIND_REQ/T_BIND_REQ to ip. */
- if (icmp->icmp_family == AF_INET6)
- mp = ip_bind_v6(q, mp, connp, NULL);
- else
- mp = ip_bind_v4(q, mp, connp);
-
- /* The above return NULL if the bind needs to be deferred */
- if (mp != NULL)
- icmp_bind_result(connp, mp);
- else
- CONN_INC_REF(connp);
+ if (icmp->icmp_family == AF_INET6) {
+ error = ip_proto_bind_laddr_v6(connp, &ire_mp, icmp->icmp_proto,
+ &sin6->sin6_addr, sin6->sin6_port, B_TRUE);
+ } else {
+ error = ip_proto_bind_laddr_v4(connp, &ire_mp, icmp->icmp_proto,
+ sin->sin_addr.s_addr, sin->sin_port, B_TRUE);
+ }
+ rawip_post_ip_bind_connect(icmp, ire_mp, error);
+ return (error);
}
-/*
- * Send message to IP to just bind to the protocol.
- */
+/*
+ * Common completion step run after an IP bind/connect attempt.
+ * Takes icmp_rwlock as writer and clears icmp_pending_op.  When error is
+ * non-zero the icmp state is rolled back: to TS_IDLE with the bound source
+ * restored if the state was TS_DATA_XFER, otherwise to TS_UNBND with both
+ * source addresses zeroed.  On success, the source address is zeroed when
+ * ire_mp carries an IRE of type IRE_BROADCAST and the state is not
+ * TS_DATA_XFER.  A non-NULL ire_mp is freed before the normal return.
+ */
static void
-icmp_bind_proto(queue_t *q)
+rawip_post_ip_bind_connect(icmp_t *icmp, mblk_t *ire_mp, int error)
{
- mblk_t *mp;
- struct T_bind_req *tbr;
- icmp_t *icmp;
- conn_t *connp = Q_TO_CONN(q);
-
- icmp = connp->conn_icmp;
-
- mp = allocb(sizeof (struct T_bind_req) + sizeof (sin6_t) + 1,
- BPRI_MED);
- if (!mp) {
+ rw_enter(&icmp->icmp_rwlock, RW_WRITER);
+ if (icmp->icmp_state == TS_UNBND) {
+ /*
+ * not yet bound - bind sent by icmp_bind_proto.
+ * NOTE(review): this early return does not free ire_mp; the
+ * icmp_bind_proto caller visible in this patch passes NULL -
+ * confirm no other path reaches here with a non-NULL ire_mp.
+ */
+ rw_exit(&icmp->icmp_rwlock);
return;
}
- mp->b_datap->db_type = M_PROTO;
- tbr = (struct T_bind_req *)mp->b_rptr;
- tbr->PRIM_type = O_T_BIND_REQ; /* change to T_BIND_REQ ? */
- tbr->ADDR_offset = sizeof (struct T_bind_req);
-
- rw_enter(&icmp->icmp_rwlock, RW_WRITER);
- if (icmp->icmp_ipversion == IPV4_VERSION) {
- sin_t *sin;
+ ASSERT(icmp->icmp_pending_op != -1);
+ icmp->icmp_pending_op = -1;
- tbr->ADDR_length = sizeof (sin_t);
- sin = (sin_t *)&tbr[1];
- *sin = sin_null;
- sin->sin_family = AF_INET;
- mp->b_wptr = (uchar_t *)&sin[1];
+ if (error != 0) {
+ if (icmp->icmp_state == TS_DATA_XFER) {
+ /* Connect failed */
+ /* Revert back to the bound source */
+ icmp->icmp_v6src = icmp->icmp_bound_v6src;
+ icmp->icmp_state = TS_IDLE;
+ if (icmp->icmp_family == AF_INET6)
+ (void) icmp_build_hdrs(icmp);
+ } else {
+ V6_SET_ZERO(icmp->icmp_v6src);
+ V6_SET_ZERO(icmp->icmp_bound_v6src);
+ icmp->icmp_state = TS_UNBND;
+ if (icmp->icmp_family == AF_INET6)
+ (void) icmp_build_hdrs(icmp);
+ }
} else {
- sin6_t *sin6;
+ if (ire_mp != NULL && ire_mp->b_datap->db_type == IRE_DB_TYPE) {
+ ire_t *ire;
- ASSERT(icmp->icmp_ipversion == IPV6_VERSION);
- tbr->ADDR_length = sizeof (sin6_t);
- sin6 = (sin6_t *)&tbr[1];
- *sin6 = sin6_null;
- sin6->sin6_family = AF_INET6;
- mp->b_wptr = (uchar_t *)&sin6[1];
- }
+ ire = (ire_t *)ire_mp->b_rptr;
+ /*
+ * If a broadcast/multicast address was bound set
+ * the source address to 0.
+ * This ensures no datagrams with broadcast address
+ * as source address are emitted (which would violate
+ * RFC1122 - Hosts requirements)
+ * Note: we get IRE_BROADCAST for IPv6
+ * to "mark" a multicast local address.
+ */
- /* Place protocol type in the O_T_BIND_REQ following the address. */
- *mp->b_wptr++ = icmp->icmp_proto;
- rw_exit(&icmp->icmp_rwlock);
- /* Pass the O_T_BIND_REQ to ip. */
- if (icmp->icmp_family == AF_INET6)
- mp = ip_bind_v6(q, mp, connp, NULL);
- else
- mp = ip_bind_v4(q, mp, connp);
+ if (ire->ire_type == IRE_BROADCAST &&
+ icmp->icmp_state != TS_DATA_XFER) {
+ /*
+ * This was just a local bind to a
+ * MC/broadcast addr
+ */
+ V6_SET_ZERO(icmp->icmp_v6src);
+ if (icmp->icmp_family == AF_INET6)
+ (void) icmp_build_hdrs(icmp);
+ }
+ }
- /* The above return NULL if the bind needs to be deferred */
- if (mp != NULL)
- icmp_bind_result(connp, mp);
- else
- CONN_INC_REF(connp);
+ }
+ rw_exit(&icmp->icmp_rwlock);
+ if (ire_mp != NULL)
+ freeb(ire_mp);
}
/*
- * This is called from ip_wput_nondata to handle the results of a
- * deferred RAWIP bind. It is called once the bind has been completed.
+ * Bind to the protocol only.  No message is built any more: the IP bind
+ * routine for the endpoint's address family is called directly with the
+ * sin_null/sin6_null address, port 0 and no IRE request (NULL), and the
+ * result is then passed to rawip_post_ip_bind_connect().
+ * Returns the error value from the IP bind call.
 */
-void
-rawip_resume_bind(conn_t *connp, mblk_t *mp)
+static int
+icmp_bind_proto(conn_t *connp)
{
- ASSERT(connp != NULL && IPCL_IS_RAWIP(connp));
+ icmp_t *icmp;
+ int error;
+
+ icmp = connp->conn_icmp;
- icmp_bind_result(connp, mp);
+ if (icmp->icmp_family == AF_INET6)
+ error = ip_proto_bind_laddr_v6(connp, NULL, icmp->icmp_proto,
+ &sin6_null.sin6_addr, 0, B_TRUE);
+ else
+ error = ip_proto_bind_laddr_v4(connp, NULL, icmp->icmp_proto,
+ sin_null.sin_addr.s_addr, 0, B_TRUE);
- CONN_OPER_PENDING_DONE(connp);
+ rawip_post_ip_bind_connect(icmp, NULL, error);
+ return (error);
}
-/*
- * This routine handles each T_CONN_REQ message passed to icmp. It
- * associates a default destination address with the stream.
- *
- * This routine sends down a T_BIND_REQ to IP with the following mblks:
- * T_BIND_REQ - specifying local and remote address.
- * IRE_DB_REQ_TYPE - to get an IRE back containing ire_type and src
- * T_OK_ACK - for the T_CONN_REQ
- * T_CONN_CON - to keep the TPI user happy
- *
- * The connect completes in icmp_bind_result.
- * When a T_BIND_ACK is received information is extracted from the IRE
- * and the two appended messages are sent to the TPI user.
- * Should icmp_bind_result receive T_ERROR_ACK for the T_BIND_REQ it will
- * convert it to an error ack for the appropriate primitive.
- */
+/*
+ * Handle a T_CONN_REQ from the TPI user.
+ *
+ * The destination address is located inside the request, checked with
+ * proto_verify_ip_addr() and handed to rawip_do_connect(). A negative
+ * return from rawip_do_connect() is a negated TLI error, a positive one
+ * is an errno; either is turned into an error ack. On success a T_OK_ACK
+ * followed by a T_CONN_CON is sent upstream on conn_rq.
+ */
static void
-icmp_connect(queue_t *q, mblk_t *mp)
+icmp_tpi_connect(queue_t *q, mblk_t *mp)
{
- sin_t *sin;
- sin6_t *sin6;
- mblk_t *mp1, *mp2;
+ conn_t *connp = Q_TO_CONN(q);
struct T_conn_req *tcr;
icmp_t *icmp;
- ipaddr_t v4dst;
- in6_addr_t v6dst;
- uint32_t flowinfo;
- conn_t *connp = Q_TO_CONN(q);
+ struct sockaddr *sa;
+ socklen_t len;
+ int error;
icmp = connp->conn_icmp;
tcr = (struct T_conn_req *)mp->b_rptr;
@@ -533,54 +593,111 @@ icmp_connect(queue_t *q, mblk_t *mp)
return;
}
- switch (tcr->DEST_length) {
+ len = tcr->DEST_length;
+
+ switch (len) {
default:
icmp_err_ack(q, mp, TBADADDR, 0);
return;
-
case sizeof (sin_t):
- sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset,
+ sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
sizeof (sin_t));
- if (sin == NULL || !OK_32PTR((char *)sin)) {
- icmp_err_ack(q, mp, TSYSERR, EINVAL);
- return;
- }
- if (icmp->icmp_family != AF_INET ||
- sin->sin_family != AF_INET) {
- icmp_err_ack(q, mp, TSYSERR, EAFNOSUPPORT);
- return;
- }
- v4dst = sin->sin_addr.s_addr;
- IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
- ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
- icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH +
- icmp->icmp_ip_snd_options_len;
break;
-
case sizeof (sin6_t):
- sin6 = (sin6_t *)mi_offset_param(mp, tcr->DEST_offset,
- sizeof (sin6_t));
- if (sin6 == NULL || !OK_32PTR((char *)sin6)) {
- icmp_err_ack(q, mp, TSYSERR, EINVAL);
- return;
+ sa = (struct sockaddr *)mi_offset_param(mp,
+ tcr->DEST_offset, sizeof (sin6_t));
+ break;
+ }
+
+ error = proto_verify_ip_addr(icmp->icmp_family, sa, len);
+ if (error != 0) {
+ icmp_err_ack(q, mp, TSYSERR, error);
+ return;
+ }
+
+ error = rawip_do_connect(connp, sa, len);
+ if (error != 0) {
+ /* Negative values are negated TLI errors, positive ones errnos. */
+ if (error < 0) {
+ icmp_err_ack(q, mp, -error, 0);
+ } else {
+ icmp_err_ack(q, mp, 0, error);
}
- if (icmp->icmp_family != AF_INET6 ||
- sin6->sin6_family != AF_INET6) {
- icmp_err_ack(q, mp, TSYSERR, EAFNOSUPPORT);
+ } else {
+ mblk_t *mp1;
+
+ /*
+ * We have to send a connection confirmation to
+ * keep TLI happy. It is built before mp is recycled
+ * below because sa points into mp.
+ */
+ if (icmp->icmp_family == AF_INET) {
+ mp1 = mi_tpi_conn_con(NULL, (char *)sa,
+ sizeof (sin_t), NULL, 0);
+ } else {
+ ASSERT(icmp->icmp_family == AF_INET6);
+ mp1 = mi_tpi_conn_con(NULL, (char *)sa,
+ sizeof (sin6_t), NULL, 0);
+ }
+ if (mp1 == NULL) {
+ /*
+ * icmp_rwlock is not held here: rawip_do_connect()
+ * enters and exits it itself, so there must be no
+ * rw_exit() on this path.
+ */
+ icmp_err_ack(q, mp, TSYSERR, ENOMEM);
return;
}
- /* No support for mapped addresses on raw sockets */
- if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
- icmp_err_ack(q, mp, TSYSERR, EADDRNOTAVAIL);
+
+ /*
+ * Send ok_ack for T_CONN_REQ
+ */
+ mp = mi_tpi_ok_ack_alloc(mp);
+ if (mp == NULL) {
+ /*
+ * Unable to reuse the T_CONN_REQ for the ack.
+ * Hand mp1 to icmp_err_ack_prim() to carry the
+ * error ack; it must not be freed first, since
+ * that would pass a freed mblk down.
+ */
+ icmp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM);
return;
}
- v6dst = sin6->sin6_addr;
- ASSERT(icmp->icmp_ipversion == IPV6_VERSION);
- icmp->icmp_max_hdr_len = icmp->icmp_sticky_hdrs_len;
- flowinfo = sin6->sin6_flowinfo;
- break;
+ putnext(connp->conn_rq, mp);
+ putnext(connp->conn_rq, mp1);
+ }
+}
+
+static int
+rawip_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len)
+{
+ icmp_t *icmp;
+ sin_t *sin;
+ sin6_t *sin6;
+ mblk_t *ire_mp;
+ int error;
+ ipaddr_t v4dst;
+ in6_addr_t v6dst;
+
+ icmp = connp->conn_icmp;
+
+ if (sa == NULL || !OK_32PTR((char *)sa)) {
+ return (EINVAL);
+ }
+
+ ire_mp = allocb(sizeof (ire_t), BPRI_HI);
+ if (ire_mp == NULL)
+ return (ENOMEM);
+ DB_TYPE(ire_mp) = IRE_DB_REQ_TYPE;
+
+
+ ASSERT(sa != NULL && len != 0);
+
+ rw_enter(&icmp->icmp_rwlock, RW_WRITER);
+ if (icmp->icmp_state == TS_UNBND || icmp->icmp_pending_op != -1) {
+ rw_exit(&icmp->icmp_rwlock);
+ freeb(ire_mp);
+ return (-TOUTSTATE);
}
- if (icmp->icmp_ipversion == IPV4_VERSION) {
+
+ switch (len) {
+ case sizeof (sin_t):
+ sin = (sin_t *)sa;
+
+ ASSERT(icmp->icmp_family == AF_INET);
+ ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
+
+ v4dst = sin->sin_addr.s_addr;
/*
* Interpret a zero destination to mean loopback.
* Update the T_CONN_REQ (sin/sin6) since it is used to
@@ -588,15 +705,16 @@ icmp_connect(queue_t *q, mblk_t *mp)
*/
if (v4dst == INADDR_ANY) {
v4dst = htonl(INADDR_LOOPBACK);
- IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
- if (icmp->icmp_family == AF_INET) {
- sin->sin_addr.s_addr = v4dst;
- } else {
- sin6->sin6_addr = v6dst;
- }
}
- icmp->icmp_v6dst = v6dst;
- icmp->icmp_flowinfo = 0;
+
+ IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
+ ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
+ icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH +
+ icmp->icmp_ip_snd_options_len;
+ icmp->icmp_v6dst.sin6_addr = v6dst;
+ icmp->icmp_v6dst.sin6_family = AF_INET6;
+ icmp->icmp_v6dst.sin6_flowinfo = 0;
+ icmp->icmp_v6dst.sin6_port = 0;
/*
* If the destination address is multicast and
@@ -610,35 +728,42 @@ icmp_connect(queue_t *q, mblk_t *mp)
IN6_IPADDR_TO_V4MAPPED(icmp->icmp_multicast_if_addr,
&icmp->icmp_v6src);
}
- } else {
+ break;
+ case sizeof (sin6_t):
+ sin6 = (sin6_t *)sa;
+
+ /* No support for mapped addresses on raw sockets */
+ if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
+ rw_exit(&icmp->icmp_rwlock);
+ freeb(ire_mp);
+ return (EADDRNOTAVAIL);
+ }
+
ASSERT(icmp->icmp_ipversion == IPV6_VERSION);
+ ASSERT(icmp->icmp_family == AF_INET6);
+
+ icmp->icmp_max_hdr_len = icmp->icmp_sticky_hdrs_len;
+
+ icmp->icmp_v6dst = *sin6;
+ icmp->icmp_v6dst.sin6_port = 0;
+
/*
* Interpret a zero destination to mean loopback.
* Update the T_CONN_REQ (sin/sin6) since it is used to
* generate the T_CONN_CON.
*/
- if (IN6_IS_ADDR_UNSPECIFIED(&v6dst)) {
- v6dst = ipv6_loopback;
- sin6->sin6_addr = v6dst;
+ if (IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6dst.sin6_addr)) {
+ icmp->icmp_v6dst.sin6_addr = ipv6_loopback;
}
- icmp->icmp_v6dst = v6dst;
- icmp->icmp_flowinfo = flowinfo;
/*
* If the destination address is multicast and
* an outgoing multicast interface has been set,
* then the ip bind logic will pick the correct source
* address (i.e. matching the outgoing multicast interface).
*/
+ break;
}
- rw_enter(&icmp->icmp_rwlock, RW_WRITER);
- if (icmp->icmp_state == TS_UNBND || icmp->icmp_pending_op != -1) {
- rw_exit(&icmp->icmp_rwlock);
- (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
- "icmp_connect: bad state, %d", icmp->icmp_state);
- icmp_err_ack(q, mp, TOUTSTATE, 0);
- return;
- }
icmp->icmp_pending_op = T_CONN_REQ;
if (icmp->icmp_state == TS_DATA_XFER) {
@@ -647,74 +772,22 @@ icmp_connect(queue_t *q, mblk_t *mp)
icmp->icmp_state = TS_IDLE;
}
- /*
- * Send down bind to IP to verify that there is a route
- * and to determine the source address.
- * This will come back as T_BIND_ACK with an IRE_DB_TYPE in rput.
- */
- if (icmp->icmp_family == AF_INET) {
- mp1 = icmp_ip_bind_mp(icmp, O_T_BIND_REQ, sizeof (ipa_conn_t),
- sin->sin_port);
- } else {
- ASSERT(icmp->icmp_family == AF_INET6);
- mp1 = icmp_ip_bind_mp(icmp, O_T_BIND_REQ, sizeof (ipa6_conn_t),
- sin6->sin6_port);
- }
- if (mp1 == NULL) {
- icmp->icmp_pending_op = -1;
- rw_exit(&icmp->icmp_rwlock);
- icmp_err_ack(q, mp, TSYSERR, ENOMEM);
- return;
- }
-
- /*
- * We also have to send a connection confirmation to
- * keep TLI happy. Prepare it for icmp_bind_result.
- */
- if (icmp->icmp_family == AF_INET) {
- mp2 = mi_tpi_conn_con(NULL, (char *)sin, sizeof (*sin), NULL,
- 0);
- } else {
- ASSERT(icmp->icmp_family == AF_INET6);
- mp2 = mi_tpi_conn_con(NULL, (char *)sin6, sizeof (*sin6), NULL,
- 0);
- }
- if (mp2 == NULL) {
- freemsg(mp1);
- icmp->icmp_pending_op = -1;
- rw_exit(&icmp->icmp_rwlock);
- icmp_err_ack(q, mp, TSYSERR, ENOMEM);
- return;
- }
-
- mp = mi_tpi_ok_ack_alloc(mp);
- if (mp == NULL) {
- /* Unable to reuse the T_CONN_REQ for the ack. */
- freemsg(mp2);
- icmp->icmp_pending_op = -1;
- rw_exit(&icmp->icmp_rwlock);
- icmp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM);
- return;
- }
-
icmp->icmp_state = TS_DATA_XFER;
rw_exit(&icmp->icmp_rwlock);
- /* Hang onto the T_OK_ACK and T_CONN_CON for later. */
- linkb(mp1, mp);
- linkb(mp1, mp2);
-
- mblk_setcred(mp1, connp->conn_cred);
- if (icmp->icmp_family == AF_INET)
- mp1 = ip_bind_v4(q, mp1, connp);
- else
- mp1 = ip_bind_v6(q, mp1, connp, NULL);
-
- /* The above return NULL if the bind needs to be deferred */
- if (mp1 != NULL)
- icmp_bind_result(connp, mp1);
- else
- CONN_INC_REF(connp);
+ if (icmp->icmp_family == AF_INET6) {
+ error = ip_proto_bind_connected_v6(connp, &ire_mp,
+ icmp->icmp_proto, &icmp->icmp_v6src, 0,
+ &icmp->icmp_v6dst.sin6_addr,
+ NULL, sin6->sin6_port, B_TRUE, B_TRUE);
+ } else {
+ error = ip_proto_bind_connected_v4(connp, &ire_mp,
+ icmp->icmp_proto, &V4_PART_OF_V6(icmp->icmp_v6src), 0,
+ V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr), sin->sin_port,
+ B_TRUE, B_TRUE);
+ }
+ rawip_post_ip_bind_connect(icmp, ire_mp, error);
+ return (error);
}
static void
@@ -733,6 +806,7 @@ icmp_close_free(conn_t *connp)
kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t));
icmp->icmp_filter = NULL;
}
+
/* Free memory associated with sticky options */
if (icmp->icmp_sticky_hdrs_len != 0) {
kmem_free(icmp->icmp_sticky_hdrs,
@@ -754,16 +828,18 @@ icmp_close_free(conn_t *connp)
}
static int
-icmp_close(queue_t *q)
+rawip_do_close(conn_t *connp)
{
- conn_t *connp = (conn_t *)q->q_ptr;
-
ASSERT(connp != NULL && IPCL_IS_RAWIP(connp));
ip_quiesce_conn(connp);
- qprocsoff(connp->conn_rq);
+ if (!IPCL_IS_NONSTR(connp)) {
+ qprocsoff(connp->conn_rq);
+ }
+ ASSERT(connp->conn_icmp->icmp_fallback_queue_head == NULL &&
+ connp->conn_icmp->icmp_fallback_queue_tail == NULL);
icmp_close_free(connp);
/*
@@ -778,11 +854,36 @@ icmp_close(queue_t *q)
*/
ASSERT(connp->conn_ref == 1);
- inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
+ if (!IPCL_IS_NONSTR(connp)) {
+ inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
+ } else {
+ ip_close_helper_stream(connp);
+ }
connp->conn_ref--;
ipcl_conn_destroy(connp);
+ return (0);
+}
+
+/*
+ * STREAMS close for icmp. A stream opened only as a fallback target
+ * (SO_FALLBACK) has no conn attached, so just its minor number is
+ * released; otherwise the conn is torn down by rawip_do_close().
+ * Always returns 0.
+ */
+static int
+icmp_close(queue_t *q, int flags)
+{
+ if ((flags & SO_FALLBACK) == 0) {
+ /* Ordinary close of a stream that owns a conn. */
+ (void) rawip_do_close(Q_TO_CONN(q));
+ } else {
+ /*
+ * stream is being closed while in fallback
+ * simply free the resources that were allocated
+ */
+ inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr));
+ qprocsoff(q);
+ }
+
q->q_ptr = WR(q)->q_ptr = NULL;
return (0);
}
@@ -793,88 +894,102 @@ icmp_close(queue_t *q)
* in sending a T_BIND_REQ to IP to restore the binding to just
* the local address.
*
- * This routine sends down a T_BIND_REQ to IP with the following mblks:
- * T_BIND_REQ - specifying just the local address.
- * T_OK_ACK - for the T_DISCON_REQ
- *
- * The disconnect completes in icmp_bind_result.
- * When a T_BIND_ACK is received the appended T_OK_ACK is sent to the TPI user.
- * Should icmp_bind_result receive T_ERROR_ACK for the T_BIND_REQ it will
- * convert it to an error ack for the appropriate primitive.
+ * The disconnect completes in rawip_post_ip_bind_connect.
*/
+/*
+ * Common disconnect code for the TPI and socket entry points.
+ * Returns 0 on success, a positive errno, or a negated TLI error
+ * (-TOUTSTATE when the endpoint is not in TS_DATA_XFER or another
+ * operation is still pending).
+ */
-static void
-icmp_disconnect(queue_t *q, mblk_t *mp)
+static int
+icmp_do_disconnect(conn_t *connp)
{
icmp_t *icmp;
- mblk_t *mp1;
- conn_t *connp = Q_TO_CONN(q);
+ mblk_t *ire_mp;
+ int error;
icmp = connp->conn_icmp;
+
+ /*
+ * Allocate the IRE mblk before any state is modified, as
+ * rawip_do_connect() does. Failing after icmp_pending_op has been
+ * set to T_DISCON_REQ would leave it set, and later connect and
+ * disconnect requests would then fail with TOUTSTATE.
+ */
+ ire_mp = allocb(sizeof (ire_t), BPRI_HI);
+ if (ire_mp == NULL)
+ return (ENOMEM);
+
rw_enter(&icmp->icmp_rwlock, RW_WRITER);
if (icmp->icmp_state != TS_DATA_XFER || icmp->icmp_pending_op != -1) {
rw_exit(&icmp->icmp_rwlock);
- (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
- "icmp_disconnect: bad state, %d", icmp->icmp_state);
- icmp_err_ack(q, mp, TOUTSTATE, 0);
- return;
+ freeb(ire_mp);
+ return (-TOUTSTATE);
}
icmp->icmp_pending_op = T_DISCON_REQ;
icmp->icmp_v6src = icmp->icmp_bound_v6src;
icmp->icmp_state = TS_IDLE;
- /*
- * Send down bind to IP to remove the full binding and revert
- * to the local address binding.
- */
- if (icmp->icmp_family == AF_INET) {
- mp1 = icmp_ip_bind_mp(icmp, O_T_BIND_REQ, sizeof (sin_t), 0);
- } else {
- ASSERT(icmp->icmp_family == AF_INET6);
- mp1 = icmp_ip_bind_mp(icmp, O_T_BIND_REQ, sizeof (sin6_t), 0);
- }
- if (mp1 == NULL) {
- icmp->icmp_pending_op = -1;
- rw_exit(&icmp->icmp_rwlock);
- icmp_err_ack(q, mp, TSYSERR, ENOMEM);
- return;
- }
- mp = mi_tpi_ok_ack_alloc(mp);
- if (mp == NULL) {
- /* Unable to reuse the T_DISCON_REQ for the ack. */
- icmp->icmp_pending_op = -1;
- rw_exit(&icmp->icmp_rwlock);
- icmp_err_ack_prim(q, mp1, T_DISCON_REQ, TSYSERR, ENOMEM);
- return;
- }
if (icmp->icmp_family == AF_INET6) {
- int error;
-
/* Rebuild the header template */
error = icmp_build_hdrs(icmp);
if (error != 0) {
icmp->icmp_pending_op = -1;
rw_exit(&icmp->icmp_rwlock);
- icmp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, error);
- freemsg(mp1);
- return;
+ freeb(ire_mp);
+ return (error);
}
}
rw_exit(&icmp->icmp_rwlock);
- /* Append the T_OK_ACK to the T_BIND_REQ for icmp_bind_result */
- linkb(mp1, mp);
- if (icmp->icmp_family == AF_INET6)
- mp1 = ip_bind_v6(q, mp1, connp, NULL);
- else
- mp1 = ip_bind_v4(q, mp1, connp);
+ /* Revert the IP binding to just the bound local address. */
+ if (icmp->icmp_family == AF_INET6) {
+ error = ip_proto_bind_laddr_v6(connp, &ire_mp, icmp->icmp_proto,
+ &icmp->icmp_bound_v6src, 0, B_TRUE);
+ } else {
- /* The above return NULL if the bind needs to be deferred */
- if (mp1 != NULL)
- icmp_bind_result(connp, mp1);
- else
- CONN_INC_REF(connp);
+ error = ip_proto_bind_laddr_v4(connp, &ire_mp, icmp->icmp_proto,
+ V4_PART_OF_V6(icmp->icmp_bound_v6src), 0, B_TRUE);
+ }
+
+ /*
+ * NOTE(review): rawip_post_ip_bind_connect() is presumed to finish
+ * the operation and dispose of ire_mp -- confirm against its body.
+ */
+ rawip_post_ip_bind_connect(icmp, ire_mp, error);
+
+ return (error);
+}
+
+/*
+ * Handle a T_DISCON_REQ from the TPI user: run icmp_do_disconnect() and
+ * reply with a T_OK_ACK, or with an error ack built from the returned
+ * errno (positive) or negated TLI error (negative).
+ */
+static void
+icmp_tpi_disconnect(queue_t *q, mblk_t *mp)
+{
+ conn_t *connp = Q_TO_CONN(q);
+ mblk_t *ackmp;
+ int error;
+
+ /*
+ * Allocate the largest primitive we need to send back
+ * T_error_ack is > than T_ok_ack
+ *
+ * The result goes into a separate pointer so the request is not
+ * lost when the reallocation fails; overwriting mp would pass NULL
+ * to icmp_err_ack_prim().
+ * NOTE(review): this relies on reallocb() leaving the original
+ * mblk untouched when it returns NULL -- confirm.
+ */
+ ackmp = reallocb(mp, sizeof (struct T_error_ack), 1);
+ if (ackmp == NULL) {
+ /* Unable to reuse the T_DISCON_REQ for the ack. */
+ icmp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, ENOMEM);
+ return;
+ }
+ mp = ackmp;
+
+ error = icmp_do_disconnect(connp);
+
+ if (error != 0) {
+ if (error > 0) {
+ icmp_err_ack(q, mp, 0, error);
+ } else {
+ icmp_err_ack(q, mp, -error, 0);
+ }
+ } else {
+ /* mp was sized for the larger T_error_ack above. */
+ mp = mi_tpi_ok_ack_alloc(mp);
+ ASSERT(mp != NULL);
+ qreply(q, mp);
+ }
+}
+
+/*
+ * Disconnect entry point that returns an errno: clears icmp_dgram_errind,
+ * runs icmp_do_disconnect() and converts a negated TLI error into an
+ * errno with proto_tlitosyserr().
+ */
+static int
+icmp_disconnect(conn_t *connp)
+{
+ int rv;
+
+ connp->conn_icmp->icmp_dgram_errind = B_FALSE;
+
+ rv = icmp_do_disconnect(connp);
+
+ return ((rv < 0) ? proto_tlitosyserr(-rv) : rv);
}
/* This routine creates a T_ERROR_ACK message and passes it upstream. */
@@ -905,22 +1020,20 @@ icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive,
/*
* icmp_icmp_error is called by icmp_input to process ICMP
* messages passed up by IP.
- * Generates the appropriate T_UDERROR_IND for permanent
- * (non-transient) errors.
+ * Reports the appropriate error for permanent (non-transient) failures.
* Assumes that IP has pulled up everything up to and including
* the ICMP header.
*/
static void
-icmp_icmp_error(queue_t *q, mblk_t *mp)
+icmp_icmp_error(conn_t *connp, mblk_t *mp)
{
icmph_t *icmph;
ipha_t *ipha;
int iph_hdr_length;
sin_t sin;
- sin6_t sin6;
mblk_t *mp1;
int error = 0;
- icmp_t *icmp = Q_TO_ICMP(q);
+ icmp_t *icmp = connp->conn_icmp;
ipha = (ipha_t *)mp->b_rptr;
@@ -928,10 +1041,19 @@ icmp_icmp_error(queue_t *q, mblk_t *mp)
if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
- icmp_icmp_error_ipv6(q, mp);
+ icmp_icmp_error_ipv6(connp, mp);
return;
}
- ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
+
+ /*
+ * icmp does not support v4 mapped addresses
+ * so we can never be here for a V6 socket
+ * i.e. icmp_family == AF_INET6
+ */
+ ASSERT((IPH_HDR_VERSION(ipha) == IPV4_VERSION) &&
+ (icmp->icmp_family == AF_INET));
+
+ ASSERT(icmp->icmp_family == AF_INET);
/* Skip past the outer IP and ICMP headers */
iph_hdr_length = IPH_HDR_LENGTH(ipha);
@@ -974,25 +1096,32 @@ icmp_icmp_error(queue_t *q, mblk_t *mp)
return;
}
- switch (icmp->icmp_family) {
- case AF_INET:
- sin = sin_null;
- sin.sin_family = AF_INET;
- sin.sin_addr.s_addr = ipha->ipha_dst;
- mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL, 0,
- error);
- break;
- case AF_INET6:
- sin6 = sin6_null;
- sin6.sin6_family = AF_INET6;
- IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &sin6.sin6_addr);
+ sin = sin_null;
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = ipha->ipha_dst;
+ if (IPCL_IS_NONSTR(connp)) {
+ rw_enter(&icmp->icmp_rwlock, RW_WRITER);
+ if (icmp->icmp_state == TS_DATA_XFER) {
+ if (sin.sin_addr.s_addr ==
+ V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr)) {
+ rw_exit(&icmp->icmp_rwlock);
+ (*connp->conn_upcalls->su_set_error)
+ (connp->conn_upper_handle, error);
+ goto done;
+ }
+ } else {
+ icmp->icmp_delayed_error = error;
+ *((sin_t *)&icmp->icmp_delayed_addr) = sin;
+ }
+ rw_exit(&icmp->icmp_rwlock);
+ } else {
- mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t),
- NULL, 0, error);
- break;
+ mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL,
+ 0, error);
+ if (mp1 != NULL)
+ putnext(connp->conn_rq, mp1);
}
- if (mp1)
- putnext(q, mp1);
+done:
freemsg(mp);
}
@@ -1004,7 +1133,7 @@ icmp_icmp_error(queue_t *q, mblk_t *mp)
* as the ICMPv6 header.
*/
static void
-icmp_icmp_error_ipv6(queue_t *q, mblk_t *mp)
+icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp)
{
icmp6_t *icmp6;
ip6_t *ip6h, *outer_ip6h;
@@ -1013,7 +1142,7 @@ icmp_icmp_error_ipv6(queue_t *q, mblk_t *mp)
sin6_t sin6;
mblk_t *mp1;
int error = 0;
- icmp_t *icmp = Q_TO_ICMP(q);
+ icmp_t *icmp = connp->conn_icmp;
outer_ip6h = (ip6_t *)mp->b_rptr;
if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6)
@@ -1085,7 +1214,7 @@ icmp_icmp_error_ipv6(queue_t *q, mblk_t *mp)
sin6 = (sin6_t *)&tudi[1];
bzero(sin6, sizeof (sin6_t));
sin6->sin6_family = AF_INET6;
- sin6->sin6_addr = icmp->icmp_v6dst;
+ sin6->sin6_addr = icmp->icmp_v6dst.sin6_addr;
toh = (struct T_opthdr *)&sin6[1];
toh->level = IPPROTO_IPV6;
@@ -1103,7 +1232,14 @@ icmp_icmp_error_ipv6(queue_t *q, mblk_t *mp)
* message. Free it, then send our empty message.
*/
freemsg(mp);
- putnext(q, newmp);
+ if (!IPCL_IS_NONSTR(connp)) {
+ putnext(connp->conn_rq, newmp);
+ } else {
+ (*connp->conn_upcalls->su_recv)
+ (connp->conn_upper_handle, newmp, 0, 0, &error,
+ NULL);
+ ASSERT(error == 0);
+ }
return;
}
case ICMP6_TIME_EXCEEDED:
@@ -1138,10 +1274,29 @@ icmp_icmp_error_ipv6(queue_t *q, mblk_t *mp)
sin6.sin6_addr = ip6h->ip6_dst;
sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
- mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t), NULL, 0,
- error);
- if (mp1)
- putnext(q, mp1);
+ if (IPCL_IS_NONSTR(connp)) {
+ rw_enter(&icmp->icmp_rwlock, RW_WRITER);
+ if (icmp->icmp_state == TS_DATA_XFER) {
+ if (IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr,
+ &icmp->icmp_v6dst.sin6_addr)) {
+ rw_exit(&icmp->icmp_rwlock);
+ (*connp->conn_upcalls->su_set_error)
+ (connp->conn_upper_handle, error);
+ goto done;
+ }
+ } else {
+ icmp->icmp_delayed_error = error;
+ *((sin6_t *)&icmp->icmp_delayed_addr) = sin6;
+ }
+ rw_exit(&icmp->icmp_rwlock);
+ } else {
+
+ mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t),
+ NULL, 0, error);
+ if (mp1 != NULL)
+ putnext(connp->conn_rq, mp1);
+ }
+done:
freemsg(mp);
}
@@ -1249,6 +1404,18 @@ icmp_copy_info(struct T_info_ack *tap, icmp_t *icmp)
tap->OPT_size = icmp_max_optsize;
}
+/*
+ * Fill in a T_capability_ack: CAP_bits1 is cleared, and when the caller
+ * asked for TC1_INFO the INFO_ack member is filled by icmp_copy_info()
+ * and TC1_INFO is set in CAP_bits1.
+ */
+static void
+icmp_do_capability_ack(icmp_t *icmp, struct T_capability_ack *tcap,
+ t_uscalar_t cap_bits1)
+{
+ tcap->CAP_bits1 = 0;
+ if ((cap_bits1 & TC1_INFO) == 0)
+ return;
+
+ icmp_copy_info(&tcap->INFO_ack, icmp);
+ tcap->CAP_bits1 |= TC1_INFO;
+}
+
/*
* This routine responds to T_CAPABILITY_REQ messages. It is called by
* icmp_wput. Much of the T_CAPABILITY_ACK information is copied from
@@ -1270,12 +1437,8 @@ icmp_capability_req(queue_t *q, mblk_t *mp)
return;
tcap = (struct T_capability_ack *)mp->b_rptr;
- tcap->CAP_bits1 = 0;
- if (cap_bits1 & TC1_INFO) {
- icmp_copy_info(&tcap->INFO_ack, icmp);
- tcap->CAP_bits1 |= TC1_INFO;
- }
+ icmp_do_capability_ack(icmp, tcap, cap_bits1);
qreply(q, mp);
}
@@ -1298,182 +1461,131 @@ icmp_info_req(queue_t *q, mblk_t *mp)
qreply(q, mp);
}
-/*
- * IP recognizes seven kinds of bind requests:
- *
- * - A zero-length address binds only to the protocol number.
- *
- * - A 4-byte address is treated as a request to
- * validate that the address is a valid local IPv4
- * address, appropriate for an application to bind to.
- * IP does the verification, but does not make any note
- * of the address at this time.
- *
- * - A 16-byte address contains is treated as a request
- * to validate a local IPv6 address, as the 4-byte
- * address case above.
- *
- * - A 16-byte sockaddr_in to validate the local IPv4 address and also
- * use it for the inbound fanout of packets.
- *
- * - A 24-byte sockaddr_in6 to validate the local IPv6 address and also
- * use it for the inbound fanout of packets.
- *
- * - A 12-byte address (ipa_conn_t) containing complete IPv4 fanout
- * information consisting of local and remote addresses
- * and ports (unused for raw sockets). In this case, the addresses are both
- * validated as appropriate for this operation, and, if
- * so, the information is retained for use in the
- * inbound fanout.
- *
- * - A 36-byte address address (ipa6_conn_t) containing complete IPv6
- * fanout information, like the 12-byte case above.
- *
- * IP will also fill in the IRE request mblk with information
- * regarding our peer. In all cases, we notify IP of our protocol
- * type by appending a single protocol byte to the bind request.
- */
-static mblk_t *
-icmp_ip_bind_mp(icmp_t *icmp, t_scalar_t bind_prim, t_scalar_t addr_length,
- in_port_t fport)
+/*
+ * Common STREAMS open code for icmp_openv4() and icmp_openv6(); family is
+ * AF_INET or AF_INET6.
+ *
+ * Allocates a minor number, then either sets the queue up as a bare
+ * fallback stream (flag & SO_FALLBACK) or creates a conn via icmp_open()
+ * and attaches it to the queue pair. Returns 0 on success or an errno.
+ */
+static int
+icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
+ int family)
{
- char *cp;
- mblk_t *mp;
- struct T_bind_req *tbr;
- ipa_conn_t *ac;
- ipa6_conn_t *ac6;
- sin_t *sin;
- sin6_t *sin6;
+ conn_t *connp;
+ dev_t conn_dev;
+ icmp_stack_t *is;
+ int error;
- ASSERT(bind_prim == O_T_BIND_REQ || bind_prim == T_BIND_REQ);
- ASSERT(RW_LOCK_HELD(&icmp->icmp_rwlock));
- mp = allocb(sizeof (*tbr) + addr_length + 1, BPRI_HI);
- if (mp == NULL)
- return (NULL);
- mp->b_datap->db_type = M_PROTO;
- tbr = (struct T_bind_req *)mp->b_rptr;
- tbr->PRIM_type = bind_prim;
- tbr->ADDR_offset = sizeof (*tbr);
- tbr->CONIND_number = 0;
- tbr->ADDR_length = addr_length;
- cp = (char *)&tbr[1];
- switch (addr_length) {
- case sizeof (ipa_conn_t):
- ASSERT(icmp->icmp_family == AF_INET);
- /* Append a request for an IRE */
- mp->b_cont = allocb(sizeof (ire_t), BPRI_HI);
- if (mp->b_cont == NULL) {
- freemsg(mp);
- return (NULL);
- }
- mp->b_cont->b_wptr += sizeof (ire_t);
- mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE;
+ conn_dev = 0;
- /* cp known to be 32 bit aligned */
- ac = (ipa_conn_t *)cp;
- ac->ac_laddr = V4_PART_OF_V6(icmp->icmp_v6src);
- ac->ac_faddr = V4_PART_OF_V6(icmp->icmp_v6dst);
- ac->ac_fport = fport;
- ac->ac_lport = 0;
- break;
+ /* If the stream is already open, return immediately. */
+ if (q->q_ptr != NULL)
+ return (0);
- case sizeof (ipa6_conn_t):
- ASSERT(icmp->icmp_family == AF_INET6);
- /* Append a request for an IRE */
- mp->b_cont = allocb(sizeof (ire_t), BPRI_HI);
- if (mp->b_cont == NULL) {
- freemsg(mp);
- return (NULL);
- }
- mp->b_cont->b_wptr += sizeof (ire_t);
- mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE;
+ if (sflag == MODOPEN)
+ return (EINVAL);
- /* cp known to be 32 bit aligned */
- ac6 = (ipa6_conn_t *)cp;
- ac6->ac6_laddr = icmp->icmp_v6src;
- ac6->ac6_faddr = icmp->icmp_v6dst;
- ac6->ac6_fport = fport;
- ac6->ac6_lport = 0;
- break;
+ /*
+ * Since ICMP is not used so heavily, allocating from the small
+ * arena should be sufficient.
+ */
+ if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) {
+ return (EBUSY);
+ }
- case sizeof (sin_t):
- ASSERT(icmp->icmp_family == AF_INET);
- /* Append a request for an IRE */
- mp->b_cont = allocb(sizeof (ire_t), BPRI_HI);
- if (!mp->b_cont) {
- freemsg(mp);
- return (NULL);
- }
- mp->b_cont->b_wptr += sizeof (ire_t);
- mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE;
+ if (flag & SO_FALLBACK) {
+ /*
+ * Non streams socket needs a stream to fallback to.
+ * No conn is attached: the read queue records the minor
+ * number and the write queue the arena it came from.
+ */
+ RD(q)->q_ptr = (void *)conn_dev;
+ WR(q)->q_qinfo = &icmp_fallback_sock_winit;
+ WR(q)->q_ptr = (void *)ip_minor_arena_sa;
+ qprocson(q);
+ return (0);
+ }
- sin = (sin_t *)cp;
- *sin = sin_null;
- sin->sin_family = AF_INET;
- sin->sin_addr.s_addr = V4_PART_OF_V6(icmp->icmp_bound_v6src);
- break;
+ connp = icmp_open(family, credp, &error, KM_SLEEP);
+ if (connp == NULL) {
+ ASSERT(error != 0);
+ /* connp is NULL here; release the minor via the local copy. */
+ inet_minor_free(ip_minor_arena_sa, conn_dev);
+ return (error);
+ }
- case sizeof (sin6_t):
- ASSERT(icmp->icmp_family == AF_INET6);
- /* Append a request for an IRE */
- mp->b_cont = allocb(sizeof (ire_t), BPRI_HI);
- if (!mp->b_cont) {
- freemsg(mp);
- return (NULL);
- }
- mp->b_cont->b_wptr += sizeof (ire_t);
- mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE;
+ *devp = makedevice(getemajor(*devp), (minor_t)conn_dev);
+ connp->conn_dev = conn_dev;
+ connp->conn_minor_arena = ip_minor_arena_sa;
- sin6 = (sin6_t *)cp;
- *sin6 = sin6_null;
- sin6->sin6_family = AF_INET6;
- sin6->sin6_addr = icmp->icmp_bound_v6src;
- break;
+ is = connp->conn_icmp->icmp_is;
+
+ /*
+ * Initialize the icmp_t structure for this stream.
+ */
+ q->q_ptr = connp;
+ WR(q)->q_ptr = connp;
+ connp->conn_rq = q;
+ connp->conn_wq = WR(q);
+
+ if (connp->conn_icmp->icmp_family == AF_INET6) {
+ /* Build initial header template for transmit */
+ rw_enter(&connp->conn_icmp->icmp_rwlock, RW_WRITER);
+ if ((error = icmp_build_hdrs(connp->conn_icmp)) != 0) {
+ rw_exit(&connp->conn_icmp->icmp_rwlock);
+ inet_minor_free(ip_minor_arena_sa, connp->conn_dev);
+ ipcl_conn_destroy(connp);
+ return (error);
+ }
+ rw_exit(&connp->conn_icmp->icmp_rwlock);
}
- /* Add protocol number to end */
- cp[addr_length] = icmp->icmp_proto;
- mp->b_wptr = (uchar_t *)&cp[addr_length + 1];
- return (mp);
+
+
+ q->q_hiwat = is->is_recv_hiwat;
+ WR(q)->q_hiwat = is->is_xmit_hiwat;
+ WR(q)->q_lowat = is->is_xmit_lowat;
+
+ qprocson(q);
+
+ /* Set the Stream head write offset. */
+ (void) proto_set_tx_wroff(q, connp,
+ connp->conn_icmp->icmp_max_hdr_len + is->is_wroff_extra);
+ (void) proto_set_rx_hiwat(connp->conn_rq, connp, q->q_hiwat);
+
+ /* The conn is fully set up; clear the incipient flag. */
+ mutex_enter(&connp->conn_lock);
+ connp->conn_state_flags &= ~CONN_INCIPIENT;
+ mutex_exit(&connp->conn_lock);
+
+ return (0);
}
-/* For /dev/icmp aka AF_INET open */
+/*
+ * For /dev/icmp4 aka AF_INET open.
+ * Thin wrapper: forwards its arguments unchanged to icmp_tpi_open() with
+ * family AF_INET and returns its result.
+ */
static int
icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
{
- return (icmp_open(q, devp, flag, sflag, credp, B_FALSE));
+ return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET));
}
/* For /dev/icmp6 aka AF_INET6 open */
+/*
+ * Thin wrapper: forwards its arguments unchanged to icmp_tpi_open() with
+ * family AF_INET6 and returns its result.
+ */
static int
icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
{
- return (icmp_open(q, devp, flag, sflag, credp, B_TRUE));
+ return (icmp_tpi_open(q, devp, flag, sflag, credp, AF_INET6));
}
/*
* This is the open routine for icmp. It allocates a icmp_t structure for
* the stream and, on the first open of the module, creates an ND table.
*/
-/*ARGSUSED2*/
-static int
-icmp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
- boolean_t isv6)
+/* ARGSUSED */
+static conn_t *
+icmp_open(int family, cred_t *credp, int *err, int flags)
{
- int err;
icmp_t *icmp;
conn_t *connp;
- dev_t conn_dev;
zoneid_t zoneid;
netstack_t *ns;
icmp_stack_t *is;
+ boolean_t isv6 = B_FALSE;
- /* If the stream is already open, return immediately. */
- if (q->q_ptr != NULL)
- return (0);
-
- if (sflag == MODOPEN)
- return (EINVAL);
+ *err = secpolicy_net_icmpaccess(credp);
+ if (*err != 0)
+ return (NULL);
+ if (family == AF_INET6)
+ isv6 = B_TRUE;
ns = netstack_find_by_cred(credp);
ASSERT(ns != NULL);
is = ns->netstack_icmp;
@@ -1488,20 +1600,11 @@ icmp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
else
zoneid = crgetzoneid(credp);
- /*
- * Since ICMP is not used so heavily, allocating from the small
- * arena should be sufficient.
- */
- if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) {
- netstack_rele(ns);
- return (EBUSY);
- }
- *devp = makedevice(getemajor(*devp), (minor_t)conn_dev);
+ ASSERT(flags == KM_SLEEP || flags == KM_NOSLEEP);
- connp = ipcl_conn_create(IPCL_RAWIPCONN, KM_SLEEP, ns);
- connp->conn_dev = conn_dev;
- connp->conn_minor_arena = ip_minor_arena_sa;
+ connp = ipcl_conn_create(IPCL_RAWIPCONN, flags, ns);
icmp = connp->conn_icmp;
+ icmp->icmp_v6dst = sin6_null;
/*
* ipcl_conn_create did a netstack_hold. Undo the hold that was
@@ -1509,14 +1612,6 @@ icmp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
*/
netstack_rele(ns);
- /*
- * Initialize the icmp_t structure for this stream.
- */
- q->q_ptr = connp;
- WR(q)->q_ptr = connp;
- connp->conn_rq = q;
- connp->conn_wq = WR(q);
-
rw_enter(&icmp->icmp_rwlock, RW_WRITER);
ASSERT(connp->conn_ulp == IPPROTO_ICMP);
ASSERT(connp->conn_icmp == icmp);
@@ -1561,37 +1656,14 @@ icmp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
icmp->icmp_is = is;
- q->q_hiwat = is->is_recv_hiwat;
- WR(q)->q_hiwat = is->is_xmit_hiwat;
- WR(q)->q_lowat = is->is_xmit_lowat;
-
connp->conn_recv = icmp_input;
crhold(credp);
connp->conn_cred = credp;
- mutex_enter(&connp->conn_lock);
- connp->conn_state_flags &= ~CONN_INCIPIENT;
- mutex_exit(&connp->conn_lock);
-
- qprocson(q);
-
- if (icmp->icmp_family == AF_INET6) {
- /* Build initial header template for transmit */
- if ((err = icmp_build_hdrs(icmp)) != 0) {
- rw_exit(&icmp->icmp_rwlock);
- qprocsoff(q);
- ipcl_conn_destroy(connp);
- return (err);
- }
- }
rw_exit(&icmp->icmp_rwlock);
- /* Set the Stream head write offset. */
- (void) mi_set_sth_wroff(q,
- icmp->icmp_max_hdr_len + is->is_wroff_extra);
- (void) mi_set_sth_hiwat(q, q->q_hiwat);
-
- return (0);
+ connp->conn_flow_cntrld = B_FALSE;
+ return (connp);
}
/*
@@ -1657,14 +1729,15 @@ icmp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
* It returns the size of the option retrieved.
*/
int
-icmp_opt_get_locked(queue_t *q, int level, int name, uchar_t *ptr)
+icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
{
- conn_t *connp = Q_TO_CONN(q);
- icmp_t *icmp = connp->conn_icmp;
- icmp_stack_t *is = icmp->icmp_is;
- int *i1 = (int *)ptr;
+ icmp_t *icmp = connp->conn_icmp;
+ icmp_stack_t *is = icmp->icmp_is;
+ int *i1 = (int *)ptr;
ip6_pkt_t *ipp = &icmp->icmp_sticky_ipp;
+ int ret = 0;
+ ASSERT(RW_READ_HELD(&icmp->icmp_rwlock));
switch (level) {
case SOL_SOCKET:
switch (name) {
@@ -1696,12 +1769,12 @@ icmp_opt_get_locked(queue_t *q, int level, int name, uchar_t *ptr)
break;
case SO_SNDBUF:
- ASSERT(q->q_hiwat <= INT_MAX);
- *i1 = (int)q->q_hiwat;
+ ASSERT(icmp->icmp_xmit_hiwat <= INT_MAX);
+ *i1 = icmp->icmp_xmit_hiwat;
break;
case SO_RCVBUF:
- ASSERT(RD(q)->q_hiwat <= INT_MAX);
- *i1 = (int)RD(q)->q_hiwat;
+ ASSERT(icmp->icmp_recv_hiwat <= INT_MAX);
+ *i1 = icmp->icmp_recv_hiwat;
break;
case SO_DGRAM_ERRIND:
*i1 = icmp->icmp_dgram_errind;
@@ -1726,21 +1799,25 @@ icmp_opt_get_locked(queue_t *q, int level, int name, uchar_t *ptr)
* case SO_ALLZONES:
*/
default:
- return (-1);
+ ret = -1;
+ goto done;
}
break;
case IPPROTO_IP:
/*
* Only allow IPv4 option processing on IPv4 sockets.
*/
- if (icmp->icmp_family != AF_INET)
- return (-1);
+ if (icmp->icmp_family != AF_INET) {
+ ret = -1;
+ goto done;
+ }
switch (name) {
case IP_OPTIONS:
case T_IP_OPTIONS:
/* Options are passed up with each packet */
- return (0);
+ ret = 0;
+ goto done;
case IP_HDRINCL:
*i1 = (int)icmp->icmp_hdrincl;
break;
@@ -1754,13 +1831,16 @@ icmp_opt_get_locked(queue_t *q, int level, int name, uchar_t *ptr)
case IP_MULTICAST_IF:
/* 0 address if not set */
*(ipaddr_t *)ptr = icmp->icmp_multicast_if_addr;
- return (sizeof (ipaddr_t));
+ ret = sizeof (ipaddr_t);
+ goto done;
case IP_MULTICAST_TTL:
*(uchar_t *)ptr = icmp->icmp_multicast_ttl;
- return (sizeof (uchar_t));
+ ret = sizeof (uchar_t);
+ goto done;
case IP_MULTICAST_LOOP:
*ptr = connp->conn_multicast_loop;
- return (sizeof (uint8_t));
+ ret = sizeof (uint8_t);
+ goto done;
case IP_BOUND_IF:
/* Zero if not set */
*i1 = icmp->icmp_bound_if;
@@ -1768,12 +1848,12 @@ icmp_opt_get_locked(queue_t *q, int level, int name, uchar_t *ptr)
case IP_UNSPEC_SRC:
*ptr = icmp->icmp_unspec_source;
break; /* goto sizeof (int) option return */
- case IP_BROADCAST_TTL:
- *(uchar_t *)ptr = connp->conn_broadcast_ttl;
- return (sizeof (uchar_t));
case IP_RECVIF:
*ptr = icmp->icmp_recvif;
break; /* goto sizeof (int) option return */
+ case IP_BROADCAST_TTL:
+ *(uchar_t *)ptr = connp->conn_broadcast_ttl;
+ return (sizeof (uchar_t));
case IP_RECVPKTINFO:
/*
* This also handles IP_PKTINFO.
@@ -1784,7 +1864,8 @@ icmp_opt_get_locked(queue_t *q, int level, int name, uchar_t *ptr)
* error for IP_PKTINFO as it's not supported as a
* sticky option.
*/
- return (-EINVAL);
+ ret = -EINVAL;
+ goto done;
/*
* Cannot "get" the value of following options
* at this level. Action is same as "default" to
@@ -1815,15 +1896,18 @@ icmp_opt_get_locked(queue_t *q, int level, int name, uchar_t *ptr)
* case IP_NEXTHOP:
*/
default:
- return (-1);
+ ret = -1;
+ goto done;
}
break;
case IPPROTO_IPV6:
/*
* Only allow IPv6 option processing on native IPv6 sockets.
*/
- if (icmp->icmp_family != AF_INET6)
- return (-1);
+ if (icmp->icmp_family != AF_INET6) {
+ ret = -1;
+ goto done;
+ }
switch (name) {
case IPV6_UNICAST_HOPS:
*i1 = (unsigned int)icmp->icmp_ttl;
@@ -1850,8 +1934,10 @@ icmp_opt_get_locked(queue_t *q, int level, int name, uchar_t *ptr)
* Return offset or -1 if no checksum offset.
* Does not apply to IPPROTO_ICMPV6
*/
- if (icmp->icmp_proto == IPPROTO_ICMPV6)
- return (-1);
+ if (icmp->icmp_proto == IPPROTO_ICMPV6) {
+ ret = -1;
+ goto done;
+ }
if (icmp->icmp_raw_checksum) {
*i1 = icmp->icmp_checksum_off;
@@ -1868,7 +1954,8 @@ icmp_opt_get_locked(queue_t *q, int level, int name, uchar_t *ptr)
case MCAST_JOIN_SOURCE_GROUP:
case MCAST_LEAVE_SOURCE_GROUP:
/* cannot "get" the value for these */
- return (-1);
+ ret = -1;
+ goto done;
case IPV6_RECVPKTINFO:
*i1 = icmp->icmp_ip_recvpktinfo;
break;
@@ -1912,7 +1999,8 @@ icmp_opt_get_locked(queue_t *q, int level, int name, uchar_t *ptr)
pkti->ipi6_addr = ipp->ipp_addr;
else
pkti->ipi6_addr = ipv6_all_zeros;
- return (sizeof (struct in6_pktinfo));
+ ret = sizeof (struct in6_pktinfo);
+ goto done;
}
case IPV6_NEXTHOP: {
sin6_t *sin6 = (sin6_t *)ptr;
@@ -1922,7 +2010,8 @@ icmp_opt_get_locked(queue_t *q, int level, int name, uchar_t *ptr)
*sin6 = sin6_null;
sin6->sin6_family = AF_INET6;
sin6->sin6_addr = ipp->ipp_nexthop;
- return (sizeof (sin6_t));
+ ret = (sizeof (sin6_t));
+ goto done;
}
case IPV6_HOPOPTS:
if (!(ipp->ipp_fields & IPPF_HOPOPTS))
@@ -1937,28 +2026,38 @@ icmp_opt_get_locked(queue_t *q, int level, int name, uchar_t *ptr)
ptr[1] = (ipp->ipp_hopoptslen -
icmp->icmp_label_len_v6 + 7) / 8 - 1;
}
- return (ipp->ipp_hopoptslen - icmp->icmp_label_len_v6);
+ ret = (ipp->ipp_hopoptslen - icmp->icmp_label_len_v6);
+ goto done;
case IPV6_RTHDRDSTOPTS:
if (!(ipp->ipp_fields & IPPF_RTDSTOPTS))
return (0);
bcopy(ipp->ipp_rtdstopts, ptr, ipp->ipp_rtdstoptslen);
- return (ipp->ipp_rtdstoptslen);
+ ret = ipp->ipp_rtdstoptslen;
+ goto done;
case IPV6_RTHDR:
if (!(ipp->ipp_fields & IPPF_RTHDR))
return (0);
bcopy(ipp->ipp_rthdr, ptr, ipp->ipp_rthdrlen);
- return (ipp->ipp_rthdrlen);
+ ret = ipp->ipp_rthdrlen;
+ goto done;
case IPV6_DSTOPTS:
- if (!(ipp->ipp_fields & IPPF_DSTOPTS))
- return (0);
+ if (!(ipp->ipp_fields & IPPF_DSTOPTS)) {
+ ret = 0;
+ goto done;
+ }
bcopy(ipp->ipp_dstopts, ptr, ipp->ipp_dstoptslen);
- return (ipp->ipp_dstoptslen);
+ ret = ipp->ipp_dstoptslen;
+ goto done;
case IPV6_PATHMTU:
- if (!(ipp->ipp_fields & IPPF_PATHMTU))
- return (0);
-
- return (ip_fill_mtuinfo(&icmp->icmp_v6dst, 0,
- (struct ip6_mtuinfo *)ptr, is->is_netstack));
+ if (!(ipp->ipp_fields & IPPF_PATHMTU)) {
+ ret = 0;
+ } else {
+ ret = ip_fill_mtuinfo(
+ &icmp->icmp_v6dst.sin6_addr, 0,
+ (struct ip6_mtuinfo *)ptr,
+ is->is_netstack);
+ }
+ goto done;
case IPV6_TCLASS:
if (ipp->ipp_fields & IPPF_TCLASS)
*i1 = ipp->ipp_tclass;
@@ -1967,18 +2066,24 @@ icmp_opt_get_locked(queue_t *q, int level, int name, uchar_t *ptr)
IPV6_DEFAULT_VERS_AND_FLOW);
break;
default:
- return (-1);
+ ret = -1;
+ goto done;
}
break;
case IPPROTO_ICMPV6:
/*
* Only allow IPv6 option processing on native IPv6 sockets.
*/
- if (icmp->icmp_family != AF_INET6)
- return (-1);
+ if (icmp->icmp_family != AF_INET6) {
+ ret = -1;
+ goto done;
+ }
- if (icmp->icmp_proto != IPPROTO_ICMPV6)
- return (-1);
+ /* ICMPv6-level options are only valid on IPPROTO_ICMPV6 sockets. */
+ if (icmp->icmp_proto != IPPROTO_ICMPV6) {
+ ret = -1;
+ goto done;
+ }
switch (name) {
case ICMP6_FILTER:
@@ -1989,14 +2091,19 @@ icmp_opt_get_locked(queue_t *q, int level, int name, uchar_t *ptr)
(void) bcopy(icmp->icmp_filter, ptr,
sizeof (icmp6_filter_t));
}
- return (sizeof (icmp6_filter_t));
+ ret = sizeof (icmp6_filter_t);
+ goto done;
default:
- return (-1);
+ ret = -1;
+ goto done;
}
default:
- return (-1);
+ ret = -1;
+ goto done;
}
- return (sizeof (int));
+ ret = sizeof (int);
+done:
+ return (ret);
}
/*
@@ -2004,84 +2111,36 @@ icmp_opt_get_locked(queue_t *q, int level, int name, uchar_t *ptr)
* It returns the size of the option retrieved.
*/
int
-icmp_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
+icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
{
- icmp_t *icmp = Q_TO_ICMP(q);
+ conn_t *connp = Q_TO_CONN(q); /* conn_t for this queue; icmp_opt_get() takes a conn_t */
+ icmp_t *icmp = connp->conn_icmp;
int err;
rw_enter(&icmp->icmp_rwlock, RW_READER);
- err = icmp_opt_get_locked(q, level, name, ptr);
+ err = icmp_opt_get(connp, level, name, ptr); /* runs with icmp_rwlock held as reader */
rw_exit(&icmp->icmp_rwlock);
return (err);
}
-
-/* This routine sets socket options. */
-/* ARGSUSED */
int
-icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name,
- uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
- void *thisdg_attrs, cred_t *cr, mblk_t *mblk)
+icmp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen,
+ uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, cred_t *cr,
+ void *thisdg_attrs, boolean_t checkonly)
{
- conn_t *connp = Q_TO_CONN(q);
- icmp_t *icmp = connp->conn_icmp;
- icmp_stack_t *is = icmp->icmp_is;
+
int *i1 = (int *)invalp;
boolean_t onoff = (*i1 == 0) ? 0 : 1;
- boolean_t checkonly;
+ icmp_t *icmp = connp->conn_icmp;
+ icmp_stack_t *is = icmp->icmp_is;
int error;
- switch (optset_context) {
- case SETFN_OPTCOM_CHECKONLY:
- checkonly = B_TRUE;
- /*
- * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
- * inlen != 0 implies value supplied and
- * we have to "pretend" to set it.
- * inlen == 0 implies that there is no
- * value part in T_CHECK request and just validation
- * done elsewhere should be enough, we just return here.
- */
- if (inlen == 0) {
- *outlenp = 0;
- return (0);
- }
- break;
- case SETFN_OPTCOM_NEGOTIATE:
- checkonly = B_FALSE;
- break;
- case SETFN_UD_NEGOTIATE:
- case SETFN_CONN_NEGOTIATE:
- checkonly = B_FALSE;
- /*
- * Negotiating local and "association-related" options
- * through T_UNITDATA_REQ.
- *
- * Following routine can filter out ones we do not
- * want to be "set" this way.
- */
- if (!icmp_opt_allow_udr_set(level, name)) {
- *outlenp = 0;
- return (EINVAL);
- }
- break;
- default:
- /*
- * We should never get here
- */
- *outlenp = 0;
- return (EINVAL);
- }
-
- ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
- (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
-
+ ASSERT(RW_WRITE_HELD(&icmp->icmp_rwlock));
/*
* For fixed length options, no sanity check
* of passed in length is done. It is assumed *_optcom_req()
* routines do the right thing.
*/
-
switch (level) {
case SOL_SOCKET:
switch (name) {
@@ -2161,12 +2220,14 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name,
/* Drop lock across the bind operation */
rw_exit(&icmp->icmp_rwlock);
- icmp_bind_proto(q);
+ (void) icmp_bind_proto(connp);
rw_enter(&icmp->icmp_rwlock, RW_WRITER);
return (0);
case SO_REUSEADDR:
- if (!checkonly)
+ if (!checkonly) {
icmp->icmp_reuseaddr = onoff;
+ PASS_OPT_TO_IP(connp);
+ }
break;
/*
@@ -2174,16 +2235,22 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name,
* but are only meaningful to IP.
*/
case SO_DONTROUTE:
- if (!checkonly)
+ if (!checkonly) {
icmp->icmp_dontroute = onoff;
+ PASS_OPT_TO_IP(connp);
+ }
break;
case SO_USELOOPBACK:
- if (!checkonly)
+ if (!checkonly) {
icmp->icmp_useloopback = onoff;
+ PASS_OPT_TO_IP(connp);
+ }
break;
case SO_BROADCAST:
- if (!checkonly)
+ if (!checkonly) {
icmp->icmp_broadcast = onoff;
+ PASS_OPT_TO_IP(connp);
+ }
break;
case SO_SNDBUF:
@@ -2192,7 +2259,10 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name,
return (ENOBUFS);
}
if (!checkonly) {
- q->q_hiwat = *i1;
+ if (!IPCL_IS_NONSTR(connp)) {
+ connp->conn_wq->q_hiwat = *i1;
+ }
+ icmp->icmp_xmit_hiwat = *i1;
}
break;
case SO_RCVBUF:
@@ -2201,9 +2271,10 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name,
return (ENOBUFS);
}
if (!checkonly) {
- RD(q)->q_hiwat = *i1;
+ icmp->icmp_recv_hiwat = *i1;
rw_exit(&icmp->icmp_rwlock);
- (void) mi_set_sth_hiwat(RD(q), *i1);
+ (void) proto_set_rx_hiwat(connp->conn_rq, connp,
+ *i1);
rw_enter(&icmp->icmp_rwlock, RW_WRITER);
}
break;
@@ -2273,8 +2344,9 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name,
icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH +
icmp->icmp_ip_snd_options_len;
rw_exit(&icmp->icmp_rwlock);
- (void) mi_set_sth_wroff(RD(q), icmp->icmp_max_hdr_len +
- is->is_wroff_extra);
+ (void) proto_set_tx_wroff(connp->conn_rq == NULL ? NULL:
+ RD(connp->conn_rq), connp,
+ icmp->icmp_max_hdr_len + is->is_wroff_extra);
rw_enter(&icmp->icmp_rwlock, RW_WRITER);
break;
case IP_HDRINCL:
@@ -2297,8 +2369,10 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name,
* TODO should check OPTMGMT reply and undo this if
* there is an error.
*/
- if (!checkonly)
+ if (!checkonly) {
icmp->icmp_multicast_if_addr = *i1;
+ PASS_OPT_TO_IP(connp);
+ }
break;
case IP_MULTICAST_TTL:
if (!checkonly)
@@ -2308,23 +2382,29 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name,
if (!checkonly) {
connp->conn_multicast_loop =
(*invalp == 0) ? 0 : 1;
+ PASS_OPT_TO_IP(connp);
}
break;
case IP_BOUND_IF:
- if (!checkonly)
+ if (!checkonly) {
icmp->icmp_bound_if = *i1;
+ PASS_OPT_TO_IP(connp);
+ }
break;
case IP_UNSPEC_SRC:
- if (!checkonly)
+ if (!checkonly) {
icmp->icmp_unspec_source = onoff;
+ PASS_OPT_TO_IP(connp);
+ }
break;
case IP_BROADCAST_TTL:
if (!checkonly)
connp->conn_broadcast_ttl = *invalp;
break;
case IP_RECVIF:
- if (!checkonly)
+ if (!checkonly) {
icmp->icmp_recvif = onoff;
+ }
/*
* pass to ip
*/
@@ -2354,8 +2434,9 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name,
}
- if (inlen != sizeof (struct in_pktinfo))
+ if (inlen != sizeof (struct in_pktinfo)) {
return (EINVAL);
+ }
if ((attr_pktinfop = (ip4_pkt_t *)thisdg_attrs)
== NULL) {
@@ -2436,8 +2517,10 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name,
switch (name) {
case IPV6_MULTICAST_IF:
- if (!checkonly)
+ if (!checkonly) {
icmp->icmp_multicast_if_index = *i1;
+ PASS_OPT_TO_IP(connp);
+ }
break;
case IPV6_UNICAST_HOPS:
/* -1 means use default */
@@ -2492,8 +2575,10 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name,
*outlenp = 0;
return (EINVAL);
}
- if (!checkonly)
+ if (!checkonly) {
connp->conn_multicast_loop = *i1;
+ PASS_OPT_TO_IP(connp);
+ }
break;
case IPV6_CHECKSUM:
/*
@@ -2544,51 +2629,71 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name,
*/
return (-EINVAL);
case IPV6_BOUND_IF:
- if (!checkonly)
+ if (!checkonly) {
icmp->icmp_bound_if = *i1;
+ PASS_OPT_TO_IP(connp);
+ }
break;
case IPV6_UNSPEC_SRC:
- if (!checkonly)
+ if (!checkonly) {
icmp->icmp_unspec_source = onoff;
+ PASS_OPT_TO_IP(connp);
+ }
break;
case IPV6_RECVTCLASS:
- if (!checkonly)
+ if (!checkonly) {
icmp->icmp_ipv6_recvtclass = onoff;
+ PASS_OPT_TO_IP(connp);
+ }
break;
/*
* Set boolean switches for ancillary data delivery
*/
case IPV6_RECVPKTINFO:
- if (!checkonly)
+ if (!checkonly) {
icmp->icmp_ip_recvpktinfo = onoff;
+ PASS_OPT_TO_IP(connp);
+ }
break;
case IPV6_RECVPATHMTU:
- if (!checkonly)
+ if (!checkonly) {
icmp->icmp_ipv6_recvpathmtu = onoff;
+ PASS_OPT_TO_IP(connp);
+ }
break;
case IPV6_RECVHOPLIMIT:
- if (!checkonly)
+ if (!checkonly) {
icmp->icmp_ipv6_recvhoplimit = onoff;
+ PASS_OPT_TO_IP(connp);
+ }
break;
case IPV6_RECVHOPOPTS:
- if (!checkonly)
+ if (!checkonly) {
icmp->icmp_ipv6_recvhopopts = onoff;
+ PASS_OPT_TO_IP(connp);
+ }
break;
case IPV6_RECVDSTOPTS:
- if (!checkonly)
+ if (!checkonly) {
icmp->icmp_ipv6_recvdstopts = onoff;
+ PASS_OPT_TO_IP(connp);
+ }
break;
case _OLD_IPV6_RECVDSTOPTS:
if (!checkonly)
icmp->icmp_old_ipv6_recvdstopts = onoff;
break;
case IPV6_RECVRTHDRDSTOPTS:
- if (!checkonly)
+ if (!checkonly) {
icmp->icmp_ipv6_recvrtdstopts = onoff;
+ PASS_OPT_TO_IP(connp);
+ }
break;
case IPV6_RECVRTHDR:
- if (!checkonly)
+ if (!checkonly) {
icmp->icmp_ipv6_recvrthdr = onoff;
+ PASS_OPT_TO_IP(connp);
+ }
break;
/*
* Set sticky options or ancillary data.
@@ -2601,8 +2706,10 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name,
* in ip_opt_set(). For ancillary data the
* source address is checked in ip_wput_v6.
*/
- if (inlen != 0 && inlen != sizeof (struct in6_pktinfo))
+ if (inlen != 0 && inlen !=
+ sizeof (struct in6_pktinfo)) {
return (EINVAL);
+ }
if (checkonly)
break;
@@ -2630,6 +2737,7 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name,
error = icmp_build_hdrs(icmp);
if (error != 0)
return (error);
+ PASS_OPT_TO_IP(connp);
}
break;
case IPV6_HOPLIMIT:
@@ -2660,8 +2768,9 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name,
* IPV6_RECVTCLASS accepts -1 as use kernel default
* and [0, 255] as the actualy traffic class.
*/
- if (inlen != 0 && inlen != sizeof (int))
+ if (inlen != 0 && inlen != sizeof (int)) {
return (EINVAL);
+ }
if (checkonly)
break;
@@ -2691,8 +2800,9 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name,
* IP will verify that the nexthop is reachable
* and fail for sticky options.
*/
- if (inlen != 0 && inlen != sizeof (sin6_t))
+ if (inlen != 0 && inlen != sizeof (sin6_t)) {
return (EINVAL);
+ }
if (checkonly)
break;
@@ -2702,10 +2812,12 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name,
} else {
sin6_t *sin6 = (sin6_t *)invalp;
- if (sin6->sin6_family != AF_INET6)
+ if (sin6->sin6_family != AF_INET6) {
return (EAFNOSUPPORT);
- if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr))
+ }
+ if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
return (EADDRNOTAVAIL);
+ }
ipp->ipp_nexthop = sin6->sin6_addr;
if (!IN6_IS_ADDR_UNSPECIFIED(
&ipp->ipp_nexthop))
@@ -2717,6 +2829,7 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name,
error = icmp_build_hdrs(icmp);
if (error != 0)
return (error);
+ PASS_OPT_TO_IP(connp);
}
break;
case IPV6_HOPOPTS: {
@@ -2726,8 +2839,9 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name,
* eight bytes, and matching size passed in.
*/
if (inlen != 0 &&
- inlen != (8 * (hopts->ip6h_len + 1)))
+ inlen != (8 * (hopts->ip6h_len + 1))) {
return (EINVAL);
+ }
if (checkonly)
break;
@@ -2974,23 +3088,89 @@ icmp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name,
*outlenp = inlen;
return (0);
}
+
/* This routine sets socket options. */
/* ARGSUSED */
int
-icmp_opt_set(queue_t *q, uint_t optset_context, int level, int name,
+icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
+    uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
+    void *thisdg_attrs, cred_t *cr)
+{
+	boolean_t checkonly;
+
+	/*
+	 * Translate the option-management context into a checkonly flag,
+	 * rejecting contexts and options that may not be set this way.
+	 * Every early exit below zeroes *outlenp.
+	 */
+	switch (optset_context) {
+	case SETFN_OPTCOM_CHECKONLY:
+		/*
+		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ.
+		 * With a value supplied (inlen != 0) we have to "pretend"
+		 * to set it.  Without one (inlen == 0) there is no value
+		 * part in the T_CHECK request; the validation done
+		 * elsewhere is enough and we simply return.
+		 */
+		if (inlen == 0) {
+			*outlenp = 0;
+			return (0);
+		}
+		checkonly = B_TRUE;
+		break;
+	case SETFN_OPTCOM_NEGOTIATE:
+		checkonly = B_FALSE;
+		break;
+	case SETFN_UD_NEGOTIATE:
+	case SETFN_CONN_NEGOTIATE:
+		/*
+		 * Negotiating local and "association-related" options
+		 * through T_UNITDATA_REQ.  icmp_opt_allow_udr_set()
+		 * filters out the ones we do not want "set" this way.
+		 */
+		if (!icmp_opt_allow_udr_set(level, name)) {
+			*outlenp = 0;
+			return (EINVAL);
+		}
+		checkonly = B_FALSE;
+		break;
+	default:
+		/* We should never get here */
+		*outlenp = 0;
+		return (EINVAL);
+	}
+
+	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
+	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
+
+	/* The actual work is done by icmp_do_opt_set(). */
+	return (icmp_do_opt_set(connp, level, name, inlen, invalp, outlenp,
+	    outvalp, cr, thisdg_attrs, checkonly));
+}
+
+/*
+ * Queue-based wrapper for setting socket options: looks up the conn_t for
+ * `q', takes icmp_rwlock as writer, and hands the request to icmp_opt_set().
+ * The `mblk' argument is accepted but is not passed on.
+ * Returns whatever icmp_opt_set() returns.
+ */
+/* ARGSUSED */
+int
+icmp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
void *thisdg_attrs, cred_t *cr, mblk_t *mblk)
{
+ conn_t *connp = Q_TO_CONN(q);
icmp_t *icmp;
- int err;
-
- icmp = Q_TO_ICMP(q);
+ int error;
+ icmp = connp->conn_icmp;
rw_enter(&icmp->icmp_rwlock, RW_WRITER);
- err = icmp_opt_set_locked(q, optset_context, level, name, inlen, invalp,
- outlenp, outvalp, thisdg_attrs, cr, mblk);
+ error = icmp_opt_set(connp, optset_context, level, name, inlen, invalp,
+ outlenp, outvalp, thisdg_attrs, cr);
rw_exit(&icmp->icmp_rwlock);
- return (err);
+ return (error);
}
/*
@@ -3055,7 +3235,8 @@ icmp_build_hdrs(icmp_t *icmp)
if (hdrs_len > icmp->icmp_max_hdr_len) {
icmp->icmp_max_hdr_len = hdrs_len;
rw_exit(&icmp->icmp_rwlock);
- (void) mi_set_sth_wroff(icmp->icmp_connp->conn_rq,
+ (void) proto_set_tx_wroff(icmp->icmp_connp->conn_rq,
+ icmp->icmp_connp,
icmp->icmp_max_hdr_len + is->is_wroff_extra);
rw_enter(&icmp->icmp_rwlock, RW_WRITER);
}
@@ -3123,6 +3304,33 @@ icmp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr)
icmppa->icmp_param_value = new_value;
return (0);
}
+/*
+ * Dispose of `mp' on behalf of a caller that holds icmp_recv_lock.
+ *
+ * While the conn is still non-STREAMS (IPCL_IS_NONSTR), mp is appended to
+ * the icmp_fallback_queue_head/tail list, chained through b_next.
+ * Otherwise mp is sent up conn_rq with putnext().
+ *
+ * The lock must be held on entry and is always released before returning.
+ */
+static void
+icmp_queue_fallback(icmp_t *icmp, mblk_t *mp)
+{
+	ASSERT(MUTEX_HELD(&icmp->icmp_recv_lock));
+
+	if (!IPCL_IS_NONSTR(icmp->icmp_connp)) {
+		/*
+		 * No more fallbacks are possible, so it is OK to drop the
+		 * lock before passing the message up the stream.
+		 */
+		mutex_exit(&icmp->icmp_recv_lock);
+		putnext(icmp->icmp_connp->conn_rq, mp);
+		return;
+	}
+
+	/*
+	 * Fallback has started but the messages have not been moved yet:
+	 * add mp to the tail of the fallback queue.
+	 */
+	if (icmp->icmp_fallback_queue_head != NULL) {
+		ASSERT(icmp->icmp_fallback_queue_tail != NULL);
+		icmp->icmp_fallback_queue_tail->b_next = mp;
+	} else {
+		ASSERT(icmp->icmp_fallback_queue_tail == NULL);
+		icmp->icmp_fallback_queue_head = mp;
+	}
+	icmp->icmp_fallback_queue_tail = mp;
+	mutex_exit(&icmp->icmp_recv_lock);
+}
+
/*ARGSUSED2*/
static void
icmp_input(void *arg1, mblk_t *mp, void *arg2)
@@ -3148,6 +3356,7 @@ icmp_input(void *arg1, mblk_t *mp, void *arg2)
uint_t icmp_opt = 0;
boolean_t icmp_ipv6_recvhoplimit = B_FALSE;
uint_t hopstrip;
+ int error;
ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
@@ -3189,7 +3398,7 @@ icmp_input(void *arg1, mblk_t *mp, void *arg2)
/*
* ICMP messages.
*/
- icmp_icmp_error(connp->conn_rq, mp);
+ icmp_icmp_error(connp, mp);
return;
}
}
@@ -3388,8 +3597,7 @@ icmp_input(void *arg1, mblk_t *mp, void *arg2)
freeb(options_mp);
BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams);
- putnext(connp->conn_rq, mp);
- return;
+ goto deliver;
}
/*
@@ -3707,7 +3915,7 @@ icmp_input(void *arg1, mblk_t *mp, void *arg2)
udi_size -= toh->len;
}
if (icmp->icmp_timestamp) {
- struct T_opthdr *toh;
+ struct T_opthdr *toh;
toh = (struct T_opthdr *)dstopt;
toh->level = SOL_SOCKET;
@@ -3723,6 +3931,7 @@ icmp_input(void *arg1, mblk_t *mp, void *arg2)
dstopt = (uchar_t *)toh + toh->len;
udi_size -= toh->len;
}
+
if (icmp_opt & IPPF_HOPOPTS) {
struct T_opthdr *toh;
@@ -3792,235 +4001,37 @@ icmp_input(void *arg1, mblk_t *mp, void *arg2)
ASSERT(udi_size == 0);
}
BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams);
- putnext(connp->conn_rq, mp);
-}
-
-/*
- * Handle the results of a T_BIND_REQ whether deferred by IP or handled
- * immediately.
- */
-static void
-icmp_bind_result(conn_t *connp, mblk_t *mp)
-{
- struct T_error_ack *tea;
-
- switch (mp->b_datap->db_type) {
- case M_PROTO:
- case M_PCPROTO:
- /* M_PROTO messages contain some type of TPI message. */
- if ((mp->b_wptr - mp->b_rptr) < sizeof (t_scalar_t)) {
- freemsg(mp);
- return;
- }
- tea = (struct T_error_ack *)mp->b_rptr;
-
- switch (tea->PRIM_type) {
- case T_ERROR_ACK:
- switch (tea->ERROR_prim) {
- case O_T_BIND_REQ:
- case T_BIND_REQ:
- icmp_bind_error(connp, mp);
- return;
- default:
- break;
- }
- ASSERT(0);
- freemsg(mp);
- return;
-
- case T_BIND_ACK:
- icmp_bind_ack(connp, mp);
- return;
-
- default:
- break;
- }
- freemsg(mp);
- return;
- default:
- /* FIXME: other cases? */
- ASSERT(0);
- freemsg(mp);
- return;
- }
-}
-
-/*
- * Process a T_BIND_ACK
- */
-static void
-icmp_bind_ack(conn_t *connp, mblk_t *mp)
-{
- icmp_t *icmp = connp->conn_icmp;
- mblk_t *mp1;
- ire_t *ire;
- struct T_bind_ack *tba;
- uchar_t *addrp;
- ipa_conn_t *ac;
- ipa6_conn_t *ac6;
-
- rw_enter(&icmp->icmp_rwlock, RW_WRITER);
- /*
- * We know if headers are included or not so we can
- * safely do this.
- */
- if (icmp->icmp_state == TS_UNBND) {
- /*
- * TPI has not yet bound - bind sent by
- * icmp_bind_proto.
- */
- freemsg(mp);
- rw_exit(&icmp->icmp_rwlock);
- return;
- }
- ASSERT(icmp->icmp_pending_op != -1);
-
- /*
- * If a broadcast/multicast address was bound set
- * the source address to 0.
- * This ensures no datagrams with broadcast address
- * as source address are emitted (which would violate
- * RFC1122 - Hosts requirements)
- *
- * Note that when connecting the returned IRE is
- * for the destination address and we only perform
- * the broadcast check for the source address (it
- * is OK to connect to a broadcast/multicast address.)
- */
- mp1 = mp->b_cont;
- if (mp1 != NULL && mp1->b_datap->db_type == IRE_DB_TYPE) {
- ire = (ire_t *)mp1->b_rptr;
- /*
- * Note: we get IRE_BROADCAST for IPv6 to "mark" a multicast
- * local address.
- */
- if (ire->ire_type == IRE_BROADCAST &&
- icmp->icmp_state != TS_DATA_XFER) {
- ASSERT(icmp->icmp_pending_op == T_BIND_REQ ||
- icmp->icmp_pending_op == O_T_BIND_REQ);
- /* This was just a local bind to a MC/broadcast addr */
- V6_SET_ZERO(icmp->icmp_v6src);
- if (icmp->icmp_family == AF_INET6)
- (void) icmp_build_hdrs(icmp);
- } else if (V6_OR_V4_INADDR_ANY(icmp->icmp_v6src)) {
- /*
- * Local address not yet set - pick it from the
- * T_bind_ack
- */
- tba = (struct T_bind_ack *)mp->b_rptr;
- addrp = &mp->b_rptr[tba->ADDR_offset];
- switch (icmp->icmp_family) {
- case AF_INET:
- if (tba->ADDR_length == sizeof (ipa_conn_t)) {
- ac = (ipa_conn_t *)addrp;
- } else {
- ASSERT(tba->ADDR_length ==
- sizeof (ipa_conn_x_t));
- ac = &((ipa_conn_x_t *)addrp)->acx_conn;
- }
- IN6_IPADDR_TO_V4MAPPED(ac->ac_laddr,
- &icmp->icmp_v6src);
- break;
- case AF_INET6:
- if (tba->ADDR_length == sizeof (ipa6_conn_t)) {
- ac6 = (ipa6_conn_t *)addrp;
- } else {
- ASSERT(tba->ADDR_length ==
- sizeof (ipa6_conn_x_t));
- ac6 = &((ipa6_conn_x_t *)
- addrp)->ac6x_conn;
+deliver:
+ if (IPCL_IS_NONSTR(connp)) {
+ if ((*connp->conn_upcalls->su_recv)
+ (connp->conn_upper_handle, mp, msgdsize(mp), 0, &error,
+ NULL) < 0) {
+ mutex_enter(&icmp->icmp_recv_lock);
+ if (error == ENOSPC) {
+ /*
+ * let's confirm while holding the lock
+ */
+ if ((*connp->conn_upcalls->su_recv)
+ (connp->conn_upper_handle, NULL, 0, 0,
+ &error, NULL) < 0) {
+ if (error == ENOSPC) {
+ connp->conn_flow_cntrld =
+ B_TRUE;
+ } else {
+ ASSERT(error == EOPNOTSUPP);
+ }
}
- icmp->icmp_v6src = ac6->ac6_laddr;
- (void) icmp_build_hdrs(icmp);
+ mutex_exit(&icmp->icmp_recv_lock);
+ } else {
+ ASSERT(error == EOPNOTSUPP);
+ icmp_queue_fallback(icmp, mp);
}
}
- mp1 = mp1->b_cont;
- }
- icmp->icmp_pending_op = -1;
- rw_exit(&icmp->icmp_rwlock);
- /*
- * Look for one or more appended ACK message added by
- * icmp_connect or icmp_disconnect.
- * If none found just send up the T_BIND_ACK.
- * icmp_connect has appended a T_OK_ACK and a
- * T_CONN_CON.
- * icmp_disconnect has appended a T_OK_ACK.
- */
- if (mp1 != NULL) {
- if (mp->b_cont == mp1)
- mp->b_cont = NULL;
- else {
- ASSERT(mp->b_cont->b_cont == mp1);
- mp->b_cont->b_cont = NULL;
- }
- freemsg(mp);
- mp = mp1;
- while (mp != NULL) {
- mp1 = mp->b_cont;
- mp->b_cont = NULL;
- putnext(connp->conn_rq, mp);
- mp = mp1;
- }
- return;
- }
- freemsg(mp->b_cont);
- mp->b_cont = NULL;
- putnext(connp->conn_rq, mp);
-}
-
-static void
-icmp_bind_error(conn_t *connp, mblk_t *mp)
-{
- icmp_t *icmp = connp->conn_icmp;
- struct T_error_ack *tea;
-
- tea = (struct T_error_ack *)mp->b_rptr;
- /*
- * If our O_T_BIND_REQ/T_BIND_REQ fails,
- * clear out the source address before
- * passing the message upstream.
- * If this was caused by a T_CONN_REQ
- * revert back to bound state.
- */
- rw_enter(&icmp->icmp_rwlock, RW_WRITER);
- if (icmp->icmp_state == TS_UNBND) {
- /*
- * TPI has not yet bound - bind sent by icmp_bind_proto.
- */
- freemsg(mp);
- rw_exit(&icmp->icmp_rwlock);
- return;
- }
- ASSERT(icmp->icmp_pending_op != -1);
- tea->ERROR_prim = icmp->icmp_pending_op;
- icmp->icmp_pending_op = -1;
-
- switch (tea->ERROR_prim) {
- case T_CONN_REQ:
- ASSERT(icmp->icmp_state == TS_DATA_XFER);
- /* Connect failed */
- /* Revert back to the bound source */
- icmp->icmp_v6src = icmp->icmp_bound_v6src;
- icmp->icmp_state = TS_IDLE;
- if (icmp->icmp_family == AF_INET6)
- (void) icmp_build_hdrs(icmp);
- break;
-
- case T_DISCON_REQ:
- case T_BIND_REQ:
- case O_T_BIND_REQ:
- V6_SET_ZERO(icmp->icmp_v6src);
- V6_SET_ZERO(icmp->icmp_bound_v6src);
- icmp->icmp_state = TS_UNBND;
- if (icmp->icmp_family == AF_INET6)
- (void) icmp_build_hdrs(icmp);
- break;
- default:
- break;
+ } else {
+ putnext(connp->conn_rq, mp);
}
- rw_exit(&icmp->icmp_rwlock);
- putnext(connp->conn_rq, mp);
+ ASSERT(MUTEX_NOT_HELD(&icmp->icmp_recv_lock));
}
/*
@@ -4121,7 +4132,8 @@ icmp_status_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
(void) mi_mpprintf(mp, MI_COL_PTRFMT_STR "%s %s %s",
(void *)icmp,
- inet_ntop(AF_INET6, &icmp->icmp_v6dst, faddrbuf,
+ inet_ntop(AF_INET6, &icmp->icmp_v6dst.sin6_addr,
+ faddrbuf,
sizeof (faddrbuf)),
inet_ntop(AF_INET6, &icmp->icmp_v6src, laddrbuf,
sizeof (laddrbuf)),
@@ -4152,32 +4164,26 @@ icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err)
freemsg(mp);
}
-/*
- * This routine is called by icmp_wput to handle T_UNBIND_REQ messages.
- * After some error checking, the message is passed downstream to ip.
- */
-static void
-icmp_unbind(queue_t *q, mblk_t *mp)
+
+static int
+rawip_do_unbind(conn_t *connp)
{
- icmp_t *icmp = Q_TO_ICMP(q);
+ icmp_t *icmp = connp->conn_icmp;
rw_enter(&icmp->icmp_rwlock, RW_WRITER);
/* If a bind has not been done, we can't unbind. */
if (icmp->icmp_state == TS_UNBND || icmp->icmp_pending_op != -1) {
rw_exit(&icmp->icmp_rwlock);
- icmp_err_ack(q, mp, TOUTSTATE, 0);
- return;
+ return (-TOUTSTATE);
}
icmp->icmp_pending_op = T_UNBIND_REQ;
rw_exit(&icmp->icmp_rwlock);
/*
- * Pass the unbind to IP; T_UNBIND_REQ is larger than T_OK_ACK
- * and therefore ip_unbind must never return NULL.
+ * Call ip to unbind
*/
- mp = ip_unbind(q, mp);
- ASSERT(mp != NULL);
- ASSERT(((struct T_ok_ack *)mp->b_rptr)->PRIM_type == T_OK_ACK);
+
+ ip_unbind(connp);
/*
* Once we're unbound from IP, the pending operation may be cleared
@@ -4191,17 +4197,54 @@ icmp_unbind(queue_t *q, mblk_t *mp)
if (icmp->icmp_family == AF_INET6)
(void) icmp_build_hdrs(icmp);
rw_exit(&icmp->icmp_rwlock);
+ return (0);
+}
+
+/*
+ * This routine is called by icmp_wput to handle T_UNBIND_REQ messages.
+ * The unbind itself is performed by rawip_do_unbind(); the request mblk is
+ * not forwarded to ip.  The request is answered on `q' with either an
+ * error ack or a T_OK_ACK built from the request mblk.
+ */
+static void
+icmp_tpi_unbind(queue_t *q, mblk_t *mp)
+{
+	conn_t	*connp = Q_TO_CONN(q);
+	int	error;
+
+	ASSERT(mp->b_cont == NULL);
+
+	error = rawip_do_unbind(connp);
+	if (error < 0) {
+		/* Negative values are negated TPI errors (e.g. -TOUTSTATE). */
+		icmp_err_ack(q, mp, -error, 0);
+		return;
+	}
+	if (error > 0) {
+		/* Positive values are passed as icmp_err_ack()'s last argument. */
+		icmp_err_ack(q, mp, 0, error);
+		return;
+	}
+
+	/*
+	 * Success: convert mp into a T_OK_ACK.  A NULL return should not
+	 * happen in practice, since T_OK_ACK is smaller than the original
+	 * message.
+	 */
+	mp = mi_tpi_ok_ack_alloc(mp);
+	ASSERT(mp != NULL);
+	ASSERT(((struct T_ok_ack *)mp->b_rptr)->PRIM_type == T_OK_ACK);
qreply(q, mp);
}
+
/*
* Process IPv4 packets that already include an IP header.
* Used when IP_HDRINCL has been set (implicit for IPPROTO_RAW and
* IPPROTO_IGMP).
*/
-static void
-icmp_wput_hdrincl(queue_t *q, mblk_t *mp, icmp_t *icmp, ip4_pkt_t *pktinfop)
+static int
+icmp_wput_hdrincl(queue_t *q, conn_t *connp, mblk_t *mp, icmp_t *icmp,
+ ip4_pkt_t *pktinfop)
{
icmp_stack_t *is = icmp->icmp_is;
ipha_t *ipha;
@@ -4210,7 +4253,6 @@ icmp_wput_hdrincl(queue_t *q, mblk_t *mp, icmp_t *icmp, ip4_pkt_t *pktinfop)
mblk_t *mp1;
uint_t pkt_len;
ip_opt_info_t optinfo;
- conn_t *connp = icmp->icmp_connp;
optinfo.ip_opt_flags = 0;
optinfo.ip_opt_ill_index = 0;
@@ -4221,7 +4263,7 @@ icmp_wput_hdrincl(queue_t *q, mblk_t *mp, icmp_t *icmp, ip4_pkt_t *pktinfop)
ASSERT(icmp != NULL);
BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
freemsg(mp);
- return;
+ return (0);
}
ipha = (ipha_t *)mp->b_rptr;
}
@@ -4266,7 +4308,7 @@ icmp_wput_hdrincl(queue_t *q, mblk_t *mp, icmp_t *icmp, ip4_pkt_t *pktinfop)
BUMP_MIB(&is->is_rawip_mib,
rawipOutErrors);
freemsg(mp);
- return;
+ return (0);
}
ipha = (ipha_t *)mp->b_rptr;
}
@@ -4278,13 +4320,11 @@ icmp_wput_hdrincl(queue_t *q, mblk_t *mp, icmp_t *icmp, ip4_pkt_t *pktinfop)
pkt_len = ntohs(ipha->ipha_length)
+ icmp->icmp_ip_snd_options_len;
if (pkt_len > IP_MAXPACKET) {
- icmp_ud_err(q, mp, EMSGSIZE);
- return;
+ return (EMSGSIZE);
}
if (!(mp1 = allocb(ip_hdr_length + is->is_wroff_extra +
tp_hdr_len, BPRI_LO))) {
- icmp_ud_err(q, mp, ENOMEM);
- return;
+ return (ENOMEM);
}
mp1->b_rptr += is->is_wroff_extra;
mp1->b_wptr = mp1->b_rptr + ip_hdr_length;
@@ -4329,10 +4369,11 @@ icmp_wput_hdrincl(queue_t *q, mblk_t *mp, icmp_t *icmp, ip4_pkt_t *pktinfop)
mblk_setcred(mp, connp->conn_cred);
ip_output_options(connp, mp, q, IP_WPUT, &optinfo);
+ return (0);
}
-static boolean_t
-icmp_update_label(queue_t *q, icmp_t *icmp, mblk_t *mp, ipaddr_t dst)
+static int
+icmp_update_label(icmp_t *icmp, mblk_t *mp, ipaddr_t dst)
{
int err;
uchar_t opt_storage[IP_MAX_OPT_LENGTH];
@@ -4351,13 +4392,12 @@ icmp_update_label(queue_t *q, icmp_t *icmp, mblk_t *mp, ipaddr_t dst)
BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
DTRACE_PROBE4(
tx__ip__log__drop__updatelabel__icmp,
- char *, "queue(1) failed to update options(2) on mp(3)",
- queue_t *, q, char *, opt_storage, mblk_t *, mp);
- icmp_ud_err(q, mp, err);
- return (B_FALSE);
+ char *, "icmp(1) failed to update options(2) on mp(3)",
+ icmp_t *, icmp, char *, opt_storage, mblk_t *, mp);
+ return (err);
}
IN6_IPADDR_TO_V4MAPPED(dst, &icmp->icmp_v6lastdst);
- return (B_TRUE);
+ return (0);
}
/*
@@ -4371,7 +4411,6 @@ icmp_wput(queue_t *q, mblk_t *mp)
uchar_t *rptr = mp->b_rptr;
ipha_t *ipha;
mblk_t *mp1;
- int ip_hdr_length;
#define tudr ((struct T_unitdata_req *)rptr)
size_t ip_len;
conn_t *connp = Q_TO_CONN(q);
@@ -4382,7 +4421,12 @@ icmp_wput(queue_t *q, mblk_t *mp)
ipaddr_t v4dst;
ip4_pkt_t pktinfo;
ip4_pkt_t *pktinfop = &pktinfo;
- ip_opt_info_t optinfo;
+ ip6_pkt_t ipp_s; /* For ancillary data options */
+ ip6_pkt_t *ipp = &ipp_s;
+ int error;
+
+ ipp->ipp_fields = 0;
+ ipp->ipp_sticky_ignored = 0;
switch (mp->b_datap->db_type) {
case M_DATA:
@@ -4406,11 +4450,17 @@ icmp_wput(queue_t *q, mblk_t *mp)
if (is_system_labeled() &&
(!IN6_IS_ADDR_V4MAPPED(&icmp->icmp_v6lastdst) ||
V4_PART_OF_V6(icmp->icmp_v6lastdst) !=
- ipha->ipha_dst) &&
- !icmp_update_label(q, icmp, mp, ipha->ipha_dst)) {
- return;
+ ipha->ipha_dst)) {
+ error = icmp_update_label(icmp, mp,
+ ipha->ipha_dst);
+ if (error != 0) {
+ icmp_ud_err(q, mp, error);
+ return;
+ }
}
- icmp_wput_hdrincl(q, mp, icmp, NULL);
+ error = icmp_wput_hdrincl(q, connp, mp, icmp, NULL);
+ if (error != 0)
+ icmp_ud_err(q, mp, error);
return;
}
freemsg(mp);
@@ -4432,14 +4482,6 @@ icmp_wput(queue_t *q, mblk_t *mp)
/* Handle T_UNITDATA_REQ messages here. */
-
-
- if (icmp->icmp_state == TS_UNBND) {
- /* If a port has not been bound to the stream, fail. */
- BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
- icmp_ud_err(q, mp, EPROTO);
- return;
- }
mp1 = mp->b_cont;
if (mp1 == NULL) {
BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
@@ -4475,8 +4517,22 @@ icmp_wput(queue_t *q, mblk_t *mp)
* Destination is a native IPv6 address.
* Send out an IPv6 format packet.
*/
- icmp_wput_ipv6(q, mp, sin6, tudr->OPT_length);
- return;
+ if (tudr->OPT_length != 0) {
+ int error;
+
+ error = 0;
+ if (icmp_unitdata_opt_process(q, mp, &error,
+ (void *)ipp) < 0) {
+ /* failure */
+ BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
+ icmp_ud_err(q, mp, error);
+ return;
+ }
+ ASSERT(error == 0);
+ }
+
+ error = raw_ip_send_data_v6(q, connp, mp1, sin6, ipp);
+ goto done;
case AF_INET:
sin = (sin_t *)&rptr[tudr->DEST_offset];
@@ -4497,9 +4553,6 @@ icmp_wput(queue_t *q, mblk_t *mp)
pktinfop->ip4_ill_index = 0;
pktinfop->ip4_addr = INADDR_ANY;
- optinfo.ip_opt_flags = 0;
- optinfo.ip_opt_ill_index = 0;
-
/*
* If options passed in, feed it for verification and handling
@@ -4522,7 +4575,48 @@ icmp_wput(queue_t *q, mblk_t *mp)
* OPT_length/offset now potentially modified
* and contain option setting results
*/
+ }
+ error = raw_ip_send_data_v4(q, connp, mp1, v4dst, pktinfop);
+done:
+ if (error != 0) {
+ icmp_ud_err(q, mp, error);
+ return;
+ } else {
+ mp->b_cont = NULL;
+ freeb(mp);
+ }
+}
+
+
+/*
+ * Discard a message: the mblk chain is freed without being processed, and
+ * DEBUG kernels log a notice first.  The queue argument is unused.
+ * NOTE(review): presumably installed as the write-side put routine while the
+ * socket is falling back to STREAMS -- confirm against the qinit that
+ * references this function.
+ */
+/* ARGSUSED */
+static void
+icmp_wput_fallback(queue_t *q, mblk_t *mp)
+{
+#ifdef DEBUG
+ cmn_err(CE_CONT, "icmp_wput_fallback: Message during fallback \n");
+#endif
+ freemsg(mp);
+}
+
+static int
+raw_ip_send_data_v4(queue_t *q, conn_t *connp, mblk_t *mp, ipaddr_t v4dst,
+ ip4_pkt_t *pktinfop)
+{
+ ipha_t *ipha;
+ size_t ip_len;
+ icmp_t *icmp = connp->conn_icmp;
+ icmp_stack_t *is = icmp->icmp_is;
+ int ip_hdr_length;
+ ip_opt_info_t optinfo;
+
+ optinfo.ip_opt_flags = 0;
+ optinfo.ip_opt_ill_index = 0;
+
+ if (icmp->icmp_state == TS_UNBND) {
+ /* If a port has not been bound to the stream, fail. */
+ BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
+ return (EPROTO);
}
if (v4dst == INADDR_ANY)
@@ -4531,35 +4625,34 @@ icmp_wput(queue_t *q, mblk_t *mp)
/* Check if our saved options are valid; update if not */
if (is_system_labeled() &&
(!IN6_IS_ADDR_V4MAPPED(&icmp->icmp_v6lastdst) ||
- V4_PART_OF_V6(icmp->icmp_v6lastdst) != v4dst) &&
- !icmp_update_label(q, icmp, mp, v4dst)) {
- return;
- }
+ V4_PART_OF_V6(icmp->icmp_v6lastdst) != v4dst)) {
+ int error = icmp_update_label(icmp, mp, v4dst);
- /* Protocol 255 contains full IP headers */
- if (icmp->icmp_hdrincl) {
- freeb(mp);
- icmp_wput_hdrincl(q, mp1, icmp, pktinfop);
- return;
+ if (error != 0)
+ return (error);
}
+ /* Protocol 255 contains full IP headers */
+ if (icmp->icmp_hdrincl)
+ return (icmp_wput_hdrincl(q, connp, mp, icmp, pktinfop));
/* Add an IP header */
ip_hdr_length = IP_SIMPLE_HDR_LENGTH + icmp->icmp_ip_snd_options_len;
- ipha = (ipha_t *)&mp1->b_rptr[-ip_hdr_length];
- if ((uchar_t *)ipha < mp1->b_datap->db_base ||
- mp1->b_datap->db_ref != 1 ||
+ ipha = (ipha_t *)&mp->b_rptr[-ip_hdr_length];
+ if ((uchar_t *)ipha < mp->b_datap->db_base ||
+ mp->b_datap->db_ref != 1 ||
!OK_32PTR(ipha)) {
+ mblk_t *mp1;
if (!(mp1 = allocb(ip_hdr_length + is->is_wroff_extra,
BPRI_LO))) {
BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
- icmp_ud_err(q, mp, ENOMEM);
- return;
+ return (ENOMEM);
}
- mp1->b_cont = mp->b_cont;
+ mp1->b_cont = mp;
ipha = (ipha_t *)mp1->b_datap->db_lim;
mp1->b_wptr = (uchar_t *)ipha;
ipha = (ipha_t *)((uchar_t *)ipha - ip_hdr_length);
+ mp = mp1;
}
#ifdef _BIG_ENDIAN
/* Set version, header length, and tos */
@@ -4604,11 +4697,11 @@ icmp_wput(queue_t *q, mblk_t *mp)
ipha->ipha_ident = IP_HDR_INCLUDED;
/* Finish common formatting of the packet. */
- mp1->b_rptr = (uchar_t *)ipha;
+ mp->b_rptr = (uchar_t *)ipha;
- ip_len = mp1->b_wptr - (uchar_t *)ipha;
- if (mp1->b_cont != NULL)
- ip_len += msgdsize(mp1->b_cont);
+ ip_len = mp->b_wptr - (uchar_t *)ipha;
+ if (mp->b_cont != NULL)
+ ip_len += msgdsize(mp->b_cont);
/*
* Set the length into the IP header.
@@ -4618,13 +4711,11 @@ icmp_wput(queue_t *q, mblk_t *mp)
*/
if (ip_len > IP_MAXPACKET) {
BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
- icmp_ud_err(q, mp, EMSGSIZE);
- return;
+ return (EMSGSIZE);
}
ipha->ipha_length = htons((uint16_t)ip_len);
/*
- * Copy in the destination address from the T_UNITDATA
- * request
+ * Copy in the destination address supplied by the caller
*/
ipha->ipha_dst = v4dst;
@@ -4645,16 +4736,14 @@ icmp_wput(queue_t *q, mblk_t *mp)
(void) ip_massage_options(ipha, is->is_netstack);
}
- freeb(mp);
BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
- mblk_setcred(mp1, connp->conn_cred);
- ip_output_options(Q_TO_CONN(q), mp1, q, IP_WPUT, &optinfo);
-#undef ipha
-#undef tudr
+ mblk_setcred(mp, connp->conn_cred);
+ ip_output_options(connp, mp, q, IP_WPUT, &optinfo);
+ return (0);
}
-static boolean_t
-icmp_update_label_v6(queue_t *wq, icmp_t *icmp, mblk_t *mp, in6_addr_t *dst)
+static int
+icmp_update_label_v6(icmp_t *icmp, mblk_t *mp, in6_addr_t *dst)
{
int err;
uchar_t opt_storage[TSOL_MAX_IPV6_OPTION];
@@ -4672,33 +4761,30 @@ icmp_update_label_v6(queue_t *wq, icmp_t *icmp, mblk_t *mp, in6_addr_t *dst)
BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
DTRACE_PROBE4(
tx__ip__log__drop__updatelabel__icmp6,
- char *, "queue(1) failed to update options(2) on mp(3)",
- queue_t *, wq, char *, opt_storage, mblk_t *, mp);
- icmp_ud_err(wq, mp, err);
- return (B_FALSE);
+ char *, "icmp(1) failed to update options(2) on mp(3)",
+ icmp_t *, icmp, char *, opt_storage, mblk_t *, mp);
+ return (err);
}
icmp->icmp_v6lastdst = *dst;
- return (B_TRUE);
+ return (0);
}
/*
- * icmp_wput_ipv6():
+ * raw_ip_send_data_v6():
* Assumes that icmp_wput did some sanity checking on the destination
* address, but that the label may not yet be correct.
*/
-void
-icmp_wput_ipv6(queue_t *q, mblk_t *mp, sin6_t *sin6, t_scalar_t tudr_optlen)
+static int
+raw_ip_send_data_v6(queue_t *q, conn_t *connp, mblk_t *mp, sin6_t *sin6,
+ ip6_pkt_t *ipp)
{
ip6_t *ip6h;
- ip6i_t *ip6i; /* mp1->b_rptr even if no ip6i_t */
- mblk_t *mp1;
+ ip6i_t *ip6i; /* mp->b_rptr even if no ip6i_t */
int ip_hdr_len = IPV6_HDR_LEN;
size_t ip_len;
- icmp_t *icmp = Q_TO_ICMP(q);
+ icmp_t *icmp = connp->conn_icmp;
icmp_stack_t *is = icmp->icmp_is;
- ip6_pkt_t ipp_s; /* For ancillary data options */
- ip6_pkt_t *ipp = &ipp_s;
ip6_pkt_t *tipp;
uint32_t csum = 0;
uint_t ignore = 0;
@@ -4716,30 +4802,10 @@ icmp_wput_ipv6(queue_t *q, mblk_t *mp, sin6_t *sin6, t_scalar_t tudr_optlen)
*/
if (IN6_IS_ADDR_V4MAPPED(&icmp->icmp_v6src)) {
BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
- icmp_ud_err(q, mp, EADDRNOTAVAIL);
- return;
- }
-
- ipp->ipp_fields = 0;
- ipp->ipp_sticky_ignored = 0;
-
- /*
- * If TPI options passed in, feed it for verification and handling
- */
- if (tudr_optlen != 0) {
- int error;
-
- if (icmp_unitdata_opt_process(q, mp, &error,
- (void *)ipp) < 0) {
- /* failure */
- BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
- icmp_ud_err(q, mp, error);
- return;
- }
- ignore = ipp->ipp_sticky_ignored;
- ASSERT(error == 0);
+ return (EADDRNOTAVAIL);
}
+ ignore = ipp->ipp_sticky_ignored;
if (sin6->sin6_scope_id != 0 &&
IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
/*
@@ -4763,9 +4829,12 @@ icmp_wput_ipv6(queue_t *q, mblk_t *mp, sin6_t *sin6, t_scalar_t tudr_optlen)
* avoid blowing up our stack here.
*/
if (is_system_labeled() &&
- !IN6_ARE_ADDR_EQUAL(&icmp->icmp_v6lastdst, &ip6_dst) &&
- !icmp_update_label_v6(q, icmp, mp, &ip6_dst)) {
- return;
+ !IN6_ARE_ADDR_EQUAL(&icmp->icmp_v6lastdst, &ip6_dst)) {
+ int error = 0;
+
+ error = icmp_update_label_v6(icmp, mp, &ip6_dst);
+ if (error != 0)
+ return (error);
}
/*
@@ -4933,28 +5002,30 @@ no_options:
ip_hdr_len += sizeof (ip6i_t);
/* check/fix buffer config, setup pointers into it */
- mp1 = mp->b_cont;
- ip6h = (ip6_t *)&mp1->b_rptr[-ip_hdr_len];
- if ((mp1->b_datap->db_ref != 1) ||
- ((unsigned char *)ip6h < mp1->b_datap->db_base) ||
+ ip6h = (ip6_t *)&mp->b_rptr[-ip_hdr_len];
+ if ((mp->b_datap->db_ref != 1) ||
+ ((unsigned char *)ip6h < mp->b_datap->db_base) ||
!OK_32PTR(ip6h)) {
+ mblk_t *mp1;
+
/* Try to get everything in a single mblk next time */
if (ip_hdr_len > icmp->icmp_max_hdr_len) {
icmp->icmp_max_hdr_len = ip_hdr_len;
- (void) mi_set_sth_wroff(RD(q),
+
+ (void) proto_set_tx_wroff(q == NULL ? NULL:RD(q), connp,
icmp->icmp_max_hdr_len + is->is_wroff_extra);
}
mp1 = allocb(ip_hdr_len + is->is_wroff_extra, BPRI_LO);
if (!mp1) {
BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
- icmp_ud_err(q, mp, ENOMEM);
- return;
+ return (ENOMEM);
}
- mp1->b_cont = mp->b_cont;
+ mp1->b_cont = mp;
mp1->b_wptr = mp1->b_datap->db_lim;
ip6h = (ip6_t *)(mp1->b_wptr - ip_hdr_len);
+ mp = mp1;
}
- mp1->b_rptr = (unsigned char *)ip6h;
+ mp->b_rptr = (unsigned char *)ip6h;
ip6i = (ip6i_t *)ip6h;
#define ANCIL_OR_STICKY_PTR(f) ((is_sticky & f) ? &icmp->icmp_sticky_ipp : ipp)
@@ -5140,27 +5211,25 @@ no_options:
* We know that all extension headers will be in the same mblk
* as the IPv6 header.
*/
- rth = ip_find_rthdr_v6(ip6h, mp1->b_wptr);
+ rth = ip_find_rthdr_v6(ip6h, mp->b_wptr);
if (rth != NULL && rth->ip6r_segleft != 0) {
if (rth->ip6r_type != IPV6_RTHDR_TYPE_0) {
/*
* Drop packet - only support Type 0 routing.
* Notify the application as well.
*/
- icmp_ud_err(q, mp, EPROTO);
BUMP_MIB(&is->is_rawip_mib,
rawipOutErrors);
- return;
+ return (EPROTO);
}
/*
* rth->ip6r_len is twice the number of
* addresses in the header
*/
if (rth->ip6r_len & 0x1) {
- icmp_ud_err(q, mp, EPROTO);
BUMP_MIB(&is->is_rawip_mib,
rawipOutErrors);
- return;
+ return (EPROTO);
}
/*
* Shuffle the routing header and ip6_dst
@@ -5176,17 +5245,16 @@ no_options:
* for subsequent hops.
*/
if (IN6_IS_ADDR_V4MAPPED(&ip6h->ip6_dst)) {
- icmp_ud_err(q, mp, EADDRNOTAVAIL);
BUMP_MIB(&is->is_rawip_mib,
rawipOutErrors);
- return;
+ return (EADDRNOTAVAIL);
}
}
}
- ip_len = mp1->b_wptr - (uchar_t *)ip6h - IPV6_HDR_LEN;
- if (mp1->b_cont != NULL)
- ip_len += msgdsize(mp1->b_cont);
+ ip_len = mp->b_wptr - (uchar_t *)ip6h - IPV6_HDR_LEN;
+ if (mp->b_cont != NULL)
+ ip_len += msgdsize(mp->b_cont);
/*
* Set the length into the IP header.
@@ -5196,11 +5264,10 @@ no_options:
*/
if (ip_len > IP_MAXPACKET) {
BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
- icmp_ud_err(q, mp, EMSGSIZE);
- return;
+ return (EMSGSIZE);
}
if (icmp->icmp_proto == IPPROTO_ICMPV6 || icmp->icmp_raw_checksum) {
- uint_t cksum_off; /* From ip6i == mp1->b_rptr */
+ uint_t cksum_off; /* From ip6i == mp->b_rptr */
uint16_t *cksum_ptr;
uint_t ext_hdrs_len;
@@ -5216,14 +5283,14 @@ no_options:
* Note: ICMPv6 must always checksum the packet.
*/
cksum_off = ip_hdr_len + icmp->icmp_checksum_off;
- if (cksum_off + sizeof (uint16_t) > mp1->b_wptr - mp1->b_rptr) {
- if (!pullupmsg(mp1, cksum_off + sizeof (uint16_t))) {
+ if (cksum_off + sizeof (uint16_t) > mp->b_wptr - mp->b_rptr) {
+ if (!pullupmsg(mp, cksum_off + sizeof (uint16_t))) {
BUMP_MIB(&is->is_rawip_mib,
rawipOutErrors);
freemsg(mp);
- return;
+ return (0);
}
- ip6i = (ip6i_t *)mp1->b_rptr;
+ ip6i = (ip6i_t *)mp->b_rptr;
if (ip6i->ip6i_nxt == IPPROTO_RAW)
ip6h = (ip6_t *)&ip6i[1];
else
@@ -5244,11 +5311,10 @@ no_options:
#endif
ip6h->ip6_plen = (uint16_t)ip_len;
- freeb(mp);
-
/* We're done. Pass the packet to IP */
BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams);
- ip_output_v6(icmp->icmp_connp, mp1, q, IP_WPUT);
+ ip_output_v6(icmp->icmp_connp, mp, q, IP_WPUT);
+ return (0);
}
static void
@@ -5281,10 +5347,10 @@ icmp_wput_other(queue_t *q, mblk_t *mp)
return;
case O_T_BIND_REQ:
case T_BIND_REQ:
- icmp_bind(q, mp);
+ icmp_tpi_bind(q, mp);
return;
case T_CONN_REQ:
- icmp_connect(q, mp);
+ icmp_tpi_connect(q, mp);
return;
case T_CAPABILITY_REQ:
icmp_capability_req(q, mp);
@@ -5301,7 +5367,7 @@ icmp_wput_other(queue_t *q, mblk_t *mp)
icmp_ud_err(q, mp, EADDRNOTAVAIL);
return;
case T_UNBIND_REQ:
- icmp_unbind(q, mp);
+ icmp_tpi_unbind(q, mp);
return;
case T_SVR4_OPTMGMT_REQ:
@@ -5319,7 +5385,7 @@ icmp_wput_other(queue_t *q, mblk_t *mp)
return;
case T_DISCON_REQ:
- icmp_disconnect(q, mp);
+ icmp_tpi_disconnect(q, mp);
return;
/* The following TPI message is not supported by icmp. */
@@ -5375,6 +5441,15 @@ icmp_wput_other(queue_t *q, mblk_t *mp)
return;
}
break;
+ case _SIOCSOCKFALLBACK:
+ /*
+ * socket is falling back to be a
+ * streams socket. Nothing to do
+ */
+ iocp->ioc_count = 0;
+ iocp->ioc_rval = 0;
+ qreply(q, mp);
+ return;
default:
break;
}
@@ -5398,10 +5473,8 @@ icmp_wput_iocdata(queue_t *q, mblk_t *mp)
mblk_t *mp1;
STRUCT_HANDLE(strbuf, sb);
icmp_t *icmp;
- in6_addr_t v6addr;
- ipaddr_t v4addr;
- uint32_t flowinfo = 0;
- int addrlen;
+ uint_t addrlen;
+ uint_t error;
/* Make sure it is one of ours. */
switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
@@ -5458,81 +5531,34 @@ icmp_wput_iocdata(queue_t *q, mblk_t *mp)
mi_copy_done(q, mp, EINVAL);
return;
}
+
+ mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE);
+
+ if (mp1 == NULL)
+ return;
+
+ rw_enter(&icmp->icmp_rwlock, RW_READER);
switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
case TI_GETMYNAME:
- if (icmp->icmp_family == AF_INET) {
- ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
- if (!IN6_IS_ADDR_V4MAPPED_ANY(&icmp->icmp_v6src) &&
- !IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) {
- v4addr = V4_PART_OF_V6(icmp->icmp_v6src);
- } else {
- /*
- * INADDR_ANY
- * icmp_v6src is not set, we might be bound to
- * broadcast/multicast. Use icmp_bound_v6src as
- * local address instead (that could
- * also still be INADDR_ANY)
- */
- v4addr = V4_PART_OF_V6(icmp->icmp_bound_v6src);
- }
- } else {
- /* icmp->icmp_family == AF_INET6 */
- if (!IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) {
- v6addr = icmp->icmp_v6src;
- } else {
- /*
- * UNSPECIFIED
- * icmp_v6src is not set, we might be bound to
- * broadcast/multicast. Use icmp_bound_v6src as
- * local address instead (that could
- * also still be UNSPECIFIED)
- */
- v6addr = icmp->icmp_bound_v6src;
- }
- }
+ error = rawip_do_getsockname(icmp, (void *)mp1->b_rptr,
+ &addrlen);
break;
case TI_GETPEERNAME:
- if (icmp->icmp_family == AF_INET) {
- ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
- v4addr = V4_PART_OF_V6(icmp->icmp_v6dst);
- } else {
- /* icmp->icmp_family == AF_INET6) */
- v6addr = icmp->icmp_v6dst;
- flowinfo = icmp->icmp_flowinfo;
- }
+ error = rawip_do_getpeername(icmp, (void *)mp1->b_rptr,
+ &addrlen);
break;
- default:
- mi_copy_done(q, mp, EPROTO);
- return;
}
- mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE);
- if (!mp1)
- return;
-
- if (icmp->icmp_family == AF_INET) {
- sin_t *sin;
+ rw_exit(&icmp->icmp_rwlock);
- STRUCT_FSET(sb, len, (int)sizeof (sin_t));
- sin = (sin_t *)mp1->b_rptr;
- mp1->b_wptr = (uchar_t *)&sin[1];
- *sin = sin_null;
- sin->sin_family = AF_INET;
- sin->sin_addr.s_addr = v4addr;
+ if (error != 0) {
+ mi_copy_done(q, mp, error);
} else {
- /* icmp->icmp_family == AF_INET6 */
- sin6_t *sin6;
+ mp1->b_wptr += addrlen;
+ STRUCT_FSET(sb, len, addrlen);
- ASSERT(icmp->icmp_family == AF_INET6);
- STRUCT_FSET(sb, len, (int)sizeof (sin6_t));
- sin6 = (sin6_t *)mp1->b_rptr;
- mp1->b_wptr = (uchar_t *)&sin6[1];
- *sin6 = sin6_null;
- sin6->sin6_family = AF_INET6;
- sin6->sin6_flowinfo = flowinfo;
- sin6->sin6_addr = v6addr;
+ /* Copy out the address */
+ mi_copyout(q, mp);
}
- /* Copy out the address */
- mi_copyout(q, mp);
}
static int
@@ -5565,7 +5591,7 @@ icmp_unitdata_opt_process(queue_t *q, mblk_t *mp, int *errorp,
}
void
-icmp_ddi_init(void)
+icmp_ddi_g_init(void)
{
icmp_max_optsize = optcom_max_optsize(icmp_opt_obj.odb_opt_des_arr,
icmp_opt_obj.odb_opt_arr_cnt);
@@ -5579,11 +5605,13 @@ icmp_ddi_init(void)
}
void
-icmp_ddi_destroy(void)
+icmp_ddi_g_destroy(void)
{
netstack_unregister(NS_ICMP);
}
+#define INET_NAME "ip"
+
/*
* Initialize the ICMP stack instance.
*/
@@ -5592,6 +5620,8 @@ rawip_stack_init(netstackid_t stackid, netstack_t *ns)
{
icmp_stack_t *is;
icmpparam_t *pa;
+ int error = 0;
+ major_t major;
is = (icmp_stack_t *)kmem_zalloc(sizeof (*is), KM_SLEEP);
is->is_netstack = ns;
@@ -5603,6 +5633,10 @@ rawip_stack_init(netstackid_t stackid, netstack_t *ns)
(void) icmp_param_register(&is->is_nd,
is->is_param_arr, A_CNT(icmp_param_arr));
is->is_ksp = rawip_kstat_init(stackid);
+
+ major = mod_name_to_major(INET_NAME);
+ error = ldi_ident_from_major(major, &is->is_ldi_ident);
+ ASSERT(error == 0);
return (is);
}
@@ -5620,6 +5654,7 @@ rawip_stack_fini(netstackid_t stackid, void *arg)
rawip_kstat_fini(stackid, is->is_ksp);
is->is_ksp = NULL;
+ ldi_ident_release(is->is_ldi_ident);
kmem_free(is, sizeof (*is));
}
@@ -5691,3 +5726,848 @@ rawip_kstat_update(kstat_t *ksp, int rw)
netstack_rele(ns);
return (0);
}
+
+/*
+ * rawip_accept: unconditionally fails with EOPNOTSUPP.  None of the
+ * arguments are examined.
+ */
+/* ARGSUSED */
+int
+rawip_accept(sock_lower_handle_t lproto_handle,
+ sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
+ cred_t *cr)
+{
+ return (EOPNOTSUPP);
+}
+
+/*
+ * Bind (or, when sa is NULL, unbind) the endpoint behind proto_handle.
+ *
+ * Returns 0 on success or an errno value.  Negative results from the
+ * rawip_do_bind()/rawip_do_unbind() helpers are TLI error codes and are
+ * converted with proto_tlitosyserr(), except that -TOUTSTATE is reported
+ * as EINVAL.  The cr argument is unused.
+ */
+/* ARGSUSED */
+int
+rawip_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
+    socklen_t len, cred_t *cr)
+{
+	conn_t	*connp = (conn_t *)proto_handle;
+	int	rc;
+
+	/* Binding to a NULL address really means unbind */
+	rc = (sa != NULL) ? rawip_do_bind(connp, sa, len) :
+	    rawip_do_unbind(connp);
+
+	/* Zero and positive (errno) results are passed through untouched. */
+	if (rc >= 0)
+		return (rc);
+
+	if (rc == -TOUTSTATE)
+		return (EINVAL);
+
+	return (proto_tlitosyserr(-rc));
+}
+
+/*
+ * Bind the endpoint to the wildcard address of its own address family
+ * (INADDR_ANY for AF_INET, the all-zero address for AF_INET6).
+ *
+ * Returns 0 on success or an errno value; a negative (TLI) result from
+ * rawip_do_bind() is converted with proto_tlitosyserr().
+ */
+static int
+rawip_implicit_bind(conn_t *connp)
+{
+	sin6_t		addrbuf;	/* storage shared by both families */
+	socklen_t	addrlen;
+	int		rc;
+
+	if (connp->conn_icmp->icmp_family != AF_INET) {
+		sin6_t	*v6 = (sin6_t *)&addrbuf;
+
+		ASSERT(connp->conn_icmp->icmp_family == AF_INET6);
+		addrlen = sizeof (sin6_t);
+		*v6 = sin6_null;
+		v6->sin6_family = AF_INET6;
+		V6_SET_ZERO(v6->sin6_addr);
+	} else {
+		/* The IPv4 address is built in the front of the buffer. */
+		sin_t	*v4 = (sin_t *)&addrbuf;
+
+		addrlen = sizeof (struct sockaddr_in);
+		*v4 = sin_null;
+		v4->sin_family = AF_INET;
+		v4->sin_addr.s_addr = INADDR_ANY;
+	}
+
+	rc = rawip_do_bind(connp, (struct sockaddr *)&addrbuf, addrlen);
+	if (rc < 0)
+		return (proto_tlitosyserr(-rc));
+
+	return (rc);
+}
+
+/*
+ * Unbind the endpoint via rawip_do_unbind().  Returns 0 or an errno value;
+ * a negative (TLI) result is converted with proto_tlitosyserr().
+ */
+static int
+rawip_unbind(conn_t *connp)
+{
+	int	rc = rawip_do_unbind(connp);
+
+	return ((rc < 0) ? proto_tlitosyserr(-rc) : rc);
+}
+
+/*
+ * rawip_listen: unconditionally fails with EOPNOTSUPP.  None of the
+ * arguments are examined.
+ */
+/* ARGSUSED */
+int
+rawip_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
+{
+ return (EOPNOTSUPP);
+}
+
+/*
+ * Connect the endpoint behind proto_handle to the address in sa, or
+ * disconnect it when sa is NULL.
+ *
+ * A disconnect is only accepted in TS_DATA_XFER state (EINVAL otherwise).
+ * For a connect the address is validated with proto_verify_ip_addr(), an
+ * unbound endpoint is implicitly bound first, and icmp_dgram_errind is
+ * switched on.  On success *id is set to 0 and the su_connected upcall is
+ * made.  If the connect fails, an implicit bind performed by this call is
+ * undone again.
+ *
+ * Returns 0 on success or an errno value; a negative (TLI) result from
+ * rawip_do_connect() is converted with proto_tlitosyserr().  The cr argument
+ * is unused.
+ */
+/* ARGSUSED */
+int
+rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
+    socklen_t len, sock_connid_t *id, cred_t *cr)
+{
+	conn_t		*connp = (conn_t *)proto_handle;
+	icmp_t		*icmp = connp->conn_icmp;
+	int		error;
+	boolean_t	did_bind = B_FALSE;
+
+	if (sa == NULL) {
+		/*
+		 * Disconnect
+		 * Make sure we are connected
+		 */
+		if (icmp->icmp_state != TS_DATA_XFER)
+			return (EINVAL);
+
+		error = icmp_disconnect(connp);
+		return (error);
+	}
+
+	error = proto_verify_ip_addr(icmp->icmp_family, sa, len);
+	if (error != 0)
+		return (error);
+
+	/* do an implicit bind if necessary */
+	if (icmp->icmp_state == TS_UNBND) {
+		error = rawip_implicit_bind(connp);
+		/*
+		 * We could be racing with an actual bind, in which case
+		 * we would see EPROTO. We cross our fingers and try
+		 * to connect.
+		 */
+		if (!(error == 0 || error == EPROTO))
+			return (error);
+		/*
+		 * Only take ownership of the binding when our implicit bind
+		 * actually succeeded.  If we lost the race (EPROTO) the
+		 * binding belongs to the other caller and must not be torn
+		 * down by the failure path below.
+		 */
+		if (error == 0)
+			did_bind = B_TRUE;
+	}
+
+	/*
+	 * set SO_DGRAM_ERRIND
+	 */
+	icmp->icmp_dgram_errind = B_TRUE;
+
+	error = rawip_do_connect(connp, sa, len);
+
+	if (error != 0 && did_bind) {
+		int unbind_err;
+
+		/* Undo the implicit bind made above. */
+		unbind_err = rawip_unbind(connp);
+		ASSERT(unbind_err == 0);
+	}
+
+	if (error == 0) {
+		*id = 0;
+		(*connp->conn_upcalls->su_connected)
+		    (connp->conn_upper_handle, 0, NULL, -1);
+	} else if (error < 0) {
+		/* Negative values are TLI errors. */
+		error = proto_tlitosyserr(-error);
+	}
+	return (error);
+}
+
+/*
+ * Switch this endpoint over to the STREAMS queue pair q.
+ *
+ * In order, this routine: points both sides of q at the conn_t and installs
+ * icmpwinit on the write side; sends an M_SETOPTS message (write offset and
+ * high-water mark) up the read side; closes the IP helper stream; gathers
+ * the capability ack, local and peer addresses and the SO_DGRAM_ERRIND /
+ * SO_DONTROUTE option bits and hands them to quiesced_cb; pushes every
+ * message held on icmp_fallback_queue up the read side; and finally clears
+ * IPCL_NONSTR.  The direct_sockfs argument is unused.
+ */
+/* ARGSUSED */
+void
+rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q,
+ boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb)
+{
+ conn_t *connp = (conn_t *)proto_handle;
+ icmp_t *icmp;
+ struct T_capability_ack tca;
+ struct sockaddr_in6 laddr, faddr;
+ socklen_t laddrlen, faddrlen;
+ short opts;
+ struct stroptions *stropt;
+ mblk_t *stropt_mp;
+ int error;
+
+ icmp = connp->conn_icmp;
+
+ stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, STR_NOSIG, NULL);
+
+ /*
+ * Set up the fallback stream that was allocated.  The values found in
+ * the queues' q_ptr fields are saved in the conn_t before q_ptr is
+ * overwritten with the conn_t pointer.
+ */
+ connp->conn_dev = (dev_t)RD(q)->q_ptr;
+ connp->conn_minor_arena = WR(q)->q_ptr;
+
+ RD(q)->q_ptr = WR(q)->q_ptr = connp;
+
+ WR(q)->q_qinfo = &icmpwinit;
+
+ connp->conn_rq = RD(q);
+ connp->conn_wq = WR(q);
+
+ /* Notify stream head about options before sending up data */
+ stropt_mp->b_datap->db_type = M_SETOPTS;
+ stropt_mp->b_wptr += sizeof (*stropt);
+ stropt = (struct stroptions *)stropt_mp->b_rptr;
+ stropt->so_flags = SO_WROFF | SO_HIWAT;
+ stropt->so_wroff =
+ (ushort_t)(icmp->icmp_max_hdr_len + icmp->icmp_is->is_wroff_extra);
+ stropt->so_hiwat = icmp->icmp_recv_hiwat;
+ putnext(RD(q), stropt_mp);
+
+ /*
+ * Free the helper stream
+ */
+ ip_close_helper_stream(connp);
+
+ /*
+ * Collect the information needed to sync with the sonode
+ */
+ icmp_do_capability_ack(icmp, &tca, TC1_INFO);
+
+ laddrlen = faddrlen = sizeof (sin6_t);
+ (void) rawip_getsockname((sock_lower_handle_t)connp,
+ (struct sockaddr *)&laddr, &laddrlen, NULL);
+ error = rawip_getpeername((sock_lower_handle_t)connp,
+ (struct sockaddr *)&faddr, &faddrlen, NULL);
+ if (error != 0)
+ faddrlen = 0;
+ opts = 0;
+ if (icmp->icmp_dgram_errind)
+ opts |= SO_DGRAM_ERRIND;
+ if (icmp->icmp_dontroute)
+ opts |= SO_DONTROUTE;
+
+ /*
+ * Once we grab the drain lock, no data will be sent up
+ * to the socket. So we notify the socket that the endpoint
+ * is quiescent and it's therefore safe to move data from
+ * the socket to the stream head.
+ * NOTE(review): no lock is acquired in this function before the
+ * callback below -- confirm which lock the text above refers to.
+ */
+ (*quiesced_cb)(connp->conn_upper_handle, q, &tca,
+ (struct sockaddr *)&laddr, laddrlen,
+ (struct sockaddr *)&faddr, faddrlen, opts);
+
+ /*
+ * Push up any packets that were queued in icmp_t.  icmp_recv_lock is
+ * dropped around each putnext() and re-taken before the queue head is
+ * examined again.
+ */
+
+ mutex_enter(&icmp->icmp_recv_lock);
+ while (icmp->icmp_fallback_queue_head != NULL) {
+ mblk_t *mp;
+
+ mp = icmp->icmp_fallback_queue_head;
+ icmp->icmp_fallback_queue_head = mp->b_next;
+ mp->b_next = NULL;
+ mutex_exit(&icmp->icmp_recv_lock);
+ putnext(RD(q), mp);
+ mutex_enter(&icmp->icmp_recv_lock);
+ }
+ icmp->icmp_fallback_queue_tail = icmp->icmp_fallback_queue_head;
+ /*
+ * No longer a STREAMS-less socket
+ */
+ connp->conn_flags &= ~IPCL_NONSTR;
+ mutex_exit(&icmp->icmp_recv_lock);
+ ASSERT(icmp->icmp_fallback_queue_head == NULL &&
+ icmp->icmp_fallback_queue_tail == NULL);
+
+ ASSERT(connp->conn_ref >= 1);
+}
+
+/*
+ * Create a raw IP endpoint for a new socket.
+ *
+ * Only SOCK_RAW with family AF_INET or AF_INET6 is accepted; anything else
+ * fails with EPROTONOSUPPORT.  The conn_t comes from icmp_open() and is
+ * flagged IPCL_NONSTR.  For AF_INET6 the header template is built under
+ * icmp_rwlock.  The receive and transmit high-water marks are taken from the
+ * stack instance, an IP helper stream is created, and CONN_INCIPIENT is
+ * cleared.
+ *
+ * On success returns the conn_t as the lower handle and fills in
+ * *sock_downcalls and *smodep.  On failure returns NULL with *errorp set.
+ * The proto argument is unused.
+ */
+/* ARGSUSED */
+sock_lower_handle_t
+rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
+    uint_t *smodep, int *errorp, int flags, cred_t *credp)
+{
+	conn_t		*connp;
+	icmp_t		*icmp;
+	icmp_stack_t	*is;
+
+	if (!(type == SOCK_RAW &&
+	    (family == AF_INET || family == AF_INET6))) {
+		*errorp = EPROTONOSUPPORT;
+		return (NULL);
+	}
+
+	connp = icmp_open(family, credp, errorp, flags);
+	if (connp == NULL) {
+		/* icmp_open() is expected to have reported the reason. */
+		ASSERT(*errorp != 0);
+		return (NULL);
+	}
+
+	icmp = connp->conn_icmp;
+	is = icmp->icmp_is;
+	connp->conn_flags |= IPCL_NONSTR;
+
+	if (icmp->icmp_family == AF_INET6) {
+		/* Build initial header template for transmit */
+		rw_enter(&icmp->icmp_rwlock, RW_WRITER);
+		*errorp = icmp_build_hdrs(icmp);
+		rw_exit(&icmp->icmp_rwlock);
+
+		if (*errorp != 0) {
+			ipcl_conn_destroy(connp);
+			return (NULL);
+		}
+	}
+
+	icmp->icmp_recv_hiwat = is->is_recv_hiwat;
+	icmp->icmp_xmit_hiwat = is->is_xmit_hiwat;
+
+	*errorp = ip_create_helper_stream(connp, is->is_ldi_ident);
+	if (*errorp != 0) {
+		cmn_err(CE_CONT, "create of IP helper stream failed\n");
+		(void) rawip_do_close(connp);
+		return (NULL);
+	}
+
+	/* The endpoint is fully set up; drop the incipient marker. */
+	mutex_enter(&connp->conn_lock);
+	connp->conn_state_flags &= ~CONN_INCIPIENT;
+	mutex_exit(&connp->conn_lock);
+
+	*sock_downcalls = &sock_rawip_downcalls;
+	*smodep = SM_ATOMIC;
+
+	return ((sock_lower_handle_t)connp);
+}
+
+/*
+ * Record the socket layer's upcall vector and upper handle in the conn_t,
+ * then report this endpoint's properties (write offset, receive high/low
+ * water marks, maximum block size and maximum/minimum packet size) through
+ * the su_set_proto_props upcall.  The flags and cr arguments are unused.
+ */
+/* ARGSUSED */
+void
+rawip_activate(sock_lower_handle_t proto_handle,
+    sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, int flags,
+    cred_t *cr)
+{
+	conn_t			*connp = (conn_t *)proto_handle;
+	icmp_t			*icmp = connp->conn_icmp;
+	icmp_stack_t		*is = icmp->icmp_is;
+	struct sock_proto_props	props;
+
+	connp->conn_upcalls = sock_upcalls;
+	connp->conn_upper_handle = sock_handle;
+
+	/* Every field named in sopp_flags is filled in below. */
+	props.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT |
+	    SOCKOPT_RCVLOWAT | SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ |
+	    SOCKOPT_MINPSZ;
+
+	props.sopp_wroff = icmp->icmp_max_hdr_len + is->is_wroff_extra;
+	props.sopp_rxhiwat = is->is_recv_hiwat;
+	props.sopp_rxlowat = icmp_mod_info.mi_lowat;
+	props.sopp_maxblk = INFPSZ;
+	props.sopp_maxpsz = IP_MAXPACKET;
+
+	/* A module minimum packet size of 1 is reported as 0. */
+	if (icmp_mod_info.mi_minpsz == 1)
+		props.sopp_minpsz = 0;
+	else
+		props.sopp_minpsz = icmp_mod_info.mi_minpsz;
+
+	(*connp->conn_upcalls->su_set_proto_props)
+	    (connp->conn_upper_handle, &props);
+}
+
+/*
+ * Fill *sa with the local address of the endpoint.
+ *
+ * The caller must hold icmp_rwlock (asserted).  On entry *salenp is the size
+ * of the buffer behind sa; on success it is set to the size of the address
+ * written (sin_t or sin6_t).  Returns EINVAL when the buffer is too small for
+ * the endpoint's address family, otherwise 0.
+ *
+ * For an endpoint in TS_UNBND state only the family is set on top of the
+ * sin_null/sin6_null template.  Otherwise icmp_v6src is reported, falling
+ * back to icmp_bound_v6src when icmp_v6src is unset.
+ */
+static int
+rawip_do_getsockname(icmp_t *icmp, struct sockaddr *sa, uint_t *salenp)
+{
+	ASSERT(icmp != NULL);
+	ASSERT(RW_LOCK_HELD(&icmp->icmp_rwlock));
+
+	switch (icmp->icmp_family) {
+	case AF_INET: {
+		sin_t	*v4 = (sin_t *)sa;
+
+		ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
+		if (*salenp < sizeof (sin_t))
+			return (EINVAL);
+
+		*salenp = sizeof (sin_t);
+		*v4 = sin_null;
+		v4->sin_family = AF_INET;
+
+		if (icmp->icmp_state != TS_UNBND) {
+			if (IN6_IS_ADDR_V4MAPPED_ANY(&icmp->icmp_v6src) ||
+			    IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) {
+				/*
+				 * INADDR_ANY
+				 * icmp_v6src is not set, we might be bound to
+				 * broadcast/multicast. Use icmp_bound_v6src as
+				 * local address instead (that could
+				 * also still be INADDR_ANY)
+				 */
+				v4->sin_addr.s_addr =
+				    V4_PART_OF_V6(icmp->icmp_bound_v6src);
+			} else {
+				v4->sin_addr.s_addr =
+				    V4_PART_OF_V6(icmp->icmp_v6src);
+			}
+		}
+		break;
+	}
+	case AF_INET6: {
+		sin6_t	*v6 = (sin6_t *)sa;
+
+		if (*salenp < sizeof (sin6_t))
+			return (EINVAL);
+
+		*salenp = sizeof (sin6_t);
+		*v6 = sin6_null;
+		v6->sin6_family = AF_INET6;
+
+		if (icmp->icmp_state != TS_UNBND) {
+			if (IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) {
+				/*
+				 * UNSPECIFIED
+				 * icmp_v6src is not set, we might be bound to
+				 * broadcast/multicast. Use icmp_bound_v6src as
+				 * local address instead (that could
+				 * also still be UNSPECIFIED)
+				 */
+				v6->sin6_addr = icmp->icmp_bound_v6src;
+			} else {
+				v6->sin6_addr = icmp->icmp_v6src;
+			}
+		}
+		break;
+	}
+	}
+	return (0);
+}
+
+/*
+ * Fill *sa with the address the endpoint is connected to.
+ *
+ * The caller must hold icmp_rwlock (asserted).  On entry *salenp is the size
+ * of the buffer behind sa; on success it is set to the size of the address
+ * written (sin_t or sin6_t).
+ *
+ * Returns ENOTCONN when the endpoint is not in TS_DATA_XFER state, EINVAL
+ * when the buffer is too small for the endpoint's address family, and 0
+ * otherwise.  Nothing is written through sa unless the length check has
+ * passed, so a short caller buffer is never touched.
+ */
+static int
+rawip_do_getpeername(icmp_t *icmp, struct sockaddr *sa, uint_t *salenp)
+{
+	sin_t	*sin = (sin_t *)sa;
+	sin6_t	*sin6 = (sin6_t *)sa;
+
+	ASSERT(icmp != NULL);
+	ASSERT(RW_LOCK_HELD(&icmp->icmp_rwlock));
+
+	if (icmp->icmp_state != TS_DATA_XFER)
+		return (ENOTCONN);
+
+	switch (icmp->icmp_family) {
+	case AF_INET:
+		ASSERT(icmp->icmp_ipversion == IPV4_VERSION);
+
+		if (*salenp < sizeof (sin_t))
+			return (EINVAL);
+
+		*salenp = sizeof (sin_t);
+		*sin = sin_null;
+		sin->sin_family = AF_INET;
+		sin->sin_addr.s_addr =
+		    V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr);
+		break;
+	case AF_INET6:
+		if (*salenp < sizeof (sin6_t))
+			return (EINVAL);
+
+		*salenp = sizeof (sin6_t);
+		/* The stored destination is already a complete sin6_t. */
+		*sin6 = icmp->icmp_v6dst;
+		break;
+	}
+	return (0);
+}
+
+/*
+ * Report the peer address of the endpoint behind proto_handle by calling
+ * rawip_do_getpeername() with icmp_rwlock held as reader.  Returns that
+ * function's result.  The cr argument is unused.
+ */
+/* ARGSUSED */
+int
+rawip_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa,
+    socklen_t *salenp, cred_t *cr)
+{
+	icmp_t	*icmp = ((conn_t *)proto_handle)->conn_icmp;
+	int	rc;
+
+	ASSERT(icmp != NULL);
+
+	rw_enter(&icmp->icmp_rwlock, RW_READER);
+	rc = rawip_do_getpeername(icmp, sa, salenp);
+	rw_exit(&icmp->icmp_rwlock);
+
+	return (rc);
+}
+
+/*
+ * Report the local address of the endpoint behind proto_handle by calling
+ * rawip_do_getsockname() with icmp_rwlock held as reader.  Returns that
+ * function's result.  The cr argument is unused.
+ */
+/* ARGSUSED */
+int
+rawip_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa,
+    socklen_t *salenp, cred_t *cr)
+{
+	icmp_t	*icmp = ((conn_t *)proto_handle)->conn_icmp;
+	int	rc;
+
+	ASSERT(icmp != NULL);
+
+	rw_enter(&icmp->icmp_rwlock, RW_READER);
+	rc = rawip_do_getsockname(icmp, sa, salenp);
+	rw_exit(&icmp->icmp_rwlock);
+
+	return (rc);
+}
+
+/*
+ * Set a socket option on the endpoint behind proto_handle.
+ *
+ * The request is first validated against the ICMP option table with
+ * proto_opt_check().  It is then applied with icmp_opt_set() under
+ * icmp_rwlock held as writer.  A negative result from icmp_opt_set() hands
+ * the request on to ip_set_options().
+ *
+ * Returns 0 on success or an errno value; a negative (TLI) result from the
+ * validation step is converted with proto_tlitosyserr().
+ */
+int
+rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
+    const void *optvalp, socklen_t optlen, cred_t *cr)
+{
+	conn_t	*connp = (conn_t *)proto_handle;
+	icmp_t	*icmp = connp->conn_icmp;
+	int	rc;
+
+	rc = proto_opt_check(level, option_name, optlen, NULL,
+	    icmp_opt_obj.odb_opt_des_arr,
+	    icmp_opt_obj.odb_opt_arr_cnt,
+	    icmp_opt_obj.odb_topmost_tpiprovider,
+	    B_TRUE, B_FALSE, cr);
+
+	/*
+	 * option not recognized
+	 */
+	if (rc < 0)
+		return (proto_tlitosyserr(-rc));
+	if (rc != 0)
+		return (rc);
+
+	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
+	rc = icmp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level,
+	    option_name, optlen, (uchar_t *)optvalp, (uint_t *)&optlen,
+	    (uchar_t *)optvalp, NULL, cr);
+	rw_exit(&icmp->icmp_rwlock);
+
+	if (rc < 0) {
+		/*
+		 * Pass on to ip
+		 */
+		rc = ip_set_options(connp, level, option_name, optvalp,
+		    optlen, cr);
+	}
+
+	ASSERT(rc >= 0);
+
+	return (rc);
+}
+
+/*
+ * Fetch a socket option from the endpoint behind proto_handle.
+ *
+ * The request is validated against the ICMP option table with
+ * proto_opt_check(), which also yields the size of the scratch buffer to
+ * allocate.  The value is read with icmp_opt_get() under icmp_rwlock held as
+ * reader.  A negative length from icmp_opt_get() hands the request on to
+ * ip_get_options().  Otherwise at most *optlen bytes are copied to optvalp
+ * and *optlen is updated to the number of bytes copied.
+ *
+ * Returns 0 on success or an errno value; a negative (TLI) result from the
+ * validation step is converted with proto_tlitosyserr().
+ */
+int
+rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
+    void *optvalp, socklen_t *optlen, cred_t *cr)
+{
+	conn_t		*connp = (conn_t *)proto_handle;
+	icmp_t		*icmp = connp->conn_icmp;
+	t_uscalar_t	scratch_len;
+	void		*scratch;
+	int		rc;
+	int		len;
+
+	rc = proto_opt_check(level, option_name, *optlen, &scratch_len,
+	    icmp_opt_obj.odb_opt_des_arr,
+	    icmp_opt_obj.odb_opt_arr_cnt,
+	    icmp_opt_obj.odb_topmost_tpiprovider,
+	    B_FALSE, B_TRUE, cr);
+
+	if (rc < 0)
+		return (proto_tlitosyserr(-rc));
+	if (rc != 0)
+		return (rc);
+
+	scratch = kmem_alloc(scratch_len, KM_SLEEP);
+
+	rw_enter(&icmp->icmp_rwlock, RW_READER);
+	len = icmp_opt_get(connp, level, option_name, scratch);
+	rw_exit(&icmp->icmp_rwlock);
+
+	if (len >= 0) {
+		/*
+		 * update optlen and copy option value
+		 */
+		t_uscalar_t size = MIN(len, *optlen);
+
+		bcopy(scratch, optvalp, size);
+		bcopy(&size, optlen, sizeof (size));
+
+		kmem_free(scratch, scratch_len);
+		return (0);
+	}
+
+	/*
+	 * Pass on to IP
+	 */
+	kmem_free(scratch, scratch_len);
+	return (ip_get_options(connp, level, option_name, optvalp,
+	    optlen, cr));
+}
+
+/*
+ * Close the endpoint behind proto_handle via rawip_do_close().  The result
+ * of rawip_do_close() is discarded and 0 is always returned.  The flags and
+ * cr arguments are unused.
+ */
+/* ARGSUSED */
+int
+rawip_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
+{
+	(void) rawip_do_close((conn_t *)proto_handle);
+
+	return (0);
+}
+
+/*
+ * Shut down one or both directions of the socket by issuing su_opctl
+ * upcalls: SOCK_OPCTL_SHUT_SEND unless how is SHUT_RD, then
+ * SOCK_OPCTL_SHUT_RECV unless how is SHUT_WR.  Always returns 0.  The cr
+ * argument is unused.
+ */
+/* ARGSUSED */
+int
+rawip_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
+{
+	conn_t	*connp = (conn_t *)proto_handle;
+
+	/* shut down the send side */
+	if (how != SHUT_RD) {
+		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
+		    SOCK_OPCTL_SHUT_SEND, 0);
+	}
+
+	/* shut down the recv side */
+	if (how != SHUT_WR) {
+		(*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
+		    SOCK_OPCTL_SHUT_RECV, 0);
+	}
+
+	return (0);
+}
+
+/*
+ * Clear conn_flow_cntrld on the endpoint behind proto_handle.  The flag is
+ * updated with icmp_recv_lock held.
+ */
+void
+rawip_clr_flowctrl(sock_lower_handle_t proto_handle)
+{
+	conn_t	*connp = (conn_t *)proto_handle;
+
+	mutex_enter(&connp->conn_icmp->icmp_recv_lock);
+	connp->conn_flow_cntrld = B_FALSE;
+	mutex_exit(&connp->conn_icmp->icmp_recv_lock);
+}
+
+/*
+ * Handle an ioctl on the endpoint behind proto_handle.
+ *
+ * ND_SET, ND_GET, _SIOCSOCKFALLBACK, TI_GETPEERNAME and TI_GETMYNAME are
+ * rejected with EINVAL (DEBUG kernels also log the command).  Every other
+ * command is forwarded to IP through the helper stream with ldi_ioctl(),
+ * whose result is returned and which fills in *rvalp.
+ */
+int
+rawip_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
+    int mode, int32_t *rvalp, cred_t *cr)
+{
+	conn_t	*connp = (conn_t *)proto_handle;
+	int	error;
+
+	switch (cmd) {
+	case ND_SET:
+	case ND_GET:
+	case _SIOCSOCKFALLBACK:
+	case TI_GETPEERNAME:
+	case TI_GETMYNAME:
+#ifdef DEBUG
+		/*
+		 * cmn_err() does not append a newline for CE_CONT, so the
+		 * message has to end the line itself.
+		 */
+		cmn_err(CE_CONT, "icmp_ioctl cmd 0x%x on non streams"
+		    " socket\n", cmd);
+#endif
+		error = EINVAL;
+		break;
+	default:
+		/*
+		 * Pass on to IP using helper stream
+		 */
+		error = ldi_ioctl(
+		    connp->conn_helper_info->ip_helper_stream_handle,
+		    cmd, arg, mode, cr, rvalp);
+		break;
+	}
+	return (error);
+}
+
+/*
+ * Send downcall for non-STREAMS raw IP sockets.
+ *
+ * mp must be an M_DATA message. If the endpoint is still unbound an
+ * implicit bind is attempted first. The destination is taken from
+ * msg->msg_name when supplied (rejected with EISCONN on a connected
+ * endpoint), otherwise from the connected address (EDESTADDRREQ if the
+ * endpoint is not connected). Ancillary data in msg->msg_control, if any,
+ * is run through process_auxiliary_options() before the packet is handed
+ * to raw_ip_send_data_v4()/raw_ip_send_data_v6().
+ *
+ * icmp_rwlock is held as writer while the destination and options are
+ * validated and is dropped before the packet is handed to the send routine.
+ *
+ * Return value: address-validation and missing-destination errors are
+ * always returned; any other error is returned only when
+ * icmp_dgram_errind is set, otherwise 0 is returned. On the error paths
+ * in this function mp is freed here; on the send path ownership passes
+ * to the raw_ip_send_data_*() routine (presumably consumed there --
+ * confirm against those routines).
+ */
+/* ARGSUSED */
+int
+rawip_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
+    cred_t *cr)
+{
+	conn_t *connp = (conn_t *)proto_handle;
+	icmp_t *icmp = connp->conn_icmp;
+	icmp_stack_t *is = icmp->icmp_is;
+	int error = 0;
+	boolean_t bypass_dgram_errind = B_FALSE;
+
+	ASSERT(DB_TYPE(mp) == M_DATA);
+
+	if (is_system_labeled())
+		msg_setcredpid(mp, cr, curproc->p_pid);
+
+	/* do an implicit bind if necessary */
+	if (icmp->icmp_state == TS_UNBND) {
+		error = rawip_implicit_bind(connp);
+		/*
+		 * We could be racing with an actual bind, in which case
+		 * we would see EPROTO. We cross our fingers and try
+		 * to send.
+		 */
+		if (!(error == 0 || error == EPROTO)) {
+			freemsg(mp);
+			return (error);
+		}
+	}
+
+	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
+
+	/* An explicit destination is not allowed on a connected endpoint */
+	if (msg->msg_name != NULL && icmp->icmp_state == TS_DATA_XFER) {
+		error = EISCONN;
+		goto done_lock;
+	}
+
+	switch (icmp->icmp_family) {
+	case AF_INET6: {
+		sin6_t *sin6;
+		ip6_pkt_t ipp_s;	/* For ancillary data options */
+		ip6_pkt_t *ipp = &ipp_s;
+
+		sin6 = (sin6_t *)msg->msg_name;
+		if (sin6 != NULL) {
+			error = proto_verify_ip_addr(icmp->icmp_family,
+			    (struct sockaddr *)msg->msg_name, msg->msg_namelen);
+			if (error != 0) {
+				bypass_dgram_errind = B_TRUE;
+				goto done_lock;
+			}
+			if (icmp->icmp_delayed_error != 0) {
+				sin6_t *sin1 = (sin6_t *)msg->msg_name;
+				sin6_t *sin2 = (sin6_t *)
+				    &icmp->icmp_delayed_addr;
+
+				/*
+				 * The pending error is consumed either way;
+				 * it is only reported if it was recorded
+				 * against this same destination.
+				 */
+				error = icmp->icmp_delayed_error;
+				icmp->icmp_delayed_error = 0;
+
+				/* Compare IP address and port */
+
+				if (sin1->sin6_port == sin2->sin6_port &&
+				    IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
+				    &sin2->sin6_addr)) {
+					goto done_lock;
+				}
+			}
+		} else {
+			/*
+			 * Use connected address
+			 */
+			if (icmp->icmp_state != TS_DATA_XFER) {
+				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
+				error = EDESTADDRREQ;
+				bypass_dgram_errind = B_TRUE;
+				goto done_lock;
+			}
+			sin6 = &icmp->icmp_v6dst;
+		}
+
+		/* No support for mapped addresses on raw sockets */
+		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
+			BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
+			error = EADDRNOTAVAIL;
+			goto done_lock;
+		}
+
+		ipp->ipp_fields = 0;
+		ipp->ipp_sticky_ignored = 0;
+
+		/*
+		 * If options passed in, feed it for verification and handling
+		 */
+		if (msg->msg_controllen != 0) {
+			error = process_auxiliary_options(connp,
+			    msg->msg_control, msg->msg_controllen,
+			    ipp, &icmp_opt_obj, icmp_opt_set);
+			if (error != 0) {
+				goto done_lock;
+			}
+		}
+
+		rw_exit(&icmp->icmp_rwlock);
+
+		/*
+		 * Destination is a native IPv6 address.
+		 * Send out an IPv6 format packet.
+		 */
+
+		error = raw_ip_send_data_v6(connp->conn_wq, connp, mp, sin6,
+		    ipp);
+	}
+		break;
+	case AF_INET: {
+		sin_t *sin;
+		ip4_pkt_t pktinfo;
+		ip4_pkt_t *pktinfop = &pktinfo;
+		ipaddr_t v4dst;
+
+		sin = (sin_t *)msg->msg_name;
+		if (sin != NULL) {
+			error = proto_verify_ip_addr(icmp->icmp_family,
+			    (struct sockaddr *)msg->msg_name, msg->msg_namelen);
+			if (error != 0) {
+				bypass_dgram_errind = B_TRUE;
+				goto done_lock;
+			}
+			v4dst = sin->sin_addr.s_addr;
+			if (icmp->icmp_delayed_error != 0) {
+				sin_t *sin1 = (sin_t *)msg->msg_name;
+				sin_t *sin2 = (sin_t *)&icmp->icmp_delayed_addr;
+
+				/*
+				 * The pending error is consumed either way;
+				 * it is only reported if it was recorded
+				 * against this same destination.
+				 */
+				error = icmp->icmp_delayed_error;
+				icmp->icmp_delayed_error = 0;
+
+				/* Compare IP address and port */
+				if (sin1->sin_port == sin2->sin_port &&
+				    sin1->sin_addr.s_addr ==
+				    sin2->sin_addr.s_addr) {
+					goto done_lock;
+				}
+
+			}
+		} else {
+			/*
+			 * Use connected address
+			 */
+			if (icmp->icmp_state != TS_DATA_XFER) {
+				BUMP_MIB(&is->is_rawip_mib, rawipOutErrors);
+				error = EDESTADDRREQ;
+				bypass_dgram_errind = B_TRUE;
+				goto done_lock;
+			}
+			v4dst = V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr);
+		}
+
+
+		pktinfop->ip4_ill_index = 0;
+		pktinfop->ip4_addr = INADDR_ANY;
+
+		/*
+		 * If options passed in, feed it for verification and handling
+		 */
+		if (msg->msg_controllen != 0) {
+			error = process_auxiliary_options(connp,
+			    msg->msg_control, msg->msg_controllen,
+			    pktinfop, &icmp_opt_obj, icmp_opt_set);
+			if (error != 0) {
+				goto done_lock;
+			}
+		}
+		rw_exit(&icmp->icmp_rwlock);
+
+		error = raw_ip_send_data_v4(connp->conn_wq, connp, mp,
+		    v4dst, pktinfop);
+		break;
+	}
+
+	default:
+		/*
+		 * Not expected for a correctly created endpoint. The ASSERT
+		 * compiles away on non-DEBUG kernels, so fail the send
+		 * through done_lock rather than falling out of the switch
+		 * with icmp_rwlock still held and mp never freed.
+		 */
+		ASSERT(0);
+		error = EAFNOSUPPORT;
+		bypass_dgram_errind = B_TRUE;
+		goto done_lock;
+	}
+
+	goto done;
+
+done_lock:
+	/* Error exit taken while icmp_rwlock is still held */
+	rw_exit(&icmp->icmp_rwlock);
+	if (error != 0) {
+		ASSERT(mp != NULL);
+		freemsg(mp);
+	}
+done:
+	if (bypass_dgram_errind)
+		return (error);
+	return (icmp->icmp_dgram_errind ? error : 0);
+}
+
+/*
+ * Downcall vector handed to the socket layer for raw IP sockets.
+ * The initializer is positional, so entry order must match the member
+ * order of sock_downcalls_t.
+ */
+sock_downcalls_t sock_rawip_downcalls = {
+ rawip_activate,
+ rawip_accept,
+ rawip_bind,
+ rawip_listen,
+ rawip_connect,
+ rawip_getpeername,
+ rawip_getsockname,
+ rawip_getsockopt,
+ rawip_setsockopt,
+ rawip_send,
+ /*
+ * The next three entries are left unimplemented for raw IP.
+ * NOTE(review): presumably the uio send/recv and poll downcalls --
+ * confirm against the sock_downcalls_t definition.
+ */
+ NULL,
+ NULL,
+ NULL,
+ rawip_shutdown,
+ rawip_clr_flowctrl,
+ rawip_ioctl,
+ rawip_close
+};
diff --git a/usr/src/uts/common/inet/ip/icmp_opt_data.c b/usr/src/uts/common/inet/ip/icmp_opt_data.c
index 8769a7d3d4..4f15801dfb 100644
--- a/usr/src/uts/common/inet/ip/icmp_opt_data.c
+++ b/usr/src/uts/common/inet/ip/icmp_opt_data.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/stream.h>
#define _SUN_TPI_VERSION 2
@@ -52,8 +50,8 @@
extern int icmp_opt_default(queue_t *, int, int, uchar_t *);
-extern int icmp_opt_get(queue_t *, int, int, uchar_t *);
-extern int icmp_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *,
+extern int icmp_tpi_opt_get(queue_t *, int, int, uchar_t *);
+extern int icmp_tpi_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *,
uint_t *, uchar_t *, void *, cred_t *, mblk_t *);
/*
@@ -96,10 +94,10 @@ opdes_t icmp_opt_arr[] = {
{ IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
(OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT),
- 40, -1 /* not initialized */ },
+ IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
{ T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
(OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT),
- 40, -1 /* not initialized */ },
+ IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
{ IP_HDRINCL, IPPROTO_IP, OA_R, OA_RW, OP_RAW, OP_PASSNEXT,
sizeof (int), 0 },
@@ -347,8 +345,8 @@ uint_t icmp_max_optsize; /* initialized when ICMP driver is loaded */
optdb_obj_t icmp_opt_obj = {
icmp_opt_default, /* ICMP default value function pointer */
- icmp_opt_get, /* ICMP get function pointer */
- icmp_opt_set, /* ICMP set function pointer */
+ icmp_tpi_opt_get, /* ICMP get function pointer */
+ icmp_tpi_opt_set, /* ICMP set function pointer */
B_TRUE, /* ICMP is tpi provider */
ICMP_OPT_ARR_CNT, /* ICMP option database count of entries */
icmp_opt_arr, /* ICMP option database */
diff --git a/usr/src/uts/common/inet/ip/icmpddi.c b/usr/src/uts/common/inet/ip/icmpddi.c
index a5861d9120..dd0023c0c8 100644
--- a/usr/src/uts/common/inet/ip/icmpddi.c
+++ b/usr/src/uts/common/inet/ip/icmpddi.c
@@ -29,6 +29,9 @@
#include <sys/modctl.h>
#include <inet/common.h>
#include <inet/ip.h>
+#include <inet/rawip_impl.h>
+#include <sys/strsubr.h>
+#include <sys/socketvar.h>
#define INET_NAME "icmp"
#define INET_MODDESC "ICMP dummy STREAMS module"
@@ -36,6 +39,9 @@
#define INET_DEVMINOR 0
#define INET_DEVSTRTAB icmpinfov4
#define INET_MODSTRTAB dummymodinfo
+#define INET_SOCKDESC "Rawip socket module"
+#define INET_SOCK_PROTO_CREATE_FUNC (*rawip_create)
+#define INET_SOCK_PROTO_FB_FUNC (*rawip_fallback)
#define INET_DEVMTFLAGS D_MP
#define INET_MODMTFLAGS D_MP
diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c
index b0eaa51983..3141cd914e 100644
--- a/usr/src/uts/common/inet/ip/ip.c
+++ b/usr/src/uts/common/inet/ip/ip.c
@@ -38,7 +38,6 @@
#include <sys/tihdr.h>
#include <sys/xti_inet.h>
#include <sys/ddi.h>
-#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/kobj.h>
@@ -120,7 +119,6 @@
#include <inet/udp_impl.h>
#include <inet/rawip_impl.h>
#include <inet/rts_impl.h>
-#include <sys/sunddi.h>
#include <sys/tsol/label.h>
#include <sys/tsol/tnet.h>
@@ -625,7 +623,7 @@ uint_t ip_max_frag_dups = 10;
#define IS_SIMPLE_IPH(ipha) \
((ipha)->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)
-/* RFC1122 Conformance */
+/* RFC 1122 Conformance */
#define IP_FORWARD_DEFAULT IP_FORWARD_NEVER
#define ILL_MAX_NAMELEN LIFNAMSIZ
@@ -658,8 +656,7 @@ static void icmp_send_redirect(queue_t *, mblk_t *, ipaddr_t,
ip_stack_t *);
static void ip_arp_news(queue_t *, mblk_t *);
-static boolean_t ip_bind_insert_ire(mblk_t *, ire_t *, iulp_t *,
- ip_stack_t *);
+static boolean_t ip_bind_get_ire_v4(mblk_t **, ire_t *, iulp_t *, ip_stack_t *);
mblk_t *ip_dlpi_alloc(size_t, t_uscalar_t);
char *ip_dot_addr(ipaddr_t, char *);
mblk_t *ip_carve_mp(mblk_t **, ssize_t);
@@ -770,6 +767,8 @@ static void ip_multirt_bad_mtu(ire_t *, uint32_t);
static int ip_cgtp_filter_get(queue_t *, mblk_t *, caddr_t, cred_t *);
static int ip_cgtp_filter_set(queue_t *, mblk_t *, char *,
caddr_t, cred_t *);
+extern int ip_helper_stream_setup(queue_t *, dev_t *, int, int,
+ cred_t *, boolean_t);
static int ip_input_proc_set(queue_t *q, mblk_t *mp, char *value,
caddr_t cp, cred_t *cr);
static int ip_int_set(queue_t *, mblk_t *, char *, caddr_t,
@@ -1318,6 +1317,7 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = {
ip_sioctl_set_ipmpfailback, NULL },
/* SIOCSENABLESDP is handled by SDP */
/* 183 */ { IPI_DONTCARE /* SIOCSENABLESDP */, 0, 0, 0, NULL, NULL },
+ /* 184 */ { IPI_DONTCARE /* SIOCSQPTR */, 0, 0, 0, NULL, NULL },
};
int ip_ndx_ioctl_count = sizeof (ip_ndx_ioctl_table) / sizeof (ip_ioctl_cmd_t);
@@ -1373,7 +1373,8 @@ static ipha_t icmp_ipha = {
};
struct module_info ip_mod_info = {
- IP_MOD_ID, IP_MOD_NAME, 1, INFPSZ, 65536, 1024
+ IP_MOD_ID, IP_MOD_NAME, IP_MOD_MINPSZ, IP_MOD_MAXPSZ, IP_MOD_HIWAT,
+ IP_MOD_LOWAT
};
/*
@@ -4334,6 +4335,23 @@ ip_bind_ipsec_policy_set(conn_t *connp, mblk_t *policy_mp)
return (B_TRUE);
}
+/*
+ * Common tail of a successful bind: record the IPsec header overhead in
+ * the ire carried by mp, when the caller asked for an ire and outbound
+ * policy is being enforced on this conn.
+ */
+static void
+ip_bind_post_handling(conn_t *connp, mblk_t *mp, boolean_t ire_requested)
+{
+	ire_t *ire;
+
+	/*
+	 * Pass the IPsec headers size in ire_ipsec_overhead.
+	 * We can't do this in ip_bind_get_ire because the policy
+	 * may not have been inherited at that point in time and hence
+	 * conn_out_enforce_policy may not be set.
+	 */
+	if (!ire_requested || !connp->conn_out_enforce_policy)
+		return;
+	if (mp == NULL || DB_TYPE(mp) != IRE_DB_REQ_TYPE)
+		return;
+
+	ASSERT(MBLKL(mp) >= sizeof (ire_t));
+	ire = (ire_t *)mp->b_rptr;
+	ire->ire_ipsec_overhead = conn_ipsec_length(connp);
+}
+
+
/*
* Upper level protocols (ULP) pass through bind requests to IP for inspection
* and to arrange for power-fanout assist. The ULP is identified by
@@ -4374,7 +4392,6 @@ ip_bind_v4(queue_t *q, mblk_t *mp, conn_t *connp)
uchar_t *ucp;
mblk_t *mp1;
boolean_t ire_requested;
- boolean_t ipsec_policy_set = B_FALSE;
int error = 0;
int protocol;
ipa_conn_x_t *acx;
@@ -4453,7 +4470,6 @@ ip_bind_v4(queue_t *q, mblk_t *mp, conn_t *connp)
mp1 = mp->b_cont;
ire_requested = (mp1 != NULL && DB_TYPE(mp1) == IRE_DB_REQ_TYPE);
- ipsec_policy_set = (mp1 != NULL && DB_TYPE(mp1) == IPSEC_POLICY_SET);
switch (tbr->ADDR_length) {
default:
@@ -4463,14 +4479,14 @@ ip_bind_v4(queue_t *q, mblk_t *mp, conn_t *connp)
case IP_ADDR_LEN:
/* Verification of local address only */
- error = ip_bind_laddr(connp, mp, *(ipaddr_t *)ucp, 0,
- ire_requested, ipsec_policy_set, B_FALSE);
+ error = ip_bind_laddr_v4(connp, &mp1, protocol,
+ *(ipaddr_t *)ucp, 0, B_FALSE);
break;
case sizeof (sin_t):
sin = (sin_t *)ucp;
- error = ip_bind_laddr(connp, mp, sin->sin_addr.s_addr,
- sin->sin_port, ire_requested, ipsec_policy_set, B_TRUE);
+ error = ip_bind_laddr_v4(connp, &mp1, protocol,
+ sin->sin_addr.s_addr, sin->sin_port, B_TRUE);
break;
case sizeof (ipa_conn_t):
@@ -4479,9 +4495,9 @@ ip_bind_v4(queue_t *q, mblk_t *mp, conn_t *connp)
if (ac->ac_lport == 0)
ac->ac_lport = connp->conn_lport;
/* Always verify destination reachability. */
- error = ip_bind_connected(connp, mp, &ac->ac_laddr,
- ac->ac_lport, ac->ac_faddr, ac->ac_fport, ire_requested,
- ipsec_policy_set, B_TRUE, B_TRUE);
+ error = ip_bind_connected_v4(connp, &mp1, protocol,
+ &ac->ac_laddr, ac->ac_lport, ac->ac_faddr, ac->ac_fport,
+ B_TRUE, B_TRUE);
break;
case sizeof (ipa_conn_x_t):
@@ -4490,29 +4506,17 @@ ip_bind_v4(queue_t *q, mblk_t *mp, conn_t *connp)
* Whether or not to verify destination reachability depends
* on the setting of the ACX_VERIFY_DST flag in acx->acx_flags.
*/
- error = ip_bind_connected(connp, mp, &acx->acx_conn.ac_laddr,
- acx->acx_conn.ac_lport, acx->acx_conn.ac_faddr,
- acx->acx_conn.ac_fport, ire_requested, ipsec_policy_set,
+ error = ip_bind_connected_v4(connp, &mp1, protocol,
+ &acx->acx_conn.ac_laddr, acx->acx_conn.ac_lport,
+ acx->acx_conn.ac_faddr, acx->acx_conn.ac_fport,
B_TRUE, (acx->acx_flags & ACX_VERIFY_DST) != 0);
break;
}
- if (error == EINPROGRESS)
- return (NULL);
- else if (error != 0)
+ ASSERT(error != EINPROGRESS);
+ if (error != 0)
goto bad_addr;
- /*
- * Pass the IPsec headers size in ire_ipsec_overhead.
- * We can't do this in ip_bind_insert_ire because the policy
- * may not have been inherited at that point in time and hence
- * conn_out_enforce_policy may not be set.
- */
- mp1 = mp->b_cont;
- if (ire_requested && connp->conn_out_enforce_policy &&
- mp1 != NULL && DB_TYPE(mp1) == IRE_DB_REQ_TYPE) {
- ire_t *ire = (ire_t *)mp1->b_rptr;
- ASSERT(MBLKL(mp1) >= sizeof (ire_t));
- ire->ire_ipsec_overhead = conn_ipsec_length(connp);
- }
+
+ ip_bind_post_handling(connp, mp->b_cont, ire_requested);
/* Send it home. */
mp->b_datap->db_type = M_PCPROTO;
@@ -4539,7 +4543,7 @@ bad_addr:
* upper protocol is expected to reset the src address
* to 0 if it sees a IRE_BROADCAST type returned so that
* no packets are emitted with broadcast/multicast address as
- * source address (that violates hosts requirements RFC1122)
+ * source address (that violates hosts requirements RFC 1122)
* The addresses valid for bind are:
* (1) - INADDR_ANY (0)
* (2) - IP address of an UP interface
@@ -4561,21 +4565,26 @@ bad_addr:
* matching IREs so bind has to look up based on the zone.
*
* Note: lport is in network byte order.
+ *
*/
int
-ip_bind_laddr(conn_t *connp, mblk_t *mp, ipaddr_t src_addr, uint16_t lport,
- boolean_t ire_requested, boolean_t ipsec_policy_set,
- boolean_t fanout_insert)
+ip_bind_laddr_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol,
+ ipaddr_t src_addr, uint16_t lport, boolean_t fanout_insert)
{
int error = 0;
ire_t *src_ire;
- mblk_t *policy_mp;
- ipif_t *ipif;
zoneid_t zoneid;
ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
+ mblk_t *mp = NULL;
+ boolean_t ire_requested = B_FALSE;
+ boolean_t ipsec_policy_set = B_FALSE;
- if (ipsec_policy_set) {
- policy_mp = mp->b_cont;
+ if (mpp)
+ mp = *mpp;
+
+ if (mp != NULL) {
+ ire_requested = (DB_TYPE(mp) == IRE_DB_REQ_TYPE);
+ ipsec_policy_set = (DB_TYPE(mp) == IPSEC_POLICY_SET);
}
/*
@@ -4585,7 +4594,6 @@ ip_bind_laddr(conn_t *connp, mblk_t *mp, ipaddr_t src_addr, uint16_t lport,
connp->conn_fully_bound = B_FALSE;
src_ire = NULL;
- ipif = NULL;
zoneid = IPCL_ZONEID(connp);
@@ -4598,7 +4606,7 @@ ip_bind_laddr(conn_t *connp, mblk_t *mp, ipaddr_t src_addr, uint16_t lport,
* Note: Following code is in if-else-if form for
* readability compared to a condition check.
*/
- /* LINTED - statement has no consequent */
+ /* LINTED - statement has no consequence */
if (IRE_IS_LOCAL(src_ire)) {
/*
* (2) Bind to address of local UP interface
@@ -4617,20 +4625,10 @@ ip_bind_laddr(conn_t *connp, mblk_t *mp, ipaddr_t src_addr, uint16_t lport,
* (ipif_lookup_addr() looks up all interfaces
* but we do not get here for UP interfaces
* - case (2) above)
- * We put the protocol byte back into the mblk
- * since we may come back via ip_wput_nondata()
- * later with this mblk if ipif_lookup_addr chooses
- * to defer processing.
- */
- *mp->b_wptr++ = (char)connp->conn_ulp;
- if ((ipif = ipif_lookup_addr(src_addr, NULL, zoneid,
- CONNP_TO_WQ(connp), mp, ip_wput_nondata,
- &error, ipst)) != NULL) {
- ipif_refrele(ipif);
- } else if (error == EINPROGRESS) {
- if (src_ire != NULL)
- ire_refrele(src_ire);
- return (EINPROGRESS);
+ */
+ /* LINTED - statement has no consequent */
+ if (ip_addr_exists(src_addr, zoneid, ipst)) {
+ /* The address exists */
} else if (CLASSD(src_addr)) {
error = 0;
if (src_ire != NULL)
@@ -4653,20 +4651,16 @@ ip_bind_laddr(conn_t *connp, mblk_t *mp, ipaddr_t src_addr, uint16_t lport,
*/
error = EADDRNOTAVAIL;
}
- /*
- * Just to keep it consistent with the processing in
- * ip_bind_v4()
- */
- mp->b_wptr--;
}
if (error) {
/* Red Alert! Attempting to be a bogon! */
- ip1dbg(("ip_bind: bad src address 0x%x\n",
+ ip1dbg(("ip_bind_laddr_v4: bad src address 0x%x\n",
ntohl(src_addr)));
goto bad_addr;
}
}
+
/*
* Allow setting new policies. For example, disconnects come
* down as ipa_t bind. As we would have set conn_policy_cached
@@ -4690,17 +4684,17 @@ ip_bind_laddr(conn_t *connp, mblk_t *mp, ipaddr_t src_addr, uint16_t lport,
/*
* Do we need to add a check to reject Multicast packets
*/
- error = ipcl_bind_insert(connp, *mp->b_wptr, src_addr, lport);
+ error = ipcl_bind_insert(connp, protocol, src_addr, lport);
}
if (error == 0) {
if (ire_requested) {
- if (!ip_bind_insert_ire(mp, src_ire, NULL, ipst)) {
+ if (!ip_bind_get_ire_v4(mpp, src_ire, NULL, ipst)) {
error = -1;
/* Falls through to bad_addr */
}
} else if (ipsec_policy_set) {
- if (!ip_bind_ipsec_policy_set(connp, policy_mp)) {
+ if (!ip_bind_ipsec_policy_set(connp, mp)) {
error = -1;
/* Falls through to bad_addr */
}
@@ -4717,15 +4711,32 @@ bad_addr:
}
if (src_ire != NULL)
IRE_REFRELE(src_ire);
- if (ipsec_policy_set) {
- ASSERT(policy_mp == mp->b_cont);
- ASSERT(policy_mp != NULL);
- freeb(policy_mp);
- /*
- * As of now assume that nothing else accompanies
- * IPSEC_POLICY_SET.
- */
- mp->b_cont = NULL;
+ return (error);
+}
+
+/*
+ * Local-address bind entry point that takes the conn directly rather
+ * than a TPI bind message. Marks the conn as IPv4, records the upper
+ * layer protocol, and hands the request to ip_bind_laddr_v4().
+ *
+ * ire_mpp may be NULL; when it is not, *ire_mpp is the optional
+ * IRE_DB_REQ_TYPE / IPSEC_POLICY_SET mblk and may be replaced by the
+ * callee.
+ *
+ * Returns 0 on success, a positive errno, or -TBADADDR when the
+ * underlying bind reports a negative (TLI-style) failure.
+ */
+int
+ip_proto_bind_laddr_v4(conn_t *connp, mblk_t **ire_mpp, uint8_t protocol,
+ ipaddr_t src_addr, uint16_t lport, boolean_t fanout_insert)
+{
+ int error;
+ mblk_t *mp = NULL;
+ boolean_t ire_requested;
+
+ if (ire_mpp)
+ mp = *ire_mpp;
+ /* Sample the request type now; the bind below may rewrite *ire_mpp */
+ ire_requested = (mp != NULL && DB_TYPE(mp) == IRE_DB_REQ_TYPE);
+
+ ASSERT(!connp->conn_af_isv6);
+ connp->conn_pkt_isv6 = B_FALSE;
+ connp->conn_ulp = protocol;
+
+ error = ip_bind_laddr_v4(connp, ire_mpp, protocol, src_addr, lport,
+ fanout_insert);
+ if (error == 0) {
+ ip_bind_post_handling(connp, ire_mpp ? *ire_mpp : NULL,
+ ire_requested);
+ } else if (error < 0) {
+ /* Collapse any negative failure code into -TBADADDR */
+ error = -TBADADDR;
}
return (error);
}
@@ -4746,16 +4757,14 @@ bad_addr:
* Note: lport and fport are in network byte order.
*/
int
-ip_bind_connected(conn_t *connp, mblk_t *mp, ipaddr_t *src_addrp,
- uint16_t lport, ipaddr_t dst_addr, uint16_t fport,
- boolean_t ire_requested, boolean_t ipsec_policy_set,
+ip_bind_connected_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol,
+ ipaddr_t *src_addrp, uint16_t lport, ipaddr_t dst_addr, uint16_t fport,
boolean_t fanout_insert, boolean_t verify_dst)
{
+
ire_t *src_ire;
ire_t *dst_ire;
int error = 0;
- int protocol;
- mblk_t *policy_mp;
ire_t *sire = NULL;
ire_t *md_dst_ire = NULL;
ire_t *lso_dst_ire = NULL;
@@ -4763,25 +4772,33 @@ ip_bind_connected(conn_t *connp, mblk_t *mp, ipaddr_t *src_addrp,
zoneid_t zoneid;
ipaddr_t src_addr = *src_addrp;
ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
+ mblk_t *mp = NULL;
+ boolean_t ire_requested = B_FALSE;
+ boolean_t ipsec_policy_set = B_FALSE;
+ ts_label_t *tsl = NULL;
+
+ if (mpp)
+ mp = *mpp;
+
+ if (mp != NULL) {
+ ire_requested = (DB_TYPE(mp) == IRE_DB_REQ_TYPE);
+ ipsec_policy_set = (DB_TYPE(mp) == IPSEC_POLICY_SET);
+ tsl = MBLK_GETLABEL(mp);
+ }
src_ire = dst_ire = NULL;
- protocol = *mp->b_wptr & 0xFF;
/*
* If we never got a disconnect before, clear it now.
*/
connp->conn_fully_bound = B_FALSE;
- if (ipsec_policy_set) {
- policy_mp = mp->b_cont;
- }
-
zoneid = IPCL_ZONEID(connp);
if (CLASSD(dst_addr)) {
/* Pick up an IRE_BROADCAST */
dst_ire = ire_route_lookup(ip_g_all_ones, 0, 0, 0, NULL,
- NULL, zoneid, MBLK_GETLABEL(mp),
+ NULL, zoneid, tsl,
(MATCH_IRE_RECURSIVE |
MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE |
MATCH_IRE_SECATTR), ipst);
@@ -4804,11 +4821,11 @@ ip_bind_connected(conn_t *connp, mblk_t *mp, ipaddr_t *src_addrp,
if (connp->conn_nexthop_set) {
dst_ire = ire_route_lookup(connp->conn_nexthop_v4, 0,
- 0, 0, NULL, NULL, zoneid, MBLK_GETLABEL(mp),
+ 0, 0, NULL, NULL, zoneid, tsl,
MATCH_IRE_SECATTR, ipst);
} else {
dst_ire = ire_route_lookup(dst_addr, 0, 0, 0, NULL,
- &sire, zoneid, MBLK_GETLABEL(mp),
+ &sire, zoneid, tsl,
(MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
MATCH_IRE_PARENT | MATCH_IRE_RJ_BHOLE |
MATCH_IRE_SECATTR), ipst);
@@ -4840,8 +4857,9 @@ ip_bind_connected(conn_t *connp, mblk_t *mp, ipaddr_t *src_addrp,
*/
if (verify_dst || (dst_ire != NULL)) {
if (ip_debug > 2) {
- pr_addr_dbg("ip_bind_connected: bad connected "
- "dst %s\n", AF_INET, &dst_addr);
+ pr_addr_dbg("ip_bind_connected_v4:"
+ "bad connected dst %s\n",
+ AF_INET, &dst_addr);
}
if (dst_ire == NULL || !(dst_ire->ire_type & IRE_HOST))
error = ENETUNREACH;
@@ -4872,7 +4890,8 @@ ip_bind_connected(conn_t *connp, mblk_t *mp, ipaddr_t *src_addrp,
connp->conn_mac_exempt, ipst) != 0) {
error = EHOSTUNREACH;
if (ip_debug > 2) {
- pr_addr_dbg("ip_bind_connected: no label for dst %s\n",
+ pr_addr_dbg("ip_bind_connected_v4:"
+ " no label for dst %s\n",
AF_INET, &dst_addr);
}
goto bad_addr;
@@ -5056,7 +5075,7 @@ ip_bind_connected(conn_t *connp, mblk_t *mp, ipaddr_t *src_addrp,
/* src_ire must be a local|loopback */
if (!IRE_IS_LOCAL(src_ire)) {
if (ip_debug > 2) {
- pr_addr_dbg("ip_bind_connected: bad connected "
+ pr_addr_dbg("ip_bind_connected_v4: bad connected "
"src %s\n", AF_INET, &src_addr);
}
error = EADDRNOTAVAIL;
@@ -5071,7 +5090,7 @@ ip_bind_connected(conn_t *connp, mblk_t *mp, ipaddr_t *src_addrp,
*/
if (src_ire->ire_type == IRE_LOOPBACK &&
!(IRE_IS_LOCAL(dst_ire) || CLASSD(dst_addr))) {
- ip1dbg(("ip_bind_connected: bad connected loopback\n"));
+ ip1dbg(("ip_bind_connected_v4: bad connected loopback\n"));
error = -1;
goto bad_addr;
}
@@ -5114,12 +5133,13 @@ ip_bind_connected(conn_t *connp, mblk_t *mp, ipaddr_t *src_addrp,
if (sire != NULL) {
ulp_info = &(sire->ire_uinfo);
}
- if (!ip_bind_insert_ire(mp, dst_ire, ulp_info, ipst)) {
+ if (!ip_bind_get_ire_v4(mpp, dst_ire, ulp_info, ipst)) {
error = -1;
goto bad_addr;
}
+ mp = *mpp;
} else if (ipsec_policy_set) {
- if (!ip_bind_ipsec_policy_set(connp, policy_mp)) {
+ if (!ip_bind_ipsec_policy_set(connp, mp)) {
error = -1;
goto bad_addr;
}
@@ -5171,27 +5191,36 @@ ip_bind_connected(conn_t *connp, mblk_t *mp, ipaddr_t *src_addrp,
ASSERT(ill->ill_lso_capab != NULL);
if ((lsoinfo_mp = ip_lsoinfo_return(lso_dst_ire, connp,
- ill->ill_name, ill->ill_lso_capab)) != NULL)
- linkb(mp, lsoinfo_mp);
+ ill->ill_name, ill->ill_lso_capab)) != NULL) {
+ if (mp == NULL) {
+ *mpp = lsoinfo_mp;
+ } else {
+ linkb(mp, lsoinfo_mp);
+ }
+ }
} else if (md_dst_ire != NULL) {
mblk_t *mdinfo_mp;
ASSERT(ill->ill_mdt_capab != NULL);
if ((mdinfo_mp = ip_mdinfo_return(md_dst_ire, connp,
- ill->ill_name, ill->ill_mdt_capab)) != NULL)
- linkb(mp, mdinfo_mp);
+ ill->ill_name, ill->ill_mdt_capab)) != NULL) {
+ if (mp == NULL) {
+ *mpp = mdinfo_mp;
+ } else {
+ linkb(mp, mdinfo_mp);
+ }
+ }
}
}
bad_addr:
if (ipsec_policy_set) {
- ASSERT(policy_mp == mp->b_cont);
- ASSERT(policy_mp != NULL);
- freeb(policy_mp);
+ ASSERT(mp != NULL);
+ freeb(mp);
/*
* As of now assume that nothing else accompanies
* IPSEC_POLICY_SET.
*/
- mp->b_cont = NULL;
+ *mpp = NULL;
}
if (src_ire != NULL)
IRE_REFRELE(src_ire);
@@ -5206,32 +5235,62 @@ bad_addr:
return (error);
}
+/*
+ * Connected bind entry point that takes the conn directly rather than a
+ * TPI bind message. Marks the conn as IPv4, records the upper layer
+ * protocol, and hands the request to ip_bind_connected_v4().
+ *
+ * Returns 0 on success, a positive errno, or -TBADADDR when the
+ * underlying bind reports a negative failure code.
+ */
+int
+ip_proto_bind_connected_v4(conn_t *connp, mblk_t **ire_mpp, uint8_t protocol,
+    ipaddr_t *src_addrp, uint16_t lport, ipaddr_t dst_addr, uint16_t fport,
+    boolean_t fanout_insert, boolean_t verify_dst)
+{
+	mblk_t *req_mp = (ire_mpp != NULL) ? *ire_mpp : NULL;
+	boolean_t want_ire;
+	int rc;
+
+	/* Sample the request type before the bind can rewrite *ire_mpp */
+	want_ire = (req_mp != NULL && DB_TYPE(req_mp) == IRE_DB_REQ_TYPE);
+
+	ASSERT(!connp->conn_af_isv6);
+	connp->conn_pkt_isv6 = B_FALSE;
+	connp->conn_ulp = protocol;
+
+	/* For raw socket, the local port is not set. */
+	if (lport == 0)
+		lport = connp->conn_lport;
+
+	rc = ip_bind_connected_v4(connp, ire_mpp, protocol,
+	    src_addrp, lport, dst_addr, fport, fanout_insert, verify_dst);
+	if (rc < 0)
+		return (-TBADADDR);
+	if (rc == 0) {
+		ip_bind_post_handling(connp,
+		    (ire_mpp != NULL) ? *ire_mpp : NULL, want_ire);
+	}
+	return (rc);
+}
+
+
/*
- * Insert the ire in b_cont. Returns false if it fails (due to lack of space).
+ * Get the ire in *mpp. Returns false if it fails (due to lack of space).
* Prefers dst_ire over src_ire.
*/
static boolean_t
-ip_bind_insert_ire(mblk_t *mp, ire_t *ire, iulp_t *ulp_info, ip_stack_t *ipst)
+ip_bind_get_ire_v4(mblk_t **mpp, ire_t *ire, iulp_t *ulp_info, ip_stack_t *ipst)
{
- mblk_t *mp1;
- ire_t *ret_ire = NULL;
+ mblk_t *mp = *mpp;
+ ire_t *ret_ire;
- mp1 = mp->b_cont;
- ASSERT(mp1 != NULL);
+ ASSERT(mp != NULL);
if (ire != NULL) {
/*
- * mp1 initialized above to IRE_DB_REQ_TYPE
+ * mp initialized above to IRE_DB_REQ_TYPE
* appended mblk. Its <upper protocol>'s
* job to make sure there is room.
*/
- if ((mp1->b_datap->db_lim - mp1->b_rptr) < sizeof (ire_t))
- return (0);
+ if ((mp->b_datap->db_lim - mp->b_rptr) < sizeof (ire_t))
+ return (B_FALSE);
- mp1->b_datap->db_type = IRE_DB_TYPE;
- mp1->b_wptr = mp1->b_rptr + sizeof (ire_t);
- bcopy(ire, mp1->b_rptr, sizeof (ire_t));
- ret_ire = (ire_t *)mp1->b_rptr;
+ mp->b_datap->db_type = IRE_DB_TYPE;
+ mp->b_wptr = mp->b_rptr + sizeof (ire_t);
+ bcopy(ire, mp->b_rptr, sizeof (ire_t));
+ ret_ire = (ire_t *)mp->b_rptr;
/*
* Pass the latest setting of the ip_path_mtu_discovery and
* copy the ulp info if any.
@@ -5242,16 +5301,15 @@ ip_bind_insert_ire(mblk_t *mp, ire_t *ire, iulp_t *ulp_info, ip_stack_t *ipst)
bcopy(ulp_info, &(ret_ire->ire_uinfo),
sizeof (iulp_t));
}
- ret_ire->ire_mp = mp1;
+ ret_ire->ire_mp = mp;
} else {
/*
* No IRE was found. Remove IRE mblk.
*/
- mp->b_cont = mp1->b_cont;
- freeb(mp1);
+ *mpp = mp->b_cont;
+ freeb(mp);
}
-
- return (1);
+ return (B_TRUE);
}
/*
@@ -5645,9 +5703,9 @@ ip_ddi_destroy(void)
{
tnet_fini();
- icmp_ddi_destroy();
- rts_ddi_destroy();
- udp_ddi_destroy();
+ icmp_ddi_g_destroy();
+ rts_ddi_g_destroy();
+ udp_ddi_g_destroy();
sctp_ddi_g_destroy();
tcp_ddi_g_destroy();
ipsec_policy_g_destroy();
@@ -5814,6 +5872,7 @@ ip_stack_fini(netstackid_t stackid, void *arg)
kmem_free(ipst->ips_ill_g_heads, sizeof (ill_g_head_t) * MAX_G_HEADS);
ipst->ips_ill_g_heads = NULL;
+ ldi_ident_release(ipst->ips_ldi_ident);
kmem_free(ipst, sizeof (*ipst));
}
@@ -5898,9 +5957,9 @@ ip_ddi_init(void)
tnet_init();
- udp_ddi_init();
- rts_ddi_init();
- icmp_ddi_init();
+ udp_ddi_g_init();
+ rts_ddi_g_init();
+ icmp_ddi_g_init();
}
/*
@@ -5912,6 +5971,7 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns)
ip_stack_t *ipst;
ipparam_t *pa;
ipndp_t *na;
+ major_t major;
#ifdef NS_DEBUG
printf("ip_stack_init(stack %d)\n", stackid);
@@ -6011,6 +6071,8 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns)
list_create(&ipst->ips_capab_taskq_list, sizeof (mblk_t),
offsetof(mblk_t, b_next));
+ major = mod_name_to_major(INET_NAME);
+ (void) ldi_ident_from_major(major, &ipst->ips_ldi_ident);
return (ipst);
}
@@ -6353,7 +6415,7 @@ ip_fanout_proto(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, uint_t flags,
}
}
- if (connp == NULL || connp->conn_upq == NULL) {
+ if (connp == NULL) {
/*
* No one bound to these addresses. Is
* there a client that wants all
@@ -6392,6 +6454,9 @@ ip_fanout_proto(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, uint_t flags,
}
return;
}
+
+ ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL);
+
CONN_INC_REF(connp);
first_connp = connp;
@@ -6415,7 +6480,7 @@ ip_fanout_proto(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, uint_t flags,
/*
* Copy the packet.
*/
- if (connp == NULL || connp->conn_upq == NULL ||
+ if (connp == NULL ||
(((first_mp1 = dupmsg(first_mp)) == NULL) &&
((first_mp1 = ip_copymsg(first_mp)) == NULL))) {
/*
@@ -6425,11 +6490,17 @@ ip_fanout_proto(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, uint_t flags,
connp = first_connp;
break;
}
+ ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL);
mp1 = mctl_present ? first_mp1->b_cont : first_mp1;
CONN_INC_REF(connp);
mutex_exit(&connfp->connf_lock);
rq = connp->conn_rq;
- if (!canputnext(rq)) {
+
+ /*
+ * Check flow control
+ */
+ if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) ||
+ (!IPCL_IS_NONSTR(connp) && !canputnext(rq))) {
if (flags & IP_FF_RAWIP) {
BUMP_MIB(mibptr, rawipIfStatsInOverflows);
} else {
@@ -6527,7 +6598,11 @@ ip_fanout_proto(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, uint_t flags,
}
rq = connp->conn_rq;
- if (!canputnext(rq)) {
+ /*
+ * Check flow control
+ */
+ if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) ||
+ (!IPCL_IS_NONSTR(connp) && !canputnext(rq))) {
if (flags & IP_FF_RAWIP) {
BUMP_MIB(mibptr, rawipIfStatsInOverflows);
} else {
@@ -6975,7 +7050,8 @@ ip_fanout_udp_conn(conn_t *connp, mblk_t *first_mp, mblk_t *mp,
else
first_mp = mp;
- if (CONN_UDP_FLOWCTLD(connp)) {
+ if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) ||
+ (!IPCL_IS_NONSTR(connp) && CONN_UDP_FLOWCTLD(connp))) {
BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows);
freemsg(first_mp);
return;
@@ -7166,9 +7242,12 @@ ip_fanout_udp(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha,
connp = connp->conn_next;
}
- if (connp == NULL || connp->conn_upq == NULL)
+ if (connp == NULL ||
+ !IPCL_IS_NONSTR(connp) && connp->conn_upq == NULL)
goto notfound;
+ ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL);
+
if (is_system_labeled() &&
!tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr,
connp))
@@ -7202,9 +7281,12 @@ ip_fanout_udp(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha,
connp = connp->conn_next;
}
- if (connp == NULL || connp->conn_upq == NULL)
+ if (connp == NULL ||
+ !IPCL_IS_NONSTR(connp) && connp->conn_upq == NULL)
goto notfound;
+ ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL);
+
first_connp = connp;
/*
* When SO_REUSEADDR is not set, send the packet only to the first
@@ -7321,7 +7403,8 @@ notfound:
connp))
connp = NULL;
- if (connp == NULL || connp->conn_upq == NULL) {
+ if (connp == NULL ||
+ !IPCL_IS_NONSTR(connp) && connp->conn_upq == NULL) {
/*
* No one bound to this port. Is
* there a client that wants all
@@ -7349,6 +7432,7 @@ notfound:
}
return;
}
+ ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL);
CONN_INC_REF(connp);
mutex_exit(&connfp->connf_lock);
@@ -7377,7 +7461,8 @@ notfound:
connp = connp->conn_next;
}
- if (connp == NULL || connp->conn_upq == NULL) {
+ if (connp == NULL ||
+ !IPCL_IS_NONSTR(connp) && connp->conn_upq == NULL) {
/*
* No one bound to this port. Is
* there a client that wants all
@@ -7406,6 +7491,7 @@ notfound:
}
return;
}
+ ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL);
first_connp = connp;
@@ -9774,6 +9860,15 @@ ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
return (ip_modopen(q, devp, flag, sflag, credp));
}
+ if ((flag & ~(FKLYR)) == IP_HELPER_STR) {
+ /*
+ * Non streams based socket looking for a stream
+ * to access IP
+ */
+ return (ip_helper_stream_setup(q, devp, flag, sflag,
+ credp, isv6));
+ }
+
ns = netstack_find_by_cred(credp);
ASSERT(ns != NULL);
ipst = ns->netstack_ip;
@@ -10344,7 +10439,7 @@ ip_opt_set_ipif(conn_t *connp, ipaddr_t addr, boolean_t checkonly, int option,
if (ipif == NULL) {
if (error == EINPROGRESS)
return (error);
- else if ((option == IP_MULTICAST_IF) ||
+ if ((option == IP_MULTICAST_IF) ||
(option == IP_NEXTHOP))
return (EHOSTUNREACH);
else
@@ -11611,7 +11706,6 @@ ip_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
}
return (-1);
}
-
/* Named Dispatch routine to get a current value out of our parameter table. */
/* ARGSUSED */
static int
@@ -12806,10 +12900,11 @@ ip_udp_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire,
if ((connp = ipcl_classify_v4(mp, IPPROTO_UDP, IP_SIMPLE_HDR_LENGTH,
ire->ire_zoneid, ipst)) != NULL) {
- ASSERT(connp->conn_upq != NULL);
+ ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL);
IP_STAT(ipst, ip_udp_fast_path);
- if (CONN_UDP_FLOWCTLD(connp)) {
+ if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) ||
+ (!IPCL_IS_NONSTR(connp) && CONN_UDP_FLOWCTLD(connp))) {
freemsg(mp);
BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows);
} else {
@@ -20373,11 +20468,9 @@ ip_trash_ire_reclaim_stack(ip_stack_t *ipst)
* upper level protocol. We remove this conn from any fanout hash list it is
* on, and zero out the bind information. No reply is expected up above.
*/
-mblk_t *
-ip_unbind(queue_t *q, mblk_t *mp)
+void
+ip_unbind(conn_t *connp)
{
- conn_t *connp = Q_TO_CONN(q);
-
ASSERT(!MUTEX_HELD(&connp->conn_lock));
if (is_system_labeled() && connp->conn_anon_port) {
@@ -20390,20 +20483,6 @@ ip_unbind(queue_t *q, mblk_t *mp)
ipcl_hash_remove(connp);
- ASSERT(mp->b_cont == NULL);
- /*
- * Convert mp into a T_OK_ACK
- */
- mp = mi_tpi_ok_ack_alloc(mp);
-
- /*
- * should not happen in practice... T_OK_ACK is smaller than the
- * original message.
- */
- if (mp == NULL)
- return (NULL);
-
- return (mp);
}
/*
@@ -20475,11 +20554,13 @@ ip_output_options(void *arg, mblk_t *mp, void *arg2, int caller,
ASSERT(connp != NULL);
zoneid = connp->conn_zoneid;
ipst = connp->conn_netstack->netstack_ip;
+ ASSERT(ipst != NULL);
/* is queue flow controlled? */
if ((q->q_first != NULL || connp->conn_draining) &&
(caller == IP_WPUT)) {
ASSERT(!need_decref);
+ ASSERT(!IP_FLOW_CONTROLLED_ULP(connp->conn_ulp));
(void) putq(q, mp);
return;
}
@@ -21514,7 +21595,6 @@ dontroute:
* connectivity.
*/
ipha->ipha_ttl = 1;
-
/* If suitable ipif not found, drop packet */
dst_ipif = ipif_lookup_onlink_addr(dst, zoneid, ipst);
if (dst_ipif == NULL) {
@@ -23244,6 +23324,7 @@ blocked:
* ip_wsrv will be scheduled or
* is already running.
*/
+
(void) putq(connp->conn_wq,
first_mp);
}
@@ -27522,26 +27603,6 @@ ip_ioctl_finish(queue_t *q, mblk_t *mp, int err, int mode, ipsq_t *ipsq)
ipsq_current_finish(ipsq);
}
-/*
- * This is called from ip_wput_nondata to resume a deferred TCP bind.
- */
-/* ARGSUSED */
-void
-ip_resume_tcp_bind(void *arg, mblk_t *mp, void *arg2)
-{
- conn_t *connp = arg;
- tcp_t *tcp;
-
- ASSERT(connp != NULL && IPCL_IS_TCP(connp) && connp->conn_tcp != NULL);
- tcp = connp->conn_tcp;
-
- if (connp->conn_tcp->tcp_state == TCPS_CLOSED)
- freemsg(mp);
- else
- tcp_rput_other(tcp, mp);
- CONN_OPER_PENDING_DONE(connp);
-}
-
/* Called from ip_wput for all non data messages */
/* ARGSUSED */
void
@@ -27782,8 +27843,9 @@ nak:
case M_PROTO:
case M_PCPROTO:
/*
- * The only PROTO messages we expect are ULP binds and
- * copies of option negotiation acknowledgements.
+ * The only PROTO messages we expect are copies of option
+ * negotiation acknowledgements and AH/ESP bind requests.
*/
switch (((union T_primitives *)mp->b_rptr)->type) {
case O_T_BIND_REQ:
@@ -27809,37 +27871,15 @@ nak:
mp = connp->conn_af_isv6 ? ip_bind_v6(q, mp,
connp, NULL) : ip_bind_v4(q, mp, connp);
- if (mp == NULL)
- return;
- if (IPCL_IS_TCP(connp)) {
- /*
- * In the case of TCP endpoint we
- * come here only for bind retries
- */
- ASSERT(ipsq != NULL);
- CONN_INC_REF(connp);
- SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
- ip_resume_tcp_bind, connp,
- SQ_FILL, SQTAG_BIND_RETRY);
- } else if (IPCL_IS_UDP(connp)) {
- /*
- * In the case of UDP endpoint we
- * come here only for bind retries
- */
- ASSERT(ipsq != NULL);
- udp_resume_bind(connp, mp);
- } else if (IPCL_IS_RAWIP(connp)) {
- /*
- * In the case of RAWIP endpoint we
- * come here only for bind retries
- */
- ASSERT(ipsq != NULL);
- rawip_resume_bind(connp, mp);
- } else {
- /* The case of AH and ESP */
- qreply(q, mp);
- CONN_OPER_PENDING_DONE(connp);
- }
+ ASSERT(mp != NULL);
+
+ ASSERT(!IPCL_IS_TCP(connp));
+ ASSERT(!IPCL_IS_UDP(connp));
+ ASSERT(!IPCL_IS_RAWIP(connp));
+
+ /* The case of AH and ESP */
+ qreply(q, mp);
+ CONN_OPER_PENDING_DONE(connp);
return;
}
case T_SVR4_OPTMGMT_REQ:
@@ -27908,7 +27948,8 @@ nak:
proto_str = "T_UNBIND_REQ";
goto protonak;
}
- mp = ip_unbind(q, mp);
+ ip_unbind(Q_TO_CONN(q));
+ mp = mi_tpi_ok_ack_alloc(mp);
qreply(q, mp);
return;
default:
@@ -28582,6 +28623,11 @@ conn_drain_insert(conn_t *connp)
head->conn_drain_prev->conn_drain_next = connp;
head->conn_drain_prev = connp;
}
+ /*
+ * For non streams based sockets assert flow control. The upcall
+ * is only made for IPCL_IS_NONSTR() conns, mirroring the guard used
+ * when flow control is opened up again in conn_drain_tail().
+ */
+ if (IPCL_IS_NONSTR(connp)) {
+ (*connp->conn_upcalls->su_txq_full)
+ (connp->conn_upper_handle, B_TRUE);
+ }
mutex_exit(CONN_DRAIN_LIST_LOCK(connp));
}
@@ -28695,7 +28741,16 @@ conn_drain_tail(conn_t *connp, boolean_t closing)
}
connp->conn_drain_next = NULL;
connp->conn_drain_prev = NULL;
+
+ /*
+ * For non streams based sockets open up flow control.
+ */
+ if (IPCL_IS_NONSTR(connp)) {
+ (*connp->conn_upcalls->su_txq_full)
+ (connp->conn_upper_handle, B_FALSE);
+ }
}
+
mutex_exit(CONN_DRAIN_LIST_LOCK(connp));
}
@@ -28779,6 +28834,7 @@ ip_wsrv(queue_t *q)
*/
connp->conn_draining = 0;
enableok(q);
+
}
/* Enable the next conn for draining */
@@ -28941,7 +28997,7 @@ ip_conn_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr)
"CONN " MI_COL_HDRPAD_STR
"rfq " MI_COL_HDRPAD_STR
"stq " MI_COL_HDRPAD_STR
- " zone local remote");
+ " zone local remote");
/*
* Because of the ndd constraint, at most we can have 64K buffer
@@ -29339,7 +29395,6 @@ ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t, ipaddr_t, ipaddr_t,
return (or->or_private == CGTP_MCAST_SUCCESS ? 0 : error);
}
-
/*
* Issue a warning regarding a route crossing an interface with an
* incorrect MTU. Only one message every 'ip_multirt_log_interval'
diff --git a/usr/src/uts/common/inet/ip/ip6.c b/usr/src/uts/common/inet/ip/ip6.c
index a1d97627b2..fe326778c2 100644
--- a/usr/src/uts/common/inet/ip/ip6.c
+++ b/usr/src/uts/common/inet/ip/ip6.c
@@ -191,13 +191,15 @@ static void icmp_inbound_too_big_v6(queue_t *, mblk_t *, ill_t *ill,
static void icmp_pkt_v6(queue_t *, mblk_t *, void *, size_t,
const in6_addr_t *, boolean_t, zoneid_t, ip_stack_t *);
static void icmp_redirect_v6(queue_t *, mblk_t *, ill_t *ill);
-static int ip_bind_connected_v6(conn_t *, mblk_t *, in6_addr_t *,
+static int ip_bind_connected_v6(conn_t *, mblk_t **, uint8_t, in6_addr_t *,
uint16_t, const in6_addr_t *, ip6_pkt_t *, uint16_t,
- boolean_t, boolean_t, boolean_t, boolean_t);
-static boolean_t ip_bind_insert_ire_v6(mblk_t *, ire_t *, const in6_addr_t *,
+ boolean_t, boolean_t);
+static boolean_t ip_bind_get_ire_v6(mblk_t **, ire_t *, const in6_addr_t *,
iulp_t *, ip_stack_t *);
-static int ip_bind_laddr_v6(conn_t *, mblk_t *, const in6_addr_t *,
- uint16_t, boolean_t, boolean_t, boolean_t);
+static void ip_bind_post_handling_v6(conn_t *, mblk_t *, boolean_t,
+ boolean_t, ip_stack_t *);
+static int ip_bind_laddr_v6(conn_t *, mblk_t **, uint8_t,
+ const in6_addr_t *, uint16_t, boolean_t);
static void ip_fanout_proto_v6(queue_t *, mblk_t *, ip6_t *, ill_t *,
ill_t *, uint8_t, uint_t, uint_t, boolean_t, zoneid_t);
static void ip_fanout_tcp_v6(queue_t *, mblk_t *, ip6_t *, ill_t *,
@@ -2071,12 +2073,8 @@ ip_bind_v6(queue_t *q, mblk_t *mp, conn_t *connp, ip6_pkt_t *ipp)
uint16_t lport;
uint16_t fport;
uchar_t *ucp;
- mblk_t *mp1;
- boolean_t ire_requested;
- boolean_t ipsec_policy_set;
int error = 0;
boolean_t local_bind;
- boolean_t orig_pkt_isv6 = connp->conn_pkt_isv6;
ipa6_conn_x_t *acx6;
boolean_t verify_dst;
ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
@@ -2145,9 +2143,6 @@ ip_bind_v6(queue_t *q, mblk_t *mp, conn_t *connp, ip6_pkt_t *ipp)
ip1dbg(("ip_bind_v6: unaligned address\n"));
goto bad_addr;
}
- mp1 = mp->b_cont; /* trailing mp if any */
- ire_requested = (mp1 && mp1->b_datap->db_type == IRE_DB_REQ_TYPE);
- ipsec_policy_set = (mp1 && mp1->b_datap->db_type == IPSEC_POLICY_SET);
switch (tbr->ADDR_length) {
default:
@@ -2173,9 +2168,6 @@ ip_bind_v6(queue_t *q, mblk_t *mp, conn_t *connp, ip6_pkt_t *ipp)
/*
* Verify that both the source and destination addresses
* are valid.
- * Note that we allow connect to broadcast and multicast
- * addresses when ire_requested is set. Thus the ULP
- * has to check for IRE_BROADCAST and multicast.
*/
ac6 = (ipa6_conn_t *)ucp;
v6srcp = &ac6->ac6_laddr;
@@ -2192,9 +2184,6 @@ ip_bind_v6(queue_t *q, mblk_t *mp, conn_t *connp, ip6_pkt_t *ipp)
case sizeof (ipa6_conn_x_t):
/*
* Verify that the source address is valid.
- * Note that we allow connect to broadcast and multicast
- * addresses when ire_requested is set. Thus the ULP
- * has to check for IRE_BROADCAST and multicast.
*/
acx6 = (ipa6_conn_x_t *)ucp;
ac6 = &acx6->ac6x_conn;
@@ -2211,80 +2200,35 @@ ip_bind_v6(queue_t *q, mblk_t *mp, conn_t *connp, ip6_pkt_t *ipp)
break;
}
if (local_bind) {
- if (IN6_IS_ADDR_V4MAPPED(v6srcp) && !connp->conn_ipv6_v6only) {
- /* Bind to IPv4 address */
- ipaddr_t v4src;
-
- IN6_V4MAPPED_TO_IPADDR(v6srcp, v4src);
-
- error = ip_bind_laddr(connp, mp, v4src, lport,
- ire_requested, ipsec_policy_set,
- tbr->ADDR_length != IPV6_ADDR_LEN);
- if (error != 0)
- goto bad_addr;
- connp->conn_pkt_isv6 = B_FALSE;
- } else {
- if (IN6_IS_ADDR_V4MAPPED(v6srcp)) {
- error = 0;
- goto bad_addr;
- }
- error = ip_bind_laddr_v6(connp, mp, v6srcp, lport,
- ire_requested, ipsec_policy_set,
- (tbr->ADDR_length != IPV6_ADDR_LEN));
- if (error != 0)
- goto bad_addr;
- connp->conn_pkt_isv6 = B_TRUE;
- }
+ error = ip_proto_bind_laddr_v6(connp, &mp->b_cont, protocol,
+ v6srcp, lport, tbr->ADDR_length != IPV6_ADDR_LEN);
} else {
- /*
- * Bind to local and remote address. Local might be
- * unspecified in which case it will be extracted from
- * ire_src_addr_v6
- */
- if (IN6_IS_ADDR_V4MAPPED(v6dstp) && !connp->conn_ipv6_v6only) {
- /* Connect to IPv4 address */
- ipaddr_t v4src;
- ipaddr_t v4dst;
-
- /* Is the source unspecified or mapped? */
- if (!IN6_IS_ADDR_V4MAPPED(v6srcp) &&
- !IN6_IS_ADDR_UNSPECIFIED(v6srcp)) {
- ip1dbg(("ip_bind_v6: "
- "dst is mapped, but not the src\n"));
- goto bad_addr;
- }
- IN6_V4MAPPED_TO_IPADDR(v6srcp, v4src);
- IN6_V4MAPPED_TO_IPADDR(v6dstp, v4dst);
-
- /*
- * XXX Fix needed. Need to pass ipsec_policy_set
- * instead of B_FALSE.
- */
+ error = ip_proto_bind_connected_v6(connp, &mp->b_cont, protocol,
+ v6srcp, lport, v6dstp, ipp, fport, B_TRUE, verify_dst);
+ }
- /* Always verify destination reachability. */
- error = ip_bind_connected(connp, mp, &v4src, lport,
- v4dst, fport, ire_requested, ipsec_policy_set,
- B_TRUE, B_TRUE);
- if (error != 0)
- goto bad_addr;
- IN6_IPADDR_TO_V4MAPPED(v4src, v6srcp);
- connp->conn_pkt_isv6 = B_FALSE;
- } else if (IN6_IS_ADDR_V4MAPPED(v6srcp)) {
- ip1dbg(("ip_bind_v6: "
- "src is mapped, but not the dst\n"));
- goto bad_addr;
- } else {
- error = ip_bind_connected_v6(connp, mp, v6srcp,
- lport, v6dstp, ipp, fport, ire_requested,
- ipsec_policy_set, B_TRUE, verify_dst);
- if (error != 0)
- goto bad_addr;
- connp->conn_pkt_isv6 = B_TRUE;
- }
+ if (error == 0) {
+ /* Send it home. */
+ mp->b_datap->db_type = M_PCPROTO;
+ tbr->PRIM_type = T_BIND_ACK;
+ return (mp);
}
+bad_addr:
+ ASSERT(error != EINPROGRESS);
+ if (error > 0)
+ mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error);
+ else
+ mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0);
+ return (mp);
+}
+
+static void
+ip_bind_post_handling_v6(conn_t *connp, mblk_t *mp,
+ boolean_t version_changed, boolean_t ire_requested, ip_stack_t *ipst)
+{
/* Update conn_send and pktversion if v4/v6 changed */
- if (orig_pkt_isv6 != connp->conn_pkt_isv6) {
+ if (version_changed) {
ip_setpktversion(connp, connp->conn_pkt_isv6, B_TRUE, ipst);
}
/*
@@ -2293,27 +2237,12 @@ ip_bind_v6(queue_t *q, mblk_t *mp, conn_t *connp, ip6_pkt_t *ipp)
* may not have been inherited at that point in time and hence
* conn_out_enforce_policy may not be set.
*/
- mp1 = mp->b_cont;
if (ire_requested && connp->conn_out_enforce_policy &&
- mp1 != NULL && DB_TYPE(mp1) == IRE_DB_REQ_TYPE) {
- ire_t *ire = (ire_t *)mp1->b_rptr;
- ASSERT(MBLKL(mp1) >= sizeof (ire_t));
+ mp != NULL && DB_TYPE(mp) == IRE_DB_REQ_TYPE) {
+ ire_t *ire = (ire_t *)mp->b_rptr;
+ ASSERT(MBLKL(mp) >= sizeof (ire_t));
ire->ire_ipsec_overhead = (conn_ipsec_length(connp));
}
-
- /* Send it home. */
- mp->b_datap->db_type = M_PCPROTO;
- tbr->PRIM_type = T_BIND_ACK;
- return (mp);
-
-bad_addr:
- if (error == EINPROGRESS)
- return (NULL);
- if (error > 0)
- mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error);
- else
- mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0);
- return (mp);
}
/*
@@ -2339,20 +2268,27 @@ bad_addr:
* When the address is loopback or multicast, there might be many matching IREs
* so bind has to look up based on the zone.
*/
+/*
+ * Verify the local IP address. Does not change the conn_t except
+ * conn_fully_bound and conn_policy_cached.
+ */
static int
-ip_bind_laddr_v6(conn_t *connp, mblk_t *mp, const in6_addr_t *v6src,
- uint16_t lport, boolean_t ire_requested, boolean_t ipsec_policy_set,
- boolean_t fanout_insert)
+ip_bind_laddr_v6(conn_t *connp, mblk_t **mpp, uint8_t protocol,
+ const in6_addr_t *v6src, uint16_t lport, boolean_t fanout_insert)
{
int error = 0;
ire_t *src_ire = NULL;
- ipif_t *ipif = NULL;
- mblk_t *policy_mp;
zoneid_t zoneid;
+ mblk_t *mp = NULL;
+ boolean_t ire_requested;
+ boolean_t ipsec_policy_set;
ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
- if (ipsec_policy_set)
- policy_mp = mp->b_cont;
+ if (mpp)
+ mp = *mpp;
+
+ ire_requested = (mp != NULL && DB_TYPE(mp) == IRE_DB_REQ_TYPE);
+ ipsec_policy_set = (mp != NULL && DB_TYPE(mp) == IPSEC_POLICY_SET);
/*
* If it was previously connected, conn_fully_bound would have
@@ -2372,11 +2308,11 @@ ip_bind_laddr_v6(conn_t *connp, mblk_t *mp, const in6_addr_t *v6src,
* readability compared to a condition check.
*/
ASSERT(src_ire == NULL || !(src_ire->ire_type & IRE_BROADCAST));
+ /* LINTED - statement has no consequent */
if (IRE_IS_LOCAL(src_ire)) {
/*
* (2) Bind to address of local UP interface
*/
- ipif = src_ire->ire_ipif;
} else if (IN6_IS_ADDR_MULTICAST(v6src)) {
ipif_t *multi_ipif = NULL;
ire_t *save_ire;
@@ -2418,28 +2354,12 @@ ip_bind_laddr_v6(conn_t *connp, mblk_t *mp, const in6_addr_t *v6src,
if (multi_ipif != NULL)
ipif_refrele(multi_ipif);
} else {
- *mp->b_wptr++ = (char)connp->conn_ulp;
- ipif = ipif_lookup_addr_v6(v6src, NULL, zoneid,
- CONNP_TO_WQ(connp), mp, ip_wput_nondata, &error,
- ipst);
- if (ipif == NULL) {
- if (error == EINPROGRESS) {
- if (src_ire != NULL)
- ire_refrele(src_ire);
- return (error);
- }
+ if (!ip_addr_exists_v6(v6src, zoneid, ipst)) {
/*
* Not a valid address for bind
*/
error = EADDRNOTAVAIL;
- } else {
- ipif_refrele(ipif);
}
- /*
- * Just to keep it consistent with the processing in
- * ip_bind_v6().
- */
- mp->b_wptr--;
}
if (error != 0) {
@@ -2471,17 +2391,18 @@ ip_bind_laddr_v6(conn_t *connp, mblk_t *mp, const in6_addr_t *v6src,
connp->conn_remv6 = ipv6_all_zeros;
connp->conn_lport = lport;
connp->conn_fport = 0;
- error = ipcl_bind_insert_v6(connp, *mp->b_wptr, v6src, lport);
+ error = ipcl_bind_insert_v6(connp, protocol, v6src, lport);
}
if (error == 0) {
if (ire_requested) {
- if (!ip_bind_insert_ire_v6(mp, src_ire, v6src, NULL,
+ if (!ip_bind_get_ire_v6(mpp, src_ire, v6src, NULL,
ipst)) {
error = -1;
goto bad_addr;
}
+ mp = *mpp;
} else if (ipsec_policy_set) {
- if (!ip_bind_ipsec_policy_set(connp, policy_mp)) {
+ if (!ip_bind_ipsec_policy_set(connp, mp)) {
error = -1;
goto bad_addr;
}
@@ -2501,54 +2422,70 @@ bad_addr:
ire_refrele(src_ire);
if (ipsec_policy_set) {
- ASSERT(policy_mp != NULL);
- freeb(policy_mp);
+ ASSERT(mp != NULL);
+ freeb(mp);
/*
* As of now assume that nothing else accompanies
* IPSEC_POLICY_SET.
*/
- mp->b_cont = NULL;
+ *mpp = NULL;
}
+
return (error);
}
-
-/* ARGSUSED */
-static void
-ip_bind_connected_resume_v6(ipsq_t *ipsq, queue_t *q, mblk_t *mp,
- void *dummy_arg)
+/*
+ * Bind an AF_INET6 conn to a local address on behalf of the upper layer
+ * protocol `protocol'. mpp may be NULL; when non-NULL, *mpp is the optional
+ * trailing mblk (IRE_DB_REQ_TYPE or IPSEC_POLICY_SET) handed to the helpers.
+ *
+ * A v4-mapped source is passed to ip_bind_laddr_v4() unless the conn is
+ * marked conn_ipv6_v6only, in which case it is rejected. Any other source
+ * is passed to ip_bind_laddr_v6(). On success conn_pkt_isv6 is updated and
+ * ip_bind_post_handling_v6() is run.
+ *
+ * Returns 0 on success, a positive errno value, or -TBADADDR when the
+ * address is not acceptable.
+ */
+int
+ip_proto_bind_laddr_v6(conn_t *connp, mblk_t **mpp, uint8_t protocol,
+ const in6_addr_t *v6srcp, uint16_t lport, boolean_t fanout_insert)
{
- conn_t *connp = NULL;
- t_scalar_t prim;
+ int error;
+ boolean_t ire_requested;
+ mblk_t *mp = NULL;
+ boolean_t orig_pkt_isv6 = connp->conn_pkt_isv6;
+ ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
- ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
+ /*
+ * Note that we allow connect to broadcast and multicast
+ * address when ire_requested is set. Thus the ULP
+ * has to check for IRE_BROADCAST and multicast.
+ */
+ if (mpp)
+ mp = *mpp;
+ ire_requested = (mp && DB_TYPE(mp) == IRE_DB_REQ_TYPE);
- if (CONN_Q(q))
- connp = Q_TO_CONN(q);
- ASSERT(connp != NULL);
+ ASSERT(connp->conn_af_isv6);
+ connp->conn_ulp = protocol;
- prim = ((union T_primitives *)mp->b_rptr)->type;
- ASSERT(prim == O_T_BIND_REQ || prim == T_BIND_REQ);
+ if (IN6_IS_ADDR_V4MAPPED(v6srcp) && !connp->conn_ipv6_v6only) {
+ /* Bind to IPv4 address */
+ ipaddr_t v4src;
- if (IPCL_IS_TCP(connp)) {
- /* Pass sticky_ipp for scope_id and pktinfo */
- mp = ip_bind_v6(q, mp, connp, &connp->conn_tcp->tcp_sticky_ipp);
+ IN6_V4MAPPED_TO_IPADDR(v6srcp, v4src);
+
+ error = ip_bind_laddr_v4(connp, mpp, protocol, v4src, lport,
+ fanout_insert);
+ if (error != 0)
+ goto bad_addr;
+ connp->conn_pkt_isv6 = B_FALSE;
} else {
- /* For UDP and ICMP */
- mp = ip_bind_v6(q, mp, connp, NULL);
- }
- if (mp != NULL) {
- if (IPCL_IS_TCP(connp)) {
- CONN_INC_REF(connp);
- SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
- ip_resume_tcp_bind, connp, SQ_FILL,
- SQTAG_TCP_RPUTOTHER);
- } else if (IPCL_IS_UDP(connp)) {
- udp_resume_bind(connp, mp);
- } else {
- ASSERT(IPCL_IS_RAWIP(connp));
- rawip_resume_bind(connp, mp);
+ if (IN6_IS_ADDR_V4MAPPED(v6srcp)) {
+ /* v4-mapped source on a v6-only conn: bad address */
+ error = 0;
+ goto bad_addr;
}
+ error = ip_bind_laddr_v6(connp, mpp, protocol, v6srcp,
+ lport, fanout_insert);
+ if (error != 0)
+ goto bad_addr;
+ connp->conn_pkt_isv6 = B_TRUE;
}
+
+ ip_bind_post_handling_v6(connp, mpp ? *mpp : NULL,
+ orig_pkt_isv6 != connp->conn_pkt_isv6, ire_requested, ipst);
+ return (0);
+
+bad_addr:
+ /*
+ * A negative error from the helpers and the error == 0 case
+ * (rejected v4-mapped source) both mean "bad address". Never
+ * return 0 from here, otherwise the caller would acknowledge a
+ * bind that was not performed; ip_proto_bind_connected_v6() maps
+ * error == 0 to -TBADADDR in the same way.
+ */
+ if (error <= 0)
+ error = -TBADADDR;
+ return (error);
}
/*
@@ -2562,42 +2499,43 @@ ip_bind_connected_resume_v6(ipsq_t *ipsq, queue_t *q, mblk_t *mp,
* non-TCP cases, it is NULL and for all other tcp cases it is not useful.
*
*/
-static int
-ip_bind_connected_v6(conn_t *connp, mblk_t *mp, in6_addr_t *v6src,
- uint16_t lport, const in6_addr_t *v6dst, ip6_pkt_t *ipp, uint16_t fport,
- boolean_t ire_requested, boolean_t ipsec_policy_set,
- boolean_t fanout_insert, boolean_t verify_dst)
+int
+ip_bind_connected_v6(conn_t *connp, mblk_t **mpp, uint8_t protocol,
+ in6_addr_t *v6src, uint16_t lport, const in6_addr_t *v6dst,
+ ip6_pkt_t *ipp, uint16_t fport, boolean_t fanout_insert,
+ boolean_t verify_dst)
{
ire_t *src_ire;
ire_t *dst_ire;
int error = 0;
- int protocol;
- mblk_t *policy_mp;
ire_t *sire = NULL;
ire_t *md_dst_ire = NULL;
ill_t *md_ill = NULL;
ill_t *dst_ill = NULL;
ipif_t *src_ipif = NULL;
zoneid_t zoneid;
- boolean_t ill_held = B_FALSE;
+ boolean_t ill_held = B_FALSE;
+ mblk_t *mp = NULL;
+ boolean_t ire_requested = B_FALSE;
+ boolean_t ipsec_policy_set = B_FALSE;
ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
+ ts_label_t *tsl = NULL;
- src_ire = dst_ire = NULL;
- /*
- * NOTE: The protocol is beyond the wptr because that's how
- * the undocumented transport<-->IP T_BIND_REQ behavior works.
- */
- protocol = *mp->b_wptr & 0xFF;
+ if (mpp)
+ mp = *mpp;
+
+ if (mp != NULL) {
+ ire_requested = (DB_TYPE(mp) == IRE_DB_REQ_TYPE);
+ ipsec_policy_set = (DB_TYPE(mp) == IPSEC_POLICY_SET);
+ tsl = MBLK_GETLABEL(mp);
+ }
+ src_ire = dst_ire = NULL;
/*
* If we never got a disconnect before, clear it now.
*/
connp->conn_fully_bound = B_FALSE;
- if (ipsec_policy_set) {
- policy_mp = mp->b_cont;
- }
-
zoneid = connp->conn_zoneid;
if (IN6_IS_ADDR_MULTICAST(v6dst)) {
@@ -2620,7 +2558,7 @@ ip_bind_connected_v6(conn_t *connp, mblk_t *mp, in6_addr_t *v6src,
ipif = ipif_lookup_group_v6(v6dst, zoneid, ipst);
}
mutex_exit(&connp->conn_lock);
- if (ipif == NULL || !ire_requested ||
+ if (ipif == NULL || ire_requested ||
(dst_ire = ipif_to_ire_v6(ipif)) == NULL) {
if (ipif != NULL)
ipif_refrele(ipif);
@@ -2637,7 +2575,7 @@ ip_bind_connected_v6(conn_t *connp, mblk_t *mp, in6_addr_t *v6src,
ipif_refrele(ipif);
} else {
dst_ire = ire_route_lookup_v6(v6dst, NULL, NULL, 0,
- NULL, &sire, zoneid, MBLK_GETLABEL(mp),
+ NULL, &sire, zoneid, tsl,
MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
MATCH_IRE_PARENT | MATCH_IRE_RJ_BHOLE | MATCH_IRE_SECATTR,
ipst);
@@ -2693,8 +2631,8 @@ ip_bind_connected_v6(conn_t *connp, mblk_t *mp, in6_addr_t *v6src,
*/
if (dst_ire != NULL && is_system_labeled() &&
!IPCL_IS_TCP(connp) &&
- tsol_compute_label_v6(DB_CREDDEF(mp, connp->conn_cred), v6dst, NULL,
- connp->conn_mac_exempt, ipst) != 0) {
+ tsol_compute_label_v6(DB_CREDDEF(mp, connp->conn_cred),
+ v6dst, NULL, connp->conn_mac_exempt, ipst) != 0) {
error = EHOSTUNREACH;
if (ip_debug > 2) {
pr_addr_dbg("ip_bind_connected: no label for dst %s\n",
@@ -2831,25 +2769,24 @@ ip_bind_connected_v6(conn_t *connp, mblk_t *mp, in6_addr_t *v6src,
/* No need to hold ill here */
dst_ill = dst_ire->ire_ipif->ipif_ill;
}
- if (!ip6_asp_can_lookup(ipst)) {
- *mp->b_wptr++ = (char)protocol;
- ip6_asp_pending_op(CONNP_TO_WQ(connp), mp,
- ip_bind_connected_resume_v6);
- error = EINPROGRESS;
- goto refrele_and_quit;
- }
- src_ipif = ipif_select_source_v6(dst_ill, v6dst,
- RESTRICT_TO_NONE, connp->conn_src_preferences,
- zoneid);
- ip6_asp_table_refrele(ipst);
- if (src_ipif == NULL) {
- pr_addr_dbg("ip_bind_connected_v6: "
- "no usable source address for "
- "connection to %s\n", AF_INET6, v6dst);
+ if (ip6_asp_can_lookup(ipst)) {
+ src_ipif = ipif_select_source_v6(dst_ill,
+ v6dst, RESTRICT_TO_NONE,
+ connp->conn_src_preferences, zoneid);
+ ip6_asp_table_refrele(ipst);
+ if (src_ipif == NULL) {
+ pr_addr_dbg("ip_bind_connected_v6: "
+ "no usable source address for "
+ "connection to %s\n",
+ AF_INET6, v6dst);
+ error = EADDRNOTAVAIL;
+ goto bad_addr;
+ }
+ *v6src = src_ipif->ipif_v6lcl_addr;
+ } else {
error = EADDRNOTAVAIL;
goto bad_addr;
}
- *v6src = src_ipif->ipif_v6lcl_addr;
}
}
@@ -2922,13 +2859,13 @@ ip_bind_connected_v6(conn_t *connp, mblk_t *mp, in6_addr_t *v6src,
if (sire != NULL)
ulp_info = &(sire->ire_uinfo);
- if (!ip_bind_insert_ire_v6(mp, dst_ire, v6dst, ulp_info,
+ if (!ip_bind_get_ire_v6(mpp, dst_ire, v6dst, ulp_info,
ipst)) {
error = -1;
goto bad_addr;
}
} else if (ipsec_policy_set) {
- if (!ip_bind_ipsec_policy_set(connp, policy_mp)) {
+ if (!ip_bind_ipsec_policy_set(connp, mp)) {
error = -1;
goto bad_addr;
}
@@ -2982,19 +2919,24 @@ ip_bind_connected_v6(conn_t *connp, mblk_t *mp, in6_addr_t *v6src,
ASSERT(md_ill != NULL);
ASSERT(md_ill->ill_mdt_capab != NULL);
if ((mdinfo_mp = ip_mdinfo_return(md_dst_ire, connp,
- md_ill->ill_name, md_ill->ill_mdt_capab)) != NULL)
- linkb(mp, mdinfo_mp);
+ md_ill->ill_name, md_ill->ill_mdt_capab)) != NULL) {
+ if (mp == NULL) {
+ *mpp = mdinfo_mp;
+ } else {
+ linkb(mp, mdinfo_mp);
+ }
+ }
}
}
bad_addr:
if (ipsec_policy_set) {
- ASSERT(policy_mp != NULL);
- freeb(policy_mp);
+ ASSERT(mp != NULL);
+ freeb(mp);
/*
* As of now assume that nothing else accompanies
* IPSEC_POLICY_SET.
*/
- mp->b_cont = NULL;
+ *mpp = NULL;
}
refrele_and_quit:
if (src_ire != NULL)
@@ -3012,34 +2954,110 @@ refrele_and_quit:
return (error);
}
+/*
+ * Connected bind (local and remote address) for an AF_INET6 conn on behalf
+ * of the upper layer protocol `protocol'. The local address may be
+ * unspecified, in which case the chosen source is written back through
+ * v6srcp.
+ *
+ * A v4-mapped destination on a conn that is not conn_ipv6_v6only is handed
+ * to ip_bind_connected_v4(); the source must then be v4-mapped or
+ * unspecified. Mixing a v4-mapped source with a non-mapped destination is
+ * rejected. Everything else goes to ip_bind_connected_v6().
+ *
+ * Returns 0 on success, -TBADADDR for an unacceptable address combination,
+ * or whatever non-zero error the underlying bind routine reported.
+ * fanout_insert is not consulted; the underlying routines are always asked
+ * to insert into the fanout.
+ */
+/* ARGSUSED */
+int
+ip_proto_bind_connected_v6(conn_t *connp, mblk_t **mpp, uint8_t protocol,
+    in6_addr_t *v6srcp, uint16_t lport, const in6_addr_t *v6dstp,
+    ip6_pkt_t *ipp, uint16_t fport, boolean_t fanout_insert,
+    boolean_t verify_dst)
+{
+	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
+	boolean_t	was_v6 = connp->conn_pkt_isv6;
+	boolean_t	want_ire;
+	int		rc;
+
+	/*
+	 * Connecting to broadcast and multicast addresses is allowed when
+	 * an IRE was requested, so the ULP has to check for IRE_BROADCAST
+	 * and multicast itself. Sample the request type now, before the
+	 * bind routines get a chance to replace *mpp.
+	 */
+	ASSERT(mpp != NULL);
+	want_ire = (*mpp != NULL && DB_TYPE(*mpp) == IRE_DB_REQ_TYPE);
+
+	ASSERT(connp->conn_af_isv6);
+	connp->conn_ulp = protocol;
+
+	/* For raw socket, the local port is not set. */
+	if (lport == 0)
+		lport = connp->conn_lport;
+
+	if (IN6_IS_ADDR_V4MAPPED(v6dstp) && !connp->conn_ipv6_v6only) {
+		/* Connect to IPv4 address */
+		ipaddr_t	v4src;
+		ipaddr_t	v4dst;
+
+		/* The source must be unspecified or mapped as well. */
+		if (!IN6_IS_ADDR_V4MAPPED(v6srcp) &&
+		    !IN6_IS_ADDR_UNSPECIFIED(v6srcp)) {
+			ip1dbg(("ip_proto_bind_connected_v6: "
+			    "dst is mapped, but not the src\n"));
+			return (-TBADADDR);
+		}
+		IN6_V4MAPPED_TO_IPADDR(v6srcp, v4src);
+		IN6_V4MAPPED_TO_IPADDR(v6dstp, v4dst);
+
+		/* Always verify destination reachability. */
+		rc = ip_bind_connected_v4(connp, mpp, protocol, &v4src,
+		    lport, v4dst, fport, B_TRUE, B_TRUE);
+		if (rc != 0)
+			return (rc);
+
+		/* Hand the (possibly selected) source back as v4-mapped. */
+		IN6_IPADDR_TO_V4MAPPED(v4src, v6srcp);
+		connp->conn_pkt_isv6 = B_FALSE;
+	} else if (IN6_IS_ADDR_V4MAPPED(v6srcp)) {
+		ip1dbg(("ip_proto_bind_connected_v6: "
+		    "src is mapped, but not the dst\n"));
+		return (-TBADADDR);
+	} else {
+		rc = ip_bind_connected_v6(connp, mpp, protocol, v6srcp,
+		    lport, v6dstp, ipp, fport, B_TRUE, verify_dst);
+		if (rc != 0)
+			return (rc);
+		connp->conn_pkt_isv6 = B_TRUE;
+	}
+
+	ip_bind_post_handling_v6(connp, *mpp,
+	    was_v6 != connp->conn_pkt_isv6, want_ire, ipst);
+
+	/* Send it home. */
+	return (0);
+}
+
/*
- * Insert the ire in b_cont. Returns false if it fails (due to lack of space).
+ * Get the ire in *mpp. Returns false if it fails (due to lack of space).
* Makes the IRE be IRE_BROADCAST if dst is a multicast address.
*/
/* ARGSUSED4 */
static boolean_t
-ip_bind_insert_ire_v6(mblk_t *mp, ire_t *ire, const in6_addr_t *dst,
+ip_bind_get_ire_v6(mblk_t **mpp, ire_t *ire, const in6_addr_t *dst,
iulp_t *ulp_info, ip_stack_t *ipst)
{
- mblk_t *mp1;
+ mblk_t *mp = *mpp;
ire_t *ret_ire;
- mp1 = mp->b_cont;
- ASSERT(mp1 != NULL);
+ ASSERT(mp != NULL);
if (ire != NULL) {
/*
- * mp1 initialized above to IRE_DB_REQ_TYPE
+ * mp initialized above to IRE_DB_REQ_TYPE
* appended mblk. Its <upper protocol>'s
* job to make sure there is room.
*/
- if ((mp1->b_datap->db_lim - mp1->b_rptr) < sizeof (ire_t))
+ if ((mp->b_datap->db_lim - mp->b_rptr) < sizeof (ire_t))
return (B_FALSE);
- mp1->b_datap->db_type = IRE_DB_TYPE;
- mp1->b_wptr = mp1->b_rptr + sizeof (ire_t);
- bcopy(ire, mp1->b_rptr, sizeof (ire_t));
- ret_ire = (ire_t *)mp1->b_rptr;
+ mp->b_datap->db_type = IRE_DB_TYPE;
+ mp->b_wptr = mp->b_rptr + sizeof (ire_t);
+ bcopy(ire, mp->b_rptr, sizeof (ire_t));
+ ret_ire = (ire_t *)mp->b_rptr;
if (IN6_IS_ADDR_MULTICAST(dst) ||
IN6_IS_ADDR_V4MAPPED_CLASSD(dst)) {
ret_ire->ire_type = IRE_BROADCAST;
@@ -3049,13 +3067,13 @@ ip_bind_insert_ire_v6(mblk_t *mp, ire_t *ire, const in6_addr_t *dst,
bcopy(ulp_info, &(ret_ire->ire_uinfo),
sizeof (iulp_t));
}
- ret_ire->ire_mp = mp1;
+ ret_ire->ire_mp = mp;
} else {
/*
* No IRE was found. Remove IRE mblk.
*/
- mp->b_cont = mp1->b_cont;
- freeb(mp1);
+ *mpp = mp->b_cont;
+ freeb(mp);
}
return (B_TRUE);
}
@@ -3168,7 +3186,7 @@ ip_fanout_proto_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ill_t *ill,
break;
}
- if (connp == NULL || connp->conn_upq == NULL) {
+ if (connp == NULL) {
/*
* No one bound to this port. Is
* there a client that wants all
@@ -3184,6 +3202,8 @@ ip_fanout_proto_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ill_t *ill,
return;
}
+ ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL);
+
CONN_INC_REF(connp);
first_connp = connp;
@@ -3217,7 +3237,7 @@ ip_fanout_proto_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ill_t *ill,
* needed just for verifying policy and it is never
* sent up.
*/
- if (connp == NULL || connp->conn_upq == NULL ||
+ if (connp == NULL ||
(((first_mp1 = dupmsg(first_mp)) == NULL) &&
((first_mp1 = ip_copymsg(first_mp)) == NULL))) {
/*
@@ -3227,6 +3247,7 @@ ip_fanout_proto_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ill_t *ill,
connp = first_connp;
break;
}
+ ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL);
mp1 = mctl_present ? first_mp1->b_cont : first_mp1;
CONN_INC_REF(connp);
mutex_exit(&connfp->connf_lock);
@@ -3243,7 +3264,9 @@ ip_fanout_proto_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ill_t *ill,
}
if (mp1 == NULL) {
BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
- } else if (!canputnext(rq)) {
+ } else if (
+ (IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) ||
+ (!IPCL_IS_NONSTR(connp) && !canputnext(rq))) {
if (flags & IP_FF_RAWIP) {
BUMP_MIB(ill->ill_ip_mib,
rawipIfStatsInOverflows);
@@ -3320,7 +3343,9 @@ ip_fanout_proto_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ill_t *ill,
}
rq = connp->conn_rq;
- if (!canputnext(rq)) {
+ if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) ||
+ (!IPCL_IS_NONSTR(connp) && !canputnext(rq))) {
+
if (flags & IP_FF_RAWIP) {
BUMP_MIB(ill->ill_ip_mib, rawipIfStatsInOverflows);
} else {
@@ -3740,7 +3765,8 @@ ip_fanout_udp_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, uint32_t ports,
CONN_INC_REF(connp);
mutex_exit(&connfp->connf_lock);
- if (CONN_UDP_FLOWCTLD(connp)) {
+ if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) ||
+ (!IPCL_IS_NONSTR(connp) && CONN_UDP_FLOWCTLD(connp))) {
freemsg(first_mp);
CONN_DEC_REF(connp);
return;
@@ -3870,7 +3896,8 @@ ip_fanout_udp_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, uint32_t ports,
BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
goto next_one;
}
- if (CONN_UDP_FLOWCTLD(connp)) {
+ if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) ||
+ (!IPCL_IS_NONSTR(connp) && CONN_UDP_FLOWCTLD(connp))) {
BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows);
freemsg(first_mp1);
goto next_one;
@@ -3938,7 +3965,8 @@ next_one:
first_mp = mp;
}
}
- if (CONN_UDP_FLOWCTLD(connp)) {
+ if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) ||
+ (!IPCL_IS_NONSTR(connp) && CONN_UDP_FLOWCTLD(connp))) {
BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows);
freemsg(mp);
} else {
@@ -8397,7 +8425,8 @@ udp_fanout:
return;
}
- if (CONN_UDP_FLOWCTLD(connp)) {
+ if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) ||
+ (!IPCL_IS_NONSTR(connp) && CONN_UDP_FLOWCTLD(connp))) {
freemsg(first_mp);
BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows);
CONN_DEC_REF(connp);
@@ -9069,7 +9098,7 @@ done:
*
* case 1 : Routing header was processed by this node and
* ip_process_rthdr replaced ip6_dst with the next hop
- * and we are forwarding the packet to the next hop.
+ * and we are forwarding the packet to the next hop.
*
* case 2 : Routing header was not processed by this node and we
* are just forwarding the packet.
diff --git a/usr/src/uts/common/inet/ip/ip6_if.c b/usr/src/uts/common/inet/ip/ip6_if.c
index dc703f40c3..81447c2e30 100644
--- a/usr/src/uts/common/inet/ip/ip6_if.c
+++ b/usr/src/uts/common/inet/ip/ip6_if.c
@@ -284,6 +284,44 @@ repeat:
goto repeat;
}
+/*
+ * Report whether `addr' is configured on some IPv6 ipif that is visible
+ * from `zoneid'. An ipif is considered when zoneid is ALL_ZONES, or when
+ * the ipif's zone is either zoneid or ALL_ZONES. It matches if `addr' is
+ * its local address (and the ipif is not IPIF_UNNUMBERED) or if the ipif
+ * is IPIF_POINTOPOINT and `addr' is its destination address. The ipif is
+ * not required to be up.
+ *
+ * Walks the ills under ips_ill_g_lock (reader), taking each ill_lock while
+ * that ill's ipif list is scanned; no references are held on return.
+ */
+boolean_t
+ip_addr_exists_v6(const in6_addr_t *addr, zoneid_t zoneid,
+    ip_stack_t *ipst)
+{
+	ill_walk_context_t	walk;
+	ill_t			*cur_ill;
+	ipif_t			*cur_ipif;
+	boolean_t		found = B_FALSE;
+
+	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+
+	for (cur_ill = ILL_START_WALK_V6(&walk, ipst); cur_ill != NULL;
+	    cur_ill = ill_next(&walk, cur_ill)) {
+		mutex_enter(&cur_ill->ill_lock);
+		for (cur_ipif = cur_ill->ill_ipif; cur_ipif != NULL;
+		    cur_ipif = cur_ipif->ipif_next) {
+			/* Skip ipifs that belong to some other zone. */
+			if (zoneid != ALL_ZONES &&
+			    cur_ipif->ipif_zoneid != zoneid &&
+			    cur_ipif->ipif_zoneid != ALL_ZONES)
+				continue;
+
+			/* Allow the ipif to be down */
+			if (IN6_ARE_ADDR_EQUAL(&cur_ipif->ipif_v6lcl_addr,
+			    addr) &&
+			    !(cur_ipif->ipif_flags & IPIF_UNNUMBERED)) {
+				found = B_TRUE;
+				break;
+			}
+			if ((cur_ipif->ipif_flags & IPIF_POINTOPOINT) &&
+			    IN6_ARE_ADDR_EQUAL(&cur_ipif->ipif_v6pp_dst_addr,
+			    addr)) {
+				found = B_TRUE;
+				break;
+			}
+		}
+		mutex_exit(&cur_ill->ill_lock);
+		if (found)
+			break;
+	}
+
+	rw_exit(&ipst->ips_ill_g_lock);
+	return (found);
+}
+
/*
* Look for an ipif with the specified address. For point-point links
* we look for matches on either the destination address and the local
@@ -2237,7 +2275,6 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst,
dstinfo.dst_scope = ip_addr_scope_v6(dst);
dstinfo.dst_label = ip6_asp_lookup(dst, NULL, ipst);
dstinfo.dst_prefer_src_tmp = ((src_prefs & IPV6_PREFER_SRC_TMP) != 0);
-
rw_enter(&ipst->ips_ill_g_lock, RW_READER);
/*
* Section three of the I-D states that for multicast and
diff --git a/usr/src/uts/common/inet/ip/ip_helper_stream.c b/usr/src/uts/common/inet/ip/ip_helper_stream.c
new file mode 100644
index 0000000000..7da64667d1
--- /dev/null
+++ b/usr/src/uts/common/inet/ip/ip_helper_stream.c
@@ -0,0 +1,482 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <inet/ip.h>
+#include <inet/ip_impl.h>
+#include <inet/ipclassifier.h>
+#include <inet/proto_set.h>
+#include <sys/stream.h>
+#include <sys/strsubr.h>
+#include <sys/strsun.h>
+#include <sys/cmn_err.h>
+#include <sys/t_kuser.h>
+#include <sys/tihdr.h>
+#include <sys/pathname.h>
+#include <sys/sockio.h>
+#include <sys/vmem.h>
+#include <sys/disp.h>
+
+/* Write-side entry point of the helper stream; defined later in this file. */
+void ip_helper_wput(queue_t *q, mblk_t *mp);
+
+static int ip_helper_stream_close(queue_t *, int);
+
+/* Module information shared by both queues of a helper stream. */
+static struct module_info ip_helper_stream_info = {
+ 0, "iphelper", IP_MOD_MINPSZ, IP_MOD_MAXPSZ, IP_MOD_HIWAT, IP_MOD_LOWAT
+};
+
+/*
+ * qinit installed on the read queue by ip_helper_stream_setup().  Only
+ * ip_helper_stream_close is supplied; every other routine slot is NULL.
+ */
+static struct qinit ip_helper_stream_rinit = {
+ NULL, NULL, NULL, ip_helper_stream_close, NULL,
+ &ip_helper_stream_info, NULL
+};
+
+/*
+ * qinit installed on the write queue by ip_helper_stream_setup().
+ * ip_helper_wput and ip_wsrv fill the first two slots (presumably the put
+ * and service procedures, per the usual struct qinit layout -- confirm).
+ */
+static struct qinit ip_helper_stream_winit = {
+ (pfi_t)ip_helper_wput, (pfi_t)ip_wsrv, NULL, NULL, NULL,
+ &ip_helper_stream_info, NULL, NULL, NULL, STRUIOT_NONE
+};
+
+/*
+ * True when the helper stream kmem cache exists.  In that case helper
+ * streams are taken from the cache rather than opened for each conn_t.
+ */
+#define IP_USE_HELPER_CACHE (ip_helper_stream_cache != NULL)
+
+/*
+ * Handle the SIOCSQPTR ioctl on a helper stream.
+ *
+ * The ioctl payload (mp->b_cont) carries a single kernel pointer:
+ *  - with the helper cache in use, an ip_helper_stream_info_t *, in which
+ *    the stream's minor info and queue pointers are recorded;
+ *  - otherwise a conn_t *, whose conn_helper_info is filled in the same way
+ *    and which is then installed as q_ptr of both queues and wired to them
+ *    through conn_rq/conn_wq.
+ *
+ * Because the payload is dereferenced and written through as a kernel
+ * pointer, the request is rejected with EINVAL unless it was issued with
+ * kcred and actually carries at least a pointer's worth of data.  On
+ * success the ioctl is acknowledged with miocack().
+ */
+static void
+ip_helper_share_conn(queue_t *q, mblk_t *mp)
+{
+	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
+
+	/* Only the kernel may hand us a pointer to store through. */
+	if (iocp->ioc_cr != kcred || mp->b_cont == NULL ||
+	    MBLKL(mp->b_cont) < sizeof (void *)) {
+		miocnak(q, mp, 0, EINVAL);
+		return;
+	}
+
+	if (IP_USE_HELPER_CACHE) {
+		ip_helper_stream_info_t *ip_helper_info;
+
+		ip_helper_info = *((ip_helper_stream_info_t **)
+		    mp->b_cont->b_rptr);
+		ip_helper_info->ip_helper_stream_minfo = q->q_ptr;
+		ip_helper_info->ip_helper_stream_rq = RD(q);
+		ip_helper_info->ip_helper_stream_wq = WR(q);
+	} else {
+		conn_t *connp = *((conn_t **)mp->b_cont->b_rptr);
+
+		/* Save the minor info before q_ptr is overwritten below. */
+		connp->conn_helper_info->ip_helper_stream_minfo = q->q_ptr;
+		connp->conn_helper_info->ip_helper_stream_rq = RD(q);
+		connp->conn_helper_info->ip_helper_stream_wq = WR(q);
+		WR(q)->q_ptr = RD(q)->q_ptr = (void *)connp;
+		connp->conn_rq = RD(q);
+		connp->conn_wq = WR(q);
+	}
+	miocack(q, mp, 0, 0);
+}
+
+/*
+ * Write-side entry point of the helper stream.  An M_IOCTL carrying
+ * SIOCSQPTR is handled locally by ip_helper_share_conn(); any other
+ * message goes to ip_wput_v6() or ip_wput(), chosen by the address family
+ * of the conn_t stored in q_ptr.
+ */
+void
+ip_helper_wput(queue_t *q, mblk_t *mp)
+{
+	conn_t *connp;
+
+	if (DB_TYPE(mp) == M_IOCTL &&
+	    ((struct iocblk *)mp->b_rptr)->ioc_cmd == SIOCSQPTR) {
+		ip_helper_share_conn(q, mp);
+		return;
+	}
+
+	connp = (conn_t *)q->q_ptr;
+	if (connp->conn_af_isv6)
+		ip_wput_v6(q, mp);
+	else
+		ip_wput(q, mp);
+}
+
+/*
+ * Set up a newly opened IP helper stream.
+ *
+ * Allocates an ip_helper_minfo_t, reserves a minor number (from the large
+ * arena when it exists, otherwise from the small arena), stores the minfo
+ * in q_ptr of both queues, rewrites *devp with the new minor, installs the
+ * helper qinit structures and enables the queue procedures.
+ *
+ * q must be the read queue, and flag (ignoring FKLYR) must be
+ * IP_HELPER_STR.
+ *
+ * Returns 0 on success, ENOMEM if the minfo cannot be allocated, or EBUSY
+ * if no minor number is available.  Nothing stays allocated on failure.
+ */
+/* ARGSUSED */
+int
+ip_helper_stream_setup(queue_t *q, dev_t *devp, int flag, int sflag,
+    cred_t *credp, boolean_t isv6)
+{
+	major_t maj;
+	ip_helper_minfo_t *ip_minfop;
+
+	ASSERT((flag & ~(FKLYR)) == IP_HELPER_STR);
+
+	ASSERT(RD(q) == q);
+
+	ip_minfop = kmem_alloc(sizeof (ip_helper_minfo_t), KM_NOSLEEP);
+	if (ip_minfop == NULL) {
+		return (ENOMEM);
+	}
+
+	ip_minfop->ip_minfo_dev = 0;
+	ip_minfop->ip_minfo_arena = NULL;
+
+	/*
+	 * Clone the device, allocate minor device number
+	 */
+	if (ip_minor_arena_la != NULL)
+		ip_minfop->ip_minfo_dev = inet_minor_alloc(ip_minor_arena_la);
+
+	if (ip_minfop->ip_minfo_dev == 0) {
+		/*
+		 * numbers in the large arena are exhausted
+		 * Try small arena.
+		 * Or this is a 32 bit system, 32 bit systems do not have
+		 * ip_minor_arena_la
+		 */
+		ip_minfop->ip_minfo_dev = inet_minor_alloc(ip_minor_arena_sa);
+		if (ip_minfop->ip_minfo_dev == 0) {
+			/* Do not leak the minfo when no minor is available */
+			kmem_free(ip_minfop, sizeof (ip_helper_minfo_t));
+			return (EBUSY);
+		}
+		ip_minfop->ip_minfo_arena = ip_minor_arena_sa;
+	} else {
+		ip_minfop->ip_minfo_arena = ip_minor_arena_la;
+	}
+
+	ASSERT(ip_minfop->ip_minfo_dev != 0);
+	ASSERT(ip_minfop->ip_minfo_arena != NULL);
+
+	RD(q)->q_ptr = WR(q)->q_ptr = ip_minfop;
+
+	maj = getemajor(*devp);
+	*devp = makedevice(maj, (ulong_t)(ip_minfop->ip_minfo_dev));
+
+	q->q_qinfo = &ip_helper_stream_rinit;
+	WR(q)->q_qinfo = &ip_helper_stream_winit;
+	qprocson(q);
+	return (0);
+}
+
+/*
+ * Close routine of the helper stream.  Turns the queue procedures off,
+ * returns the minor number recorded in the ip_helper_minfo_t found in
+ * q_ptr to its arena, frees that structure and clears q_ptr on both
+ * queues.  Always returns 0.
+ */
+/* ARGSUSED */
+static int
+ip_helper_stream_close(queue_t *q, int flag)
+{
+	ip_helper_minfo_t *minfo;
+
+	qprocsoff(q);
+
+	minfo = q->q_ptr;
+	inet_minor_free(minfo->ip_minfo_arena, minfo->ip_minfo_dev);
+	kmem_free(minfo, sizeof (ip_helper_minfo_t));
+
+	RD(q)->q_ptr = NULL;
+	WR(q)->q_ptr = NULL;
+
+	return (0);
+}
+
+/*
+ * Public interface for creating an IP stream with shared conn_t
+ *
+ * With the helper cache in use, a pre-built ip_helper_stream_info_t is
+ * taken from ip_helper_stream_cache and its read/write queues are pointed
+ * at connp.  Otherwise a new stream is opened on DEV_IP or DEV_IP6 through
+ * the LDI and connp is handed to it with the SIOCSQPTR ioctl.
+ *
+ * Returns 0 on success or the error from ldi_open_by_name()/ldi_ioctl().
+ * On failure in the non-cached path conn_helper_info is freed and reset
+ * to NULL.  Must not be called from interrupt context.
+ */
+/* ARGSUSED */
+int
+ip_create_helper_stream(conn_t *connp, ldi_ident_t li)
+{
+ int error;
+ int ret;
+
+ ASSERT(!servicing_interrupt());
+
+ error = 0;
+ if (IP_USE_HELPER_CACHE) {
+ connp->conn_helper_info = (ip_helper_stream_info_t *)
+ kmem_cache_alloc(ip_helper_stream_cache, KM_SLEEP);
+ ASSERT(connp->conn_helper_info != NULL);
+ /* Attach the cached stream's queue pair to this conn_t. */
+ connp->conn_rq = connp->conn_helper_info->ip_helper_stream_rq;
+ connp->conn_wq = connp->conn_helper_info->ip_helper_stream_wq;
+ connp->conn_helper_info->ip_helper_stream_rq->q_ptr =
+ (void *)connp;
+ connp->conn_helper_info->ip_helper_stream_wq->q_ptr =
+ (void *)connp;
+ } else {
+ ASSERT(connp->conn_helper_info == NULL);
+ connp->conn_helper_info = (ip_helper_stream_info_t *)
+ kmem_alloc(sizeof (ip_helper_stream_info_t), KM_SLEEP);
+ /*
+ * Open the ip device via the layered interface.
+ * Pass in kcred, as some threads do not have the
+ * privilege to open /dev/ip and the check in
+ * secpolicy_spec_open() would fail the open.
+ */
+ error = ldi_open_by_name(connp->conn_af_isv6 ?
+ DEV_IP6 : DEV_IP, IP_HELPER_STR,
+ kcred, &connp->conn_helper_info->ip_helper_stream_handle,
+ li);
+
+ if (error != 0) {
+ kmem_free(connp->conn_helper_info,
+ (sizeof (ip_helper_stream_info_t)));
+ connp->conn_helper_info = NULL;
+ return (error);
+ }
+ /*
+ * Share connp with the helper stream
+ */
+ error = ldi_ioctl(
+ connp->conn_helper_info->ip_helper_stream_handle,
+ SIOCSQPTR, (intptr_t)connp, FKIOCTL, kcred, &ret);
+
+ if (error != 0) {
+ /*
+ * Passing in a zero flag indicates that an error
+ * occurred and the stream was not shared
+ */
+ (void) ldi_close(
+ connp->conn_helper_info->ip_helper_stream_handle,
+ 0, kcred);
+ kmem_free(connp->conn_helper_info,
+ (sizeof (ip_helper_stream_info_t)));
+ connp->conn_helper_info = NULL;
+ }
+ }
+ return (error);
+}
+
+/*
+ * Public interface for closing the shared IP stream
+ *
+ * With the helper cache in use, the stream is quiesced, flushed, detached
+ * from connp (q_ptr cleared on both queues) and returned to
+ * ip_helper_stream_cache.  Otherwise q_ptr on both queues is restored to
+ * the saved minor info, the LDI handle is closed and the
+ * ip_helper_stream_info_t is freed.  In both cases conn_helper_info is
+ * NULL on return.  Must not be called from interrupt context.
+ */
+/* ARGSUSED */
+void
+ip_close_helper_stream(conn_t *connp)
+{
+ ASSERT(!servicing_interrupt());
+ if (IP_USE_HELPER_CACHE) {
+ ASSERT(connp->conn_helper_info->ip_helper_stream_rq != NULL);
+ ASSERT(connp->conn_helper_info->ip_helper_stream_wq != NULL);
+
+ /*
+ * NOTE(review): only the read queue is passed to disable_svc(),
+ * wait_svc() and enable_svc(); presumably they act on the whole
+ * queue pair -- confirm.
+ */
+ /* Prevent service procedures from being called */
+ disable_svc(connp->conn_helper_info->ip_helper_stream_rq);
+
+ /* Wait until service procedure of each queue is run */
+ wait_svc(connp->conn_helper_info->ip_helper_stream_rq);
+
+ /* Cleanup any pending ioctls */
+ conn_ioctl_cleanup(connp);
+
+ /* Allow service procedures to be called again */
+ enable_svc(connp->conn_helper_info->ip_helper_stream_rq);
+
+ /* Flush the queues */
+ flushq(connp->conn_helper_info->ip_helper_stream_rq, FLUSHALL);
+ flushq(connp->conn_helper_info->ip_helper_stream_wq, FLUSHALL);
+
+ /* Detach the cached stream from this conn_t before reuse */
+ connp->conn_helper_info->ip_helper_stream_rq->q_ptr = NULL;
+ connp->conn_helper_info->ip_helper_stream_wq->q_ptr = NULL;
+
+ kmem_cache_free(ip_helper_stream_cache,
+ connp->conn_helper_info);
+ } else {
+ ASSERT(
+ connp->conn_helper_info->ip_helper_stream_handle != NULL);
+
+ /*
+ * Put the saved minor info back into q_ptr, which
+ * ip_helper_share_conn() replaced with connp, because
+ * ip_helper_stream_close() reads the minor info from q_ptr.
+ */
+ connp->conn_helper_info->ip_helper_stream_rq->q_ptr =
+ connp->conn_helper_info->ip_helper_stream_wq->q_ptr =
+ connp->conn_helper_info->ip_helper_stream_minfo;
+ (void) ldi_close(
+ connp->conn_helper_info->ip_helper_stream_handle,
+ IP_HELPER_STR, kcred);
+ kmem_free(connp->conn_helper_info,
+ sizeof (ip_helper_stream_info_t));
+ }
+ connp->conn_helper_info = NULL;
+}
+
+/*
+ * Build a T_SVR4_OPTMGMT_REQ TPI message carrying a single option and send
+ * it down the conn's helper stream.
+ *
+ * The message is laid out as a struct T_optmgmt_req, followed by a struct
+ * opthdr, followed by 'optlen' bytes of option value.  The value is copied
+ * from 'optval', or zero-filled when 'optval' is NULL.
+ *
+ * Returns ENOMEM if the message cannot be allocated, otherwise the result
+ * of ldi_putmsg().
+ */
+static int
+ip_send_option_request(conn_t *connp, uint_t optset_context, int level,
+    int option_name, const void *optval, t_uscalar_t optlen, cred_t *cr)
+{
+	struct T_optmgmt_req	*req;
+	struct opthdr		*hdr;
+	uchar_t			*value;
+	mblk_t			*mp;
+	ssize_t			size;
+
+	size = sizeof (struct T_optmgmt_req) + sizeof (struct opthdr) + optlen;
+	if ((mp = allocb_cred(size, cr)) == NULL)
+		return (ENOMEM);
+
+	mp->b_datap->db_type = M_PROTO;
+
+	/* Locate the three parts of the message in the fresh buffer. */
+	req = (struct T_optmgmt_req *)mp->b_wptr;
+	hdr = (struct opthdr *)(mp->b_wptr + sizeof (struct T_optmgmt_req));
+	value = (uchar_t *)hdr + sizeof (struct opthdr);
+
+	req->PRIM_type = T_SVR4_OPTMGMT_REQ;
+	req->MGMT_flags = optset_context;
+	req->OPT_length = (t_scalar_t)sizeof (struct opthdr) + optlen;
+	req->OPT_offset = (t_scalar_t)sizeof (struct T_optmgmt_req);
+
+	hdr->level = level;
+	hdr->name = option_name;
+	hdr->len = optlen;
+
+	if (optval == NULL)
+		bzero(value, optlen);
+	else
+		bcopy(optval, value, optlen);
+
+	mp->b_wptr = value + optlen;
+
+	/*
+	 * Send down the primitive
+	 */
+	return (ldi_putmsg(connp->conn_helper_info->ip_helper_stream_handle,
+	    mp));
+}
+
+/*
+ * Wait for and process the response to a T_SVR4_OPTMGMT_REQ TPI message
+ * previously sent on the conn's helper stream.
+ *
+ * optset_context is T_CHECK or T_NEGOTIATE.  For T_CHECK the returned
+ * option value is copied into 'optval', truncated to the caller's
+ * *optlenp, and *optlenp is updated with the number of bytes copied.
+ * For T_NEGOTIATE only the status is examined.
+ *
+ * Returns 0 on success; the error from ldi_getmsg(); the error carried by
+ * a T_ERROR_ACK (UNIX_error for TSYSERR, otherwise the TLI error mapped
+ * through proto_tlitosyserr()); or EPROTO if the reply is malformed.
+ */
+static int
+ip_get_option_response(conn_t *connp, uint_t optset_context, void *optval,
+    t_uscalar_t *optlenp)
+{
+	union T_primitives *tpr;
+	int error;
+	mblk_t *mp;
+
+	mp = NULL;
+
+	ASSERT(optset_context == T_CHECK || optset_context == T_NEGOTIATE);
+	error = ldi_getmsg(connp->conn_helper_info->ip_helper_stream_handle,
+	    &mp, NULL);
+	if (error != 0) {
+		return (error);
+	}
+
+	if (DB_TYPE(mp) != M_PCPROTO || MBLKL(mp) < sizeof (tpr->type)) {
+		error = EPROTO;
+		goto done;
+	}
+
+	tpr = (union T_primitives *)mp->b_rptr;
+
+	switch (tpr->type) {
+	case T_OPTMGMT_ACK:
+		if (MBLKL(mp) < TOPTMGMTACKSZ)
+			error = EPROTO;
+		break;
+	case T_ERROR_ACK:
+		if (MBLKL(mp) < TERRORACKSZ) {
+			error = EPROTO;
+			break;
+		}
+
+		if (tpr->error_ack.TLI_error == TSYSERR)
+			error = tpr->error_ack.UNIX_error;
+		else
+			error = proto_tlitosyserr(tpr->error_ack.TLI_error);
+		break;
+	default:
+		error = EPROTO;
+		break;
+	}
+
+	if ((optset_context == T_CHECK) && (error == 0)) {
+		struct opthdr *opt_res;
+		t_uscalar_t len;
+		t_uscalar_t size;
+		t_uscalar_t maxlen = *optlenp;
+		void *option;
+		struct T_optmgmt_ack *optmgmt_ack;
+
+		optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
+
+		/*
+		 * Check mblk boundary.  The option area must lie inside the
+		 * message and be large enough to hold an opthdr, since the
+		 * header is read below.
+		 */
+		if (optmgmt_ack->OPT_offset < 0 ||
+		    optmgmt_ack->OPT_length <
+		    (t_scalar_t)sizeof (struct opthdr) ||
+		    !MBLKIN(mp, optmgmt_ack->OPT_offset,
+		    optmgmt_ack->OPT_length)) {
+			error = EPROTO;
+			goto done;
+		}
+
+		opt_res = (struct opthdr *)
+		    ((uintptr_t)mp->b_rptr + optmgmt_ack->OPT_offset);
+
+		/*
+		 * Check alignment
+		 */
+		if ((((uintptr_t)opt_res) & (__TPI_ALIGN_SIZE - 1)) != 0) {
+			error = EPROTO;
+			goto done;
+		}
+
+		option = &opt_res[1];
+
+		/*
+		 * Check that the option value is within bounds.  The value
+		 * starts right after the opthdr, i.e. at OPT_offset +
+		 * sizeof (struct opthdr) from b_rptr, so that is the offset
+		 * that has to be validated.
+		 */
+		if ((((uintptr_t)option + opt_res->len) < (uintptr_t)option) ||
+		    !MBLKIN(mp,
+		    optmgmt_ack->OPT_offset + sizeof (struct opthdr),
+		    opt_res->len)) {
+			error = EPROTO;
+			goto done;
+		}
+
+		len = opt_res->len;
+		size = MIN(len, maxlen);
+
+		/*
+		 * Copy data, never more than the caller's buffer can hold
+		 */
+		bcopy(option, optval, size);
+		bcopy(&size, optlenp, sizeof (size));
+	}
+
+done:
+	freemsg(mp);
+	return (error);
+}
+
+/*
+ * Public interface to get socket options via the ip helper stream.
+ *
+ * Sends a T_CHECK option request of *optlenp bytes with no input value,
+ * then collects the reply into 'optval' and updates *optlenp.  Returns 0
+ * or the error from the request or response step.
+ */
+int
+ip_get_options(conn_t *connp, int level, int option_name, void *optval,
+    t_uscalar_t *optlenp, cred_t *cr)
+{
+	int rc;
+
+	rc = ip_send_option_request(connp, T_CHECK, level, option_name, NULL,
+	    *optlenp, cr);
+	if (rc == 0)
+		rc = ip_get_option_response(connp, T_CHECK, optval, optlenp);
+
+	return (rc);
+}
+
+/*
+ * Public interface to set socket options via the ip helper stream.
+ *
+ * Sends a T_NEGOTIATE option request carrying 'optval'/'optlen' and then
+ * waits for the reply.  Returns 0 or the error from the request or
+ * response step.
+ */
+int
+ip_set_options(conn_t *connp, int level, int option_name, const void *optval,
+    t_uscalar_t optlen, cred_t *cr)
+{
+	int rc;
+
+	rc = ip_send_option_request(connp, T_NEGOTIATE, level, option_name,
+	    optval, optlen, cr);
+	if (rc == 0) {
+		rc = ip_get_option_response(connp, T_NEGOTIATE,
+		    (void *)optval, &optlen);
+	}
+
+	return (rc);
+}
diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c
index d767b25a76..0597245499 100644
--- a/usr/src/uts/common/inet/ip/ip_if.c
+++ b/usr/src/uts/common/inet/ip/ip_if.c
@@ -5845,6 +5845,55 @@ repeat:
}
/*
+ * Check whether the IPv4 address 'addr' exists in the system, i.e. is
+ * configured on some ipif visible to 'zoneid' in the given IP stack.
+ * We don't hold the conn_lock as we will not perform a deferred ipsqueue
+ * operation.
+ *
+ * Returns B_TRUE on the first match, B_FALSE if no ipif matches.
+ */
+boolean_t
+ip_addr_exists(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst)
+{
+	ill_walk_context_t	walk;
+	ill_t			*cur_ill;
+	ipif_t			*cur_ipif;
+	boolean_t		found = B_FALSE;
+
+	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+
+	for (cur_ill = ILL_START_WALK_V4(&walk, ipst); cur_ill != NULL;
+	    cur_ill = ill_next(&walk, cur_ill)) {
+		mutex_enter(&cur_ill->ill_lock);
+		for (cur_ipif = cur_ill->ill_ipif; cur_ipif != NULL;
+		    cur_ipif = cur_ipif->ipif_next) {
+			/* Skip ipifs the requested zone cannot see. */
+			if (!(zoneid == ALL_ZONES ||
+			    zoneid == cur_ipif->ipif_zoneid ||
+			    cur_ipif->ipif_zoneid == ALL_ZONES))
+				continue;
+
+			/*
+			 * Allow the ipif to be down, and allow bind() to
+			 * succeed even if the ipif has the IPIF_CHANGING
+			 * bit set.
+			 *
+			 * XXX Unlike ipif_lookup_addr(), we do not look the
+			 * address up twice.  From bind()'s point of view we
+			 * may return as soon as we find a match.
+			 */
+			if ((cur_ipif->ipif_lcl_addr == addr) &&
+			    ((cur_ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) {
+				found = B_TRUE;
+				break;
+			}
+			if ((cur_ipif->ipif_flags & IPIF_POINTOPOINT) &&
+			    (cur_ipif->ipif_pp_dst_addr == addr)) {
+				found = B_TRUE;
+				break;
+			}
+		}
+		mutex_exit(&cur_ill->ill_lock);
+		if (found)
+			break;
+	}
+
+	rw_exit(&ipst->ips_ill_g_lock);
+	return (found);
+}
+
+/*
* Look for an ipif with the specified address. For point-point links
* we look for matches on either the destination address and the local
* address, but we ignore the check on the local address if IPIF_UNNUMBERED
@@ -22145,7 +22194,6 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
usesrc_ill = ill_lookup_on_ifindex(ifindex, isv6, q, mp,
ip_process_ioctl, &err, ipst);
-
if (usesrc_ill == NULL) {
return (err);
}
diff --git a/usr/src/uts/common/inet/ip/ip_opt_data.c b/usr/src/uts/common/inet/ip/ip_opt_data.c
index 3df66ece60..bb6e98a99e 100644
--- a/usr/src/uts/common/inet/ip/ip_opt_data.c
+++ b/usr/src/uts/common/inet/ip/ip_opt_data.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/stream.h>
#define _SUN_TPI_VERSION 2
@@ -45,7 +43,7 @@ extern int ip_opt_default(queue_t *q, int level, int name, uchar_t *ptr);
extern int ip_opt_get(queue_t *q, int level, int name, uchar_t *ptr);
extern int ip_opt_set(queue_t *q, uint_t optset_context, int level,
int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
- void *, cred_t *cr, mblk_t *);
+ void *dummy, cred_t *cr, mblk_t *first_mp);
/*
* Table of all known options handled on a IP protocol stack.
@@ -71,9 +69,11 @@ opdes_t ip_opt_arr[] = {
{ IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
- (OP_VARLEN|OP_NODEFAULT), 40, -1 /* not initialized */ },
+ (OP_VARLEN|OP_NODEFAULT),
+ IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
{ T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
- (OP_VARLEN|OP_NODEFAULT), 40, -1 /* not initialized */ },
+ (OP_VARLEN|OP_NODEFAULT),
+ IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
{ IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
diff --git a/usr/src/uts/common/inet/ip/ip_rts.c b/usr/src/uts/common/inet/ip/ip_rts.c
index e232f6c04e..3324d1d833 100644
--- a/usr/src/uts/common/inet/ip/ip_rts.c
+++ b/usr/src/uts/common/inet/ip/ip_rts.c
@@ -1,5 +1,5 @@
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -38,8 +38,6 @@
* @(#)rtsock.c 8.6 (Berkeley) 2/11/95
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* This file contains routines that processes routing socket requests.
*/
@@ -104,10 +102,9 @@ static void ip_rts_request_retry(ipsq_t *, queue_t *q, mblk_t *mp, void *);
*
*/
void
-rts_queue_input(mblk_t *mp, queue_t *q, sa_family_t af, ip_stack_t *ipst)
+rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, ip_stack_t *ipst)
{
mblk_t *mp1;
- int checkqfull;
conn_t *connp, *next_connp;
mutex_enter(&ipst->ips_rts_clients->connf_lock);
@@ -130,24 +127,16 @@ rts_queue_input(mblk_t *mp, queue_t *q, sa_family_t af, ip_stack_t *ipst)
* socket, we check if there is room upstream for a copy of the
* message.
*/
- if ((q != NULL) && (CONNP_TO_RQ(connp) == RD(q))) {
- if (connp->conn_loopback == 0) {
+ if ((o_connp == connp) && connp->conn_loopback == 0) {
connp = connp->conn_next;
continue;
- }
- /*
- * Just because it is the same queue doesn't mean it
- * will promptly read its acks. Have to avoid using
- * all of kernel memory.
- */
- checkqfull = B_TRUE;
- } else {
- checkqfull = B_TRUE;
}
CONN_INC_REF(connp);
mutex_exit(&ipst->ips_rts_clients->connf_lock);
/* Pass to rts_input */
- if (!checkqfull || canputnext(CONNP_TO_RQ(connp))) {
+ if ((IPCL_IS_NONSTR(connp) && !PROTO_FLOW_CNTRLD(connp))||
+ (!IPCL_IS_NONSTR(connp) &&
+ canputnext(CONNP_TO_RQ(connp)))) {
mp1 = dupmsg(mp);
if (mp1 == NULL)
mp1 = copymsg(mp);
@@ -273,7 +262,7 @@ ip_rts_unregister(conn_t *connp)
* conn close occurs in conn_ioctl_cleanup.
*/
int
-ip_rts_request(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
+ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *connp, cred_t *ioc_cr)
{
rt_msghdr_t *rtm = NULL;
in6_addr_t dst_addr_v6;
@@ -298,7 +287,6 @@ ip_rts_request(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
ipif_t *ipif = NULL;
ipif_t *tmp_ipif = NULL;
IOCP iocp = (IOCP)mp->b_rptr;
- conn_t *connp;
boolean_t gcgrp_xtraref = B_FALSE;
tsol_gcgrp_addr_t ga;
tsol_rtsecattr_t rtsecattr;
@@ -311,8 +299,6 @@ ip_rts_request(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
ip1dbg(("ip_rts_request: mp is %x\n", DB_TYPE(mp)));
- ASSERT(CONN_Q(q));
- connp = Q_TO_CONN(q);
zoneid = connp->conn_zoneid;
ipst = connp->conn_netstack->netstack_ip;
@@ -564,7 +550,7 @@ ip_rts_request(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
error = ip_rt_add(dst_addr, net_mask, gw_addr, src_addr,
rtm->rtm_flags, ipif, &ire, B_FALSE,
- CONNP_TO_WQ(connp), ioc_mp, ip_rts_request_retry,
+ WR(q), ioc_mp, ip_rts_request_retry,
rtsap, ipst);
if (ipif != NULL)
ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
@@ -602,7 +588,7 @@ ip_rts_request(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
error = ip_rt_add_v6(&dst_addr_v6, &net_mask_v6,
&gw_addr_v6, &src_addr_v6, rtm->rtm_flags,
- ipif, &ire, CONNP_TO_WQ(connp), ioc_mp,
+ ipif, &ire, WR(q), ioc_mp,
ip_rts_request_retry, rtsap, ipst);
break;
}
@@ -616,7 +602,7 @@ ip_rts_request(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
}
error = ip_rt_add_v6(&dst_addr_v6, &net_mask_v6,
&gw_addr_v6, NULL, rtm->rtm_flags,
- ipif, &ire, CONNP_TO_WQ(connp), ioc_mp,
+ ipif, &ire, WR(q), ioc_mp,
ip_rts_request_retry, rtsap, ipst);
if (ipif != NULL)
ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock));
@@ -646,14 +632,12 @@ ip_rts_request(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
case AF_INET:
error = ip_rt_delete(dst_addr, net_mask, gw_addr,
found_addrs, rtm->rtm_flags, ipif, B_FALSE,
- CONNP_TO_WQ(connp), ioc_mp, ip_rts_request_retry,
- ipst);
+ WR(q), ioc_mp, ip_rts_request_retry, ipst);
break;
case AF_INET6:
error = ip_rt_delete_v6(&dst_addr_v6, &net_mask_v6,
&gw_addr_v6, found_addrs, rtm->rtm_flags, ipif,
- CONNP_TO_WQ(connp), ioc_mp, ip_rts_request_retry,
- ipst);
+ WR(q), ioc_mp, ip_rts_request_retry, ipst);
break;
}
break;
@@ -867,7 +851,7 @@ ip_rts_request(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
*/
tmp_ipif = ipif_lookup_addr(
src_addr, NULL, ALL_ZONES,
- CONNP_TO_WQ(connp), ioc_mp,
+ WR(q), ioc_mp,
ip_rts_request_retry,
&error, ipst);
if (tmp_ipif == NULL) {
@@ -1053,19 +1037,27 @@ done:
/* OK ACK already set up by caller except this */
ip2dbg(("ip_rts_request: OK ACK\n"));
}
- rts_queue_input(mp, q, af, ipst);
+ rts_queue_input(mp, connp, af, ipst);
}
+
iocp->ioc_error = error;
ioc_mp->b_datap->db_type = M_IOCACK;
if (iocp->ioc_error != 0)
iocp->ioc_count = 0;
(connp->conn_recv)(connp, ioc_mp, NULL);
+
/* conn was refheld in ip_wput_ioctl. */
CONN_OPER_PENDING_DONE(connp);
return (error);
}
+/*
+ * Queue-based entry point for routing socket requests: looks up the conn_t
+ * attached to 'q' and hands the request to ip_rts_request_common().
+ */
+int
+ip_rts_request(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
+{
+	conn_t *connp = Q_TO_CONN(q);
+
+	return (ip_rts_request_common(q, mp, connp, ioc_cr));
+}
+
/*
* Build a reply to the RTM_GET request contained in the given message block
* using the retrieved IRE of the destination address, the parent IRE (if it
diff --git a/usr/src/uts/common/inet/ip/ipclassifier.c b/usr/src/uts/common/inet/ip/ipclassifier.c
index a19e729b41..50bd38c981 100644
--- a/usr/src/uts/common/inet/ip/ipclassifier.c
+++ b/usr/src/uts/common/inet/ip/ipclassifier.c
@@ -261,8 +261,8 @@
#include <inet/ip.h>
#include <inet/ip6.h>
-#include <inet/tcp.h>
#include <inet/ip_ndp.h>
+#include <inet/ip_impl.h>
#include <inet/udp_impl.h>
#include <inet/sctp_ip.h>
#include <inet/sctp/sctp_impl.h>
@@ -272,9 +272,11 @@
#include <sys/cpuvar.h>
#include <inet/ipclassifier.h>
+#include <inet/tcp.h>
#include <inet/ipsec_impl.h>
#include <sys/tsol/tnet.h>
+#include <sys/sockio.h>
#ifdef DEBUG
#define IPCL_DEBUG
@@ -325,6 +327,7 @@ typedef union itc_s {
struct kmem_cache *tcp_conn_cache;
struct kmem_cache *ip_conn_cache;
+struct kmem_cache *ip_helper_stream_cache;
extern struct kmem_cache *sctp_conn_cache;
extern struct kmem_cache *tcp_sack_info_cache;
extern struct kmem_cache *tcp_iphc_cache;
@@ -350,6 +353,11 @@ static void rawip_conn_destructor(void *, void *);
static int rts_conn_constructor(void *, void *, int);
static void rts_conn_destructor(void *, void *);
+static int ip_helper_stream_constructor(void *, void *, int);
+static void ip_helper_stream_destructor(void *, void *);
+
+boolean_t ip_use_helper_cache = B_TRUE;
+
#ifdef IPCL_DEBUG
#define INET_NTOA_BUFSIZE 18
@@ -394,6 +402,15 @@ ipcl_g_init(void)
sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE,
rts_conn_constructor, rts_conn_destructor,
NULL, NULL, NULL, 0);
+
+ if (ip_use_helper_cache) {
+ ip_helper_stream_cache = kmem_cache_create
+ ("ip_helper_stream_cache", sizeof (ip_helper_stream_info_t),
+ CACHE_ALIGN_SIZE, ip_helper_stream_constructor,
+ ip_helper_stream_destructor, NULL, NULL, NULL, 0);
+ } else {
+ ip_helper_stream_cache = NULL;
+ }
}
/*
@@ -749,6 +766,7 @@ ipcl_conn_destroy(conn_t *connp)
connp->conn_netstack = NULL;
netstack_rele(ns);
}
+
ipcl_conn_cleanup(connp);
/* leave conn_priv aka conn_udp, conn_icmp, etc in place. */
@@ -756,6 +774,7 @@ ipcl_conn_destroy(conn_t *connp)
connp->conn_flags = IPCL_UDPCONN;
kmem_cache_free(udp_conn_cache, connp);
} else if (connp->conn_flags & IPCL_RAWIPCONN) {
+
connp->conn_flags = IPCL_RAWIPCONN;
connp->conn_ulp = IPPROTO_ICMP;
kmem_cache_free(rawip_conn_cache, connp);
@@ -2025,6 +2044,7 @@ tcp_conn_constructor(void *buf, void *cdrarg, int kmflags)
mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL);
tcp->tcp_timercache = tcp_timermp_alloc(KM_NOSLEEP);
connp->conn_tcp = tcp;
connp->conn_flags = IPCL_TCPCONN;
@@ -2047,6 +2067,7 @@ tcp_conn_destructor(void *buf, void *cdrarg)
tcp_timermp_free(tcp);
mutex_destroy(&connp->conn_lock);
cv_destroy(&connp->conn_cv);
+ cv_destroy(&connp->conn_sq_cv);
}
/* ARGSUSED */
@@ -2181,15 +2202,56 @@ rts_conn_destructor(void *buf, void *cdrarg)
cv_destroy(&connp->conn_cv);
}
+/*
+ * kmem cache constructor for ip_helper_stream_info_t.
+ *
+ * Opens DEV_IP as a helper stream through the LDI using kcred, then passes
+ * the buffer's own address down with SIOCSQPTR so that the stream can
+ * record its queues and minor info in it.  If the ioctl fails the stream
+ * is closed again.  The netstack hold taken by netstack_find_by_cred() is
+ * released before returning.
+ *
+ * Returns 0 on success or the error from ldi_open_by_name()/ldi_ioctl().
+ */
+/* ARGSUSED */
+int
+ip_helper_stream_constructor(void *buf, void *cdrarg, int kmflags)
+{
+	ip_helper_stream_info_t	*helper = (ip_helper_stream_info_t *)buf;
+	netstack_t		*ns;
+	tcp_stack_t		*tcps;
+	ip_stack_t		*ipst;
+	int			rval;
+	int			error;
+
+	ns = netstack_find_by_cred(kcred);
+	ASSERT(ns != NULL);
+	tcps = ns->netstack_tcp;
+	ipst = ns->netstack_ip;
+	ASSERT(tcps != NULL);
+
+	error = ldi_open_by_name(DEV_IP, IP_HELPER_STR, kcred,
+	    &helper->ip_helper_stream_handle, ipst->ips_ldi_ident);
+	if (error == 0) {
+		error = ldi_ioctl(helper->ip_helper_stream_handle,
+		    SIOCSQPTR, (intptr_t)buf, FKIOCTL, kcred, &rval);
+		if (error != 0) {
+			/* The stream could not be shared; close it. */
+			(void) ldi_close(helper->ip_helper_stream_handle, 0,
+			    kcred);
+		}
+	}
+
+	netstack_rele(ipst->ips_netstack);
+	return (error);
+}
+
+/*
+ * kmem cache destructor for ip_helper_stream_info_t.  Puts the saved minor
+ * info back into q_ptr of both queues and closes the helper stream.
+ */
+/* ARGSUSED */
+static void
+ip_helper_stream_destructor(void *buf, void *cdrarg)
+{
+	ip_helper_stream_info_t *helper = (ip_helper_stream_info_t *)buf;
+
+	helper->ip_helper_stream_wq->q_ptr = helper->ip_helper_stream_minfo;
+	helper->ip_helper_stream_rq->q_ptr = helper->ip_helper_stream_minfo;
+
+	(void) ldi_close(helper->ip_helper_stream_handle, 0, kcred);
+}
+
+
/*
* Called as part of ipcl_conn_destroy to assert and clear any pointers
* in the conn_t.
- *
- * Below we list all the pointers in the conn_t as a documentation aid.
- * The ones that we can not ASSERT to be NULL are #ifdef'ed out.
- * If you add any pointers to the conn_t please add an ASSERT here
- * and #ifdef it out if it can't be actually asserted to be NULL.
- * In any case, we bzero most of the conn_t at the end of the function.
*/
void
ipcl_conn_cleanup(conn_t *connp)
@@ -2197,7 +2259,6 @@ ipcl_conn_cleanup(conn_t *connp)
ASSERT(connp->conn_ire_cache == NULL);
ASSERT(connp->conn_latch == NULL);
#ifdef notdef
- /* These are not cleared */
ASSERT(connp->conn_rq == NULL);
ASSERT(connp->conn_wq == NULL);
#endif
@@ -2236,11 +2297,11 @@ ipcl_conn_cleanup(conn_t *connp)
ASSERT(connp->conn_peercred == NULL);
ASSERT(connp->conn_netstack == NULL);
+ ASSERT(connp->conn_helper_info == NULL);
/* Clear out the conn_t fields that are not preserved */
bzero(&connp->conn_start_clr,
sizeof (conn_t) -
((uchar_t *)&connp->conn_start_clr - (uchar_t *)connp));
-
}
/*
diff --git a/usr/src/uts/common/inet/ip/keysock.c b/usr/src/uts/common/inet/ip/keysock.c
index c982fb4c45..af0fd73d63 100644
--- a/usr/src/uts/common/inet/ip/keysock.c
+++ b/usr/src/uts/common/inet/ip/keysock.c
@@ -59,7 +59,7 @@
#include <inet/common.h>
#include <netinet/ip6.h>
#include <inet/ip.h>
-#include <inet/mi.h>
+#include <inet/proto_set.h>
#include <inet/nd.h>
#include <inet/optcom.h>
#include <inet/ipsec_info.h>
@@ -707,7 +707,8 @@ keysock_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
mutex_exit(&keystack->keystack_list_lock);
qprocson(q);
- (void) mi_set_sth_hiwat(q, keystack->keystack_recv_hiwat);
+ (void) proto_set_rx_hiwat(q, NULL,
+ keystack->keystack_recv_hiwat);
/*
* Wait outside the keysock module perimeter for IPsec
* plumbing to be completed. If it fails, keysock_close()
@@ -875,7 +876,7 @@ keysock_opt_set(queue_t *q, uint_t mgmt_flags, int level,
if (*i1 > keystack->keystack_max_buf)
return (ENOBUFS);
RD(q)->q_hiwat = *i1;
- (void) mi_set_sth_hiwat(RD(q), *i1);
+ (void) proto_set_rx_hiwat(RD(q), NULL, *i1);
break;
}
mutex_exit(&ks->keysock_lock);
diff --git a/usr/src/uts/common/inet/ip/rts.c b/usr/src/uts/common/inet/ip/rts.c
index 350a5fa887..7965d37483 100644
--- a/usr/src/uts/common/inet/ip/rts.c
+++ b/usr/src/uts/common/inet/ip/rts.c
@@ -23,8 +23,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
@@ -41,15 +39,17 @@
#include <sys/suntpi.h>
#include <sys/policy.h>
#include <sys/zone.h>
+#include <sys/disp.h>
#include <sys/socket.h>
+#include <sys/socketvar.h>
#include <netinet/in.h>
#include <inet/common.h>
#include <netinet/ip6.h>
#include <inet/ip.h>
#include <inet/ipclassifier.h>
-#include <inet/mi.h>
+#include <inet/proto_set.h>
#include <inet/nd.h>
#include <inet/optcom.h>
#include <netinet/ip_mroute.h>
@@ -111,20 +111,10 @@ static rtsparam_t lcl_param_arr[] = {
#define rtss_recv_hiwat rtss_params[2].rts_param_value
#define rtss_max_buf rtss_params[3].rts_param_value
-static int rts_close(queue_t *q);
static void rts_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
int sys_error);
static void rts_input(void *, mblk_t *, void *);
static mblk_t *rts_ioctl_alloc(mblk_t *data, cred_t *cr);
-static int rts_open(queue_t *q, dev_t *devp, int flag, int sflag,
- cred_t *credp);
-int rts_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name,
- uchar_t *ptr);
-int rts_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name,
- uchar_t *ptr);
-int rts_opt_set(queue_t *q, uint_t optset_context, int level,
- int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
- uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk);
static int rts_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
static boolean_t rts_param_register(IDP *ndp, rtsparam_t *rtspa, int cnt);
static int rts_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
@@ -137,12 +127,21 @@ static void rts_wput_iocdata(queue_t *q, mblk_t *mp);
static void rts_wput_other(queue_t *q, mblk_t *mp);
static int rts_wrw(queue_t *q, struiod_t *dp);
+static int rts_stream_open(queue_t *q, dev_t *devp, int flag, int sflag,
+ cred_t *credp);
+static conn_t *rts_open(int flag, cred_t *credp);
+
+static int rts_stream_close(queue_t *q);
+static int rts_close(sock_lower_handle_t proto_handle, int flags,
+ cred_t *cr);
+
static struct module_info rts_mod_info = {
129, "rts", 1, INFPSZ, 512, 128
};
static struct qinit rtsrinit = {
- NULL, (pfi_t)rts_rsrv, rts_open, rts_close, NULL, &rts_mod_info
+ NULL, (pfi_t)rts_rsrv, rts_stream_open, rts_stream_close, NULL,
+ &rts_mod_info
};
static struct qinit rtswinit = {
@@ -201,9 +200,8 @@ rts_ioctl_alloc(mblk_t *data, cred_t *cr)
* internal datastructure.
*/
static int
-rts_close(queue_t *q)
+rts_common_close(queue_t *q, conn_t *connp)
{
- conn_t *connp = Q_TO_CONN(q);
ASSERT(connp != NULL && IPCL_IS_RTS(connp));
@@ -211,25 +209,39 @@ rts_close(queue_t *q)
ip_quiesce_conn(connp);
- qprocsoff(q);
+ if (!IPCL_IS_NONSTR(connp)) {
+ qprocsoff(q);
- /*
- * Now we are truly single threaded on this stream, and can
- * delete the things hanging off the connp, and finally the connp.
- * We removed this connp from the fanout list, it cannot be
- * accessed thru the fanouts, and we already waited for the
- * conn_ref to drop to 0. We are already in close, so
- * there cannot be any other thread from the top. qprocsoff
- * has completed, and service has completed or won't run in
- * future.
- */
+ /*
+ * Now we are truly single threaded on this stream, and can
+ * delete the things hanging off the connp, and finally the
+ * connp.
+ * We removed this connp from the fanout list, it cannot be
+ * accessed thru the fanouts, and we already waited for the
+ * conn_ref to drop to 0. We are already in close, so
+ * there cannot be any other thread from the top. qprocsoff
+ * has completed, and service has completed or won't run in
+ * future.
+ */
+ inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
+ } else {
+ ip_close_helper_stream(connp);
+ }
ASSERT(connp->conn_ref == 1);
- inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
connp->conn_ref--;
ipcl_conn_destroy(connp);
+ return (0);
+}
+
+/*
+ * STREAMS close routine for rts (installed in rtsrinit).  Tears down the
+ * conn_t attached to the queue through rts_common_close(), then clears
+ * q_ptr on both queues.  Always returns 0.
+ */
+static int
+rts_stream_close(queue_t *q)
+{
+ conn_t *connp = Q_TO_CONN(q);
+
+ (void) rts_common_close(q, connp);
q->q_ptr = WR(q)->q_ptr = NULL;
return (0);
}
@@ -240,14 +252,12 @@ rts_close(queue_t *q)
*/
/* ARGSUSED */
static int
-rts_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
+rts_stream_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
{
- rts_t *rts;
conn_t *connp;
dev_t conn_dev;
- zoneid_t zoneid;
- netstack_t *ns;
rts_stack_t *rtss;
+ rts_t *rts;
/* If the stream is already open, return immediately. */
if (q->q_ptr != NULL)
@@ -256,40 +266,26 @@ rts_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
if (sflag == MODOPEN)
return (EINVAL);
- ns = netstack_find_by_cred(credp);
- ASSERT(ns != NULL);
- rtss = ns->netstack_rts;
- ASSERT(rtss != NULL);
-
- /*
- * For exclusive stacks we set the zoneid to zero
- * to make RTS operate as if in the global zone.
- */
- if (ns->netstack_stackid != GLOBAL_NETSTACKID)
- zoneid = GLOBAL_ZONEID;
- else
- zoneid = crgetzoneid(credp);
/*
* Since RTS is not used so heavily, allocating from the small
* arena should be sufficient.
*/
if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) {
- netstack_rele(ns);
return (EBUSY);
}
+
+ connp = rts_open(flag, credp);
+ ASSERT(connp != NULL);
+
+
*devp = makedevice(getemajor(*devp), (minor_t)conn_dev);
- connp = ipcl_conn_create(IPCL_RTSCONN, KM_SLEEP, ns);
- connp->conn_dev = conn_dev;
- connp->conn_minor_arena = ip_minor_arena_sa;
rts = connp->conn_rts;
- /*
- * ipcl_conn_create did a netstack_hold. Undo the hold that was
- * done by netstack_find_by_cred()
- */
- netstack_rele(ns);
+ rw_enter(&rts->rts_rwlock, RW_WRITER);
+ connp->conn_dev = conn_dev;
+ connp->conn_minor_arena = ip_minor_arena_sa;
/*
* Initialize the rts_t structure for this stream.
@@ -299,25 +295,12 @@ rts_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
connp->conn_rq = q;
connp->conn_wq = WR(q);
- rw_enter(&rts->rts_rwlock, RW_WRITER);
- ASSERT(connp->conn_rts == rts);
- ASSERT(rts->rts_connp == connp);
-
- /* Set the initial state of the stream and the privilege status. */
- rts->rts_state = TS_UNBND;
- connp->conn_zoneid = zoneid;
-
- connp->conn_ulp_labeled = is_system_labeled();
-
- rts->rts_rtss = rtss;
-
+ rtss = rts->rts_rtss;
q->q_hiwat = rtss->rtss_recv_hiwat;
WR(q)->q_hiwat = rtss->rtss_xmit_hiwat;
WR(q)->q_lowat = rtss->rtss_xmit_lowat;
- connp->conn_recv = rts_input;
- crhold(credp);
- connp->conn_cred = credp;
+
mutex_enter(&connp->conn_lock);
connp->conn_state_flags &= ~CONN_INCIPIENT;
@@ -325,7 +308,6 @@ rts_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
qprocson(q);
rw_exit(&rts->rts_rwlock);
-
/*
* Indicate the down IP module that this is a routing socket
* client by sending an RTS IOCTL without any user data. Although
@@ -335,7 +317,67 @@ rts_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
ip_rts_register(connp);
return (0);
+}
+
+/*
+ * Common open path shared by rts_stream_open() and rts_create().
+ *
+ * Allocates a routing-socket conn_t in the netstack selected by credp,
+ * records the zone, the per-stack rts state, the default buffer limits,
+ * the receive function and a held reference on credp, and puts the
+ * endpoint into TS_DATA_XFER. The conn_t is returned with rts_rwlock
+ * released; both callers in this file clear CONN_INCIPIENT afterwards.
+ *
+ * flag is currently unused. The allocation uses KM_SLEEP and the callers
+ * ASSERT that the result is non-NULL.
+ */
+/* ARGSUSED */
+static conn_t *
+rts_open(int flag, cred_t *credp)
+{
+ netstack_t *ns;
+ rts_stack_t *rtss;
+ rts_t *rts;
+ conn_t *connp;
+ zoneid_t zoneid;
+
+ ns = netstack_find_by_cred(credp);
+ ASSERT(ns != NULL);
+ rtss = ns->netstack_rts;
+ ASSERT(rtss != NULL);
+
+ /*
+ * For exclusive stacks we set the zoneid to zero
+ * to make RTS operate as if in the global zone.
+ */
+ if (ns->netstack_stackid != GLOBAL_NETSTACKID)
+ zoneid = GLOBAL_ZONEID;
+ else
+ zoneid = crgetzoneid(credp);
+
+ connp = ipcl_conn_create(IPCL_RTSCONN, KM_SLEEP, ns);
+ rts = connp->conn_rts;
+
+ /*
+ * ipcl_conn_create did a netstack_hold. Undo the hold that was
+ * done by netstack_find_by_cred()
+ */
+ netstack_rele(ns);
+
+ rw_enter(&rts->rts_rwlock, RW_WRITER);
+ ASSERT(connp->conn_rts == rts);
+ ASSERT(rts->rts_connp == connp);
+
+ connp->conn_zoneid = zoneid;
+ connp->conn_flow_cntrld = B_FALSE;
+ connp->conn_ulp_labeled = is_system_labeled();
+
+ rts->rts_rtss = rtss;
+ /*
+ * Seed both buffer limits from the stack defaults. rts_opt_get()
+ * reports SO_SNDBUF and SO_RCVBUF from these fields, so the receive
+ * limit must be set here as well or a STREAMS routing socket would
+ * report an unseeded SO_RCVBUF until the option is explicitly set.
+ */
+ rts->rts_xmit_hiwat = rtss->rtss_xmit_hiwat;
+ rts->rts_recv_hiwat = rtss->rtss_recv_hiwat;
+
+ connp->conn_recv = rts_input;
+ crhold(credp);
+ connp->conn_cred = credp;
+
+ /*
+ * rts sockets start out as bound and connected
+ * For streams based sockets, socket state is set to
+ * SS_ISBOUND | SS_ISCONNECTED in so_strinit.
+ */
+ rts->rts_state = TS_DATA_XFER;
+ rw_exit(&rts->rts_rwlock);
+
+ return (connp);
}
/*
@@ -362,7 +404,7 @@ rts_ok_ack(queue_t *q, mblk_t *mp)
* This routine is called by rts_wput to handle T_UNBIND_REQ messages.
*/
static void
-rts_unbind(queue_t *q, mblk_t *mp)
+rts_tpi_unbind(queue_t *q, mblk_t *mp)
{
conn_t *connp = Q_TO_CONN(q);
rts_t *rts = connp->conn_rts;
@@ -383,7 +425,7 @@ rts_unbind(queue_t *q, mblk_t *mp)
* O_T_BIND_REQ and T_BIND_REQ semantics.
*/
static void
-rts_bind(queue_t *q, mblk_t *mp)
+rts_tpi_bind(queue_t *q, mblk_t *mp)
{
conn_t *connp = Q_TO_CONN(q);
rts_t *rts = connp->conn_rts;
@@ -392,13 +434,13 @@ rts_bind(queue_t *q, mblk_t *mp)
if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
- "rts_bind: bad data, %d", rts->rts_state);
+ "rts_tpi_bind: bad data, %d", rts->rts_state);
rts_err_ack(q, mp, TBADADDR, 0);
return;
}
if (rts->rts_state != TS_UNBND) {
(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
- "rts_bind: bad state, %d", rts->rts_state);
+ "rts_tpi_bind: bad state, %d", rts->rts_state);
rts_err_ack(q, mp, TOUTSTATE, 0);
return;
}
@@ -415,7 +457,7 @@ rts_bind(queue_t *q, mblk_t *mp)
tbr = (struct T_bind_req *)mp->b_rptr;
if (tbr->ADDR_length != 0) {
(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
- "rts_bind: bad ADDR_length %d", tbr->ADDR_length);
+ "rts_tpi_bind: bad ADDR_length %d", tbr->ADDR_length);
rts_err_ack(q, mp, TBADADDR, 0);
return;
}
@@ -498,16 +540,14 @@ rts_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
return (-1);
}
-/*
- * This routine retrieves the current status of socket options.
- * It returns the size of the option retrieved.
- */
-int
-rts_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
+
+static int
+rts_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
{
- int *i1 = (int *)ptr;
- conn_t *connp = Q_TO_CONN(q);
rts_t *rts = connp->conn_rts;
+ int *i1 = (int *)ptr;
+
+ ASSERT(RW_READ_HELD(&rts->rts_rwlock));
switch (level) {
case SOL_SOCKET:
@@ -543,12 +583,12 @@ rts_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
* but changing them should do nothing.
*/
case SO_SNDBUF:
- ASSERT(q->q_hiwat <= INT_MAX);
- *i1 = (int)(q->q_hiwat);
+ ASSERT(rts->rts_xmit_hiwat <= INT_MAX);
+ *i1 = (int)(rts->rts_xmit_hiwat);
break;
case SO_RCVBUF:
- ASSERT(q->q_hiwat <= INT_MAX);
- *i1 = (int)(RD(q)->q_hiwat);
+ ASSERT(rts->rts_recv_hiwat <= INT_MAX);
+ *i1 = (int)(rts->rts_recv_hiwat);
break;
case SO_DOMAIN:
*i1 = PF_ROUTE;
@@ -563,60 +603,17 @@ rts_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
return ((int)sizeof (int));
}
-
-/*
- * This routine sets socket options.
- */
-/*ARGSUSED*/
-int
-rts_opt_set(queue_t *q, uint_t optset_context, int level,
- int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
- uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk)
+/* ARGSUSED */
+static int
+rts_do_opt_set(conn_t *connp, int level, int name, uint_t inlen,
+ uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, cred_t *cr,
+ void *thisdg_attrs, boolean_t checkonly)
{
int *i1 = (int *)invalp;
- conn_t *connp = Q_TO_CONN(q);
rts_t *rts = connp->conn_rts;
- boolean_t checkonly;
rts_stack_t *rtss = rts->rts_rtss;
- switch (optset_context) {
- case SETFN_OPTCOM_CHECKONLY:
- checkonly = B_TRUE;
- /*
- * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
- * inlen != 0 implies value supplied and
- * we have to "pretend" to set it.
- * inlen == 0 implies that there is no
- * value part in T_CHECK request and just validation
- * done elsewhere should be enough, we just return here.
- */
- if (inlen == 0) {
- *outlenp = 0;
- return (0);
- }
- break;
- case SETFN_OPTCOM_NEGOTIATE:
- checkonly = B_FALSE;
- break;
- case SETFN_UD_NEGOTIATE:
- case SETFN_CONN_NEGOTIATE:
- checkonly = B_FALSE;
- /*
- * Negotiating local and "association-related" options
- * through T_UNITDATA_REQ or T_CONN_{REQ,CON}
- * Not allowed in this module.
- */
- return (EINVAL);
- default:
- /*
- * We should never get here
- */
- *outlenp = 0;
- return (EINVAL);
- }
-
- ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
- (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
+ ASSERT(RW_WRITE_HELD(&rts->rts_rwlock));
/*
* For rts, we should have no ancillary data sent down
@@ -680,7 +677,9 @@ rts_opt_set(queue_t *q, uint_t optset_context, int level,
return (ENOBUFS);
}
if (!checkonly) {
- q->q_hiwat = *i1;
+ rts->rts_xmit_hiwat = *i1;
+ if (!IPCL_IS_NONSTR(connp))
+ connp->conn_wq->q_hiwat = *i1;
}
break; /* goto sizeof (int) option return */
case SO_RCVBUF:
@@ -689,9 +688,13 @@ rts_opt_set(queue_t *q, uint_t optset_context, int level,
return (ENOBUFS);
}
if (!checkonly) {
- RD(q)->q_hiwat = *i1;
- (void) mi_set_sth_hiwat(RD(q), *i1);
+ rts->rts_recv_hiwat = *i1;
+ rw_exit(&rts->rts_rwlock);
+ (void) proto_set_rx_hiwat(connp->conn_rq, connp,
+ *i1);
+ rw_enter(&rts->rts_rwlock, RW_WRITER);
}
+
break; /* goto sizeof (int) option return */
default:
*outlenp = 0;
@@ -705,11 +708,105 @@ rts_opt_set(queue_t *q, uint_t optset_context, int level,
/*
* Common case of return from an option that is sizeof (int)
*/
- *(int *)outvalp = *i1;
+ if (invalp != outvalp) {
+ /* don't trust bcopy for identical src/dst */
+ (void) bcopy(invalp, outvalp, inlen);
+ }
*outlenp = (t_uscalar_t)sizeof (int);
return (0);
}
+/*
+ * Validate the TPI option-management context and then apply the option
+ * through rts_do_opt_set().
+ *
+ * An optset_context of 0 skips the context checks altogether and performs
+ * a real (non check-only) set. SETFN_OPTCOM_CHECKONLY turns the request
+ * into a dry run, and returns 0 immediately when no value was supplied
+ * (inlen == 0). SETFN_UD_NEGOTIATE and SETFN_CONN_NEGOTIATE are rejected
+ * with EINVAL, as is any unknown context.
+ *
+ * rts_do_opt_set() asserts that rts_rwlock is held as writer, so callers
+ * must take it before calling this function.
+ */
+static int
+rts_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
+ uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
+ void *thisdg_attrs, cred_t *cr)
+{
+ boolean_t checkonly = B_FALSE;
+
+ if (optset_context) {
+ switch (optset_context) {
+ case SETFN_OPTCOM_CHECKONLY:
+ checkonly = B_TRUE;
+ /*
+ * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
+ * inlen != 0 implies value supplied and
+ * we have to "pretend" to set it.
+ * inlen == 0 implies that there is no value part
+ * in T_CHECK request and just validation
+ * done elsewhere should be enough, we just return here.
+ */
+ if (inlen == 0) {
+ *outlenp = 0;
+ return (0);
+ }
+ break;
+ case SETFN_OPTCOM_NEGOTIATE:
+ checkonly = B_FALSE;
+ break;
+ case SETFN_UD_NEGOTIATE:
+ case SETFN_CONN_NEGOTIATE:
+ checkonly = B_FALSE;
+ /*
+ * Negotiating local and "association-related" options
+ * through T_UNITDATA_REQ or T_CONN_{REQ,CON}
+ * Not allowed in this module.
+ */
+ return (EINVAL);
+ default:
+ /*
+ * We should never get here
+ */
+ *outlenp = 0;
+ return (EINVAL);
+ }
+
+ /* A check-only request reaching this point carries a value. */
+ ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
+ (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
+
+ }
+ return (rts_do_opt_set(connp, level, name, inlen, invalp, outlenp,
+ outvalp, cr, thisdg_attrs, checkonly));
+
+}
+
+/*
+ * This routine retrieves the current status of socket options.
+ * It returns the size of the option retrieved.
+ *
+ * TPI (queue based) wrapper around rts_opt_get(): it maps the queue to
+ * its rts_t/conn_t and holds rts_rwlock as reader for the duration of the
+ * lookup, which rts_opt_get() asserts. Note that the local named "err"
+ * carries whatever rts_opt_get() returned, not an errno.
+ */
+int
+rts_tpi_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
+{
+ rts_t *rts;
+ int err;
+
+ rts = Q_TO_RTS(q);
+ rw_enter(&rts->rts_rwlock, RW_READER);
+ err = rts_opt_get(Q_TO_CONN(q), level, name, ptr);
+ rw_exit(&rts->rts_rwlock);
+ return (err);
+}
+
+/*
+ * This routine sets socket options.
+ *
+ * TPI (queue based) wrapper around rts_opt_set(): it maps the queue to
+ * its conn_t and holds rts_rwlock as writer across the call. The mblk
+ * argument is not used. Returns the value produced by rts_opt_set().
+ */
+/*ARGSUSED*/
+int
+rts_tpi_opt_set(queue_t *q, uint_t optset_context, int level,
+ int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
+ uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk)
+{
+ conn_t *connp = Q_TO_CONN(q);
+ int error;
+ rts_t *rts = connp->conn_rts;
+
+
+ rw_enter(&rts->rts_rwlock, RW_WRITER);
+ error = rts_opt_set(connp, optset_context, level, name, inlen, invalp,
+ outlenp, outvalp, thisdg_attrs, cr);
+ rw_exit(&rts->rts_rwlock);
+ return (error);
+}
+
/*
* This routine retrieves the value of an ND variable in a rtsparam_t
* structure. It is called through nd_getset when a user reads the
@@ -803,7 +900,7 @@ rts_wrw(queue_t *q, struiod_t *dp)
rts->rts_error = EINTR;
goto err_ret;
}
- }
+ }
rts->rts_flag |= RTS_WRW_PENDING;
if (isuioq(q) && (error = struioget(q, mp, dp, 0))) {
@@ -954,10 +1051,10 @@ rts_wput_other(queue_t *q, mblk_t *mp)
switch (((union T_primitives *)rptr)->type) {
case T_BIND_REQ:
case O_T_BIND_REQ:
- rts_bind(q, mp);
+ rts_tpi_bind(q, mp);
return;
case T_UNBIND_REQ:
- rts_unbind(q, mp);
+ rts_tpi_unbind(q, mp);
return;
case T_CAPABILITY_REQ:
rts_capability_req(q, mp);
@@ -985,6 +1082,7 @@ rts_wput_other(queue_t *q, mblk_t *mp)
freemsg(mp);
(void) putnextctl1(RD(q), M_ERROR, EPROTO);
return;
+
default:
break;
}
@@ -1086,21 +1184,33 @@ rts_input(void *arg1, mblk_t *mp, void *arg2)
struct iocblk *iocp;
mblk_t *mp1;
struct T_data_ind *tdi;
+ int error;
switch (mp->b_datap->db_type) {
case M_IOCACK:
case M_IOCNAK:
iocp = (struct iocblk *)mp->b_rptr;
- if (rts->rts_flag & (RTS_WPUT_PENDING)) {
- rts->rts_flag &= ~RTS_WPUT_PENDING;
+ if (IPCL_IS_NONSTR(connp)) {
+ ASSERT(rts->rts_flag & (RTS_REQ_PENDING));
+ mutex_enter(&rts->rts_send_mutex);
+ rts->rts_flag &= ~RTS_REQ_INPROG;
rts->rts_error = iocp->ioc_error;
- /*
- * Tell rts_wvw/qwait that we are done.
- * Note: there is no qwait_wakeup() we can use.
- */
- qenable(connp->conn_rq);
+ cv_signal(&rts->rts_io_cv);
+ mutex_exit(&rts->rts_send_mutex);
freemsg(mp);
return;
+ } else {
+ if (rts->rts_flag & (RTS_WPUT_PENDING)) {
+ rts->rts_flag &= ~RTS_WPUT_PENDING;
+ rts->rts_error = iocp->ioc_error;
+ /*
+ * Tell rts_wvw/qwait that we are done.
+ * Note: there is no qwait_wakeup() we can use.
+ */
+ qenable(connp->conn_rq);
+ freemsg(mp);
+ return;
+ }
}
break;
case M_DATA:
@@ -1124,12 +1234,33 @@ rts_input(void *arg1, mblk_t *mp, void *arg2)
default:
break;
}
- putnext(connp->conn_rq, mp);
+
+ if (IPCL_IS_NONSTR(connp)) {
+ if ((*connp->conn_upcalls->su_recv)
+ (connp->conn_upper_handle, mp, msgdsize(mp), 0,
+ &error, NULL) < 0) {
+ ASSERT(error == ENOSPC);
+ /*
+ * Re-check, while holding the lock, that
+ * we are really out of recv space.
+ */
+ mutex_enter(&rts->rts_recv_mutex);
+ if ((*connp->conn_upcalls->su_recv)
+ (connp->conn_upper_handle, NULL, 0, 0,
+ &error, NULL) < 0) {
+ ASSERT(error == ENOSPC);
+ connp->conn_flow_cntrld = B_TRUE;
+ }
+ mutex_exit(&rts->rts_recv_mutex);
+ }
+ } else {
+ putnext(connp->conn_rq, mp);
+ }
}
void
-rts_ddi_init(void)
+rts_ddi_g_init(void)
{
rts_max_optsize = optcom_max_optsize(rts_opt_obj.odb_opt_des_arr,
rts_opt_obj.odb_opt_arr_cnt);
@@ -1143,11 +1274,13 @@ rts_ddi_init(void)
}
void
-rts_ddi_destroy(void)
+rts_ddi_g_destroy(void)
{
netstack_unregister(NS_RTS);
}
+#define INET_NAME "ip"
+
/*
* Initialize the RTS stack instance.
*/
@@ -1157,6 +1290,8 @@ rts_stack_init(netstackid_t stackid, netstack_t *ns)
{
rts_stack_t *rtss;
rtsparam_t *pa;
+ int error = 0;
+ major_t major;
rtss = (rts_stack_t *)kmem_zalloc(sizeof (*rtss), KM_SLEEP);
rtss->rtss_netstack = ns;
@@ -1167,6 +1302,10 @@ rts_stack_init(netstackid_t stackid, netstack_t *ns)
(void) rts_param_register(&rtss->rtss_g_nd,
rtss->rtss_params, A_CNT(lcl_param_arr));
+
+ major = mod_name_to_major(INET_NAME);
+ error = ldi_ident_from_major(major, &rtss->rtss_ldi_ident);
+ ASSERT(error == 0);
return (rtss);
}
@@ -1182,5 +1321,411 @@ rts_stack_fini(netstackid_t stackid, void *arg)
nd_free(&rtss->rtss_g_nd);
kmem_free(rtss->rtss_params, sizeof (lcl_param_arr));
rtss->rtss_params = NULL;
+ ldi_ident_release(rtss->rtss_ldi_ident);
kmem_free(rtss, sizeof (*rtss));
}
+
+/*
+ * Socket downcall for accept(). Not supported on routing sockets: every
+ * argument is ignored and EINVAL is returned unconditionally.
+ */
+/* ARGSUSED */
+int
+rts_accept(sock_lower_handle_t lproto_handle,
+ sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
+ cred_t *cr)
+{
+ return (EINVAL);
+}
+
+/*
+ * Socket downcall for bind(). Every argument is ignored and EINVAL is
+ * returned unconditionally, because an explicit bind is treated as a
+ * rebind (rts_open() already places the endpoint in TS_DATA_XFER).
+ */
+/* ARGSUSED */
+static int
+rts_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
+ socklen_t len, cred_t *cr)
+{
+ /*
+ * rebind not allowed
+ */
+ return (EINVAL);
+}
+
+/*
+ * Socket downcall for listen(). Not supported on routing sockets: every
+ * argument is ignored and EINVAL is returned unconditionally.
+ */
+/* ARGSUSED */
+int
+rts_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
+{
+ return (EINVAL);
+}
+
+/*
+ * Socket downcall for connect(). The address arguments are ignored; the
+ * connection id is set to 0 and EISCONN is always returned.
+ */
+/* ARGSUSED */
+int
+rts_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
+ socklen_t len, sock_connid_t *id, cred_t *cr)
+{
+ /*
+ * rts sockets start out as bound and connected
+ */
+ *id = 0;
+ return (EISCONN);
+}
+
+/*
+ * Socket downcall for getpeername(). Reports a zeroed sockaddr whose
+ * family is AF_ROUTE and sets *addrlen to sizeof (struct sockaddr).
+ * Always returns 0.
+ *
+ * NOTE(review): the incoming *addrlen is not consulted before
+ * sizeof (struct sockaddr) bytes are written to addr; presumably the
+ * caller guarantees the buffer is at least that large -- confirm.
+ */
+/* ARGSUSED */
+int
+rts_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr,
+ socklen_t *addrlen, cred_t *cr)
+{
+ conn_t *connp = (conn_t *)proto_handle;
+ rts_t *rts = connp->conn_rts;
+
+ ASSERT(rts != NULL);
+
+ bzero(addr, sizeof (struct sockaddr));
+ addr->sa_family = AF_ROUTE;
+ *addrlen = sizeof (struct sockaddr);
+
+ return (0);
+}
+
+/*
+ * Socket downcall for getsockname(). Not supported on routing sockets:
+ * every argument is ignored and EOPNOTSUPP is returned unconditionally.
+ */
+/* ARGSUSED */
+int
+rts_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr,
+ socklen_t *addrlen, cred_t *cr)
+{
+ return (EOPNOTSUPP);
+}
+
+/*
+ * Socket downcall for getsockopt().
+ *
+ * The request is first validated against the rts option table with
+ * proto_opt_check(); a negative result from that check is converted with
+ * proto_tlitosyserr() before being returned. The option is then read
+ * into a temporary buffer of max_optbuf_len bytes by rts_opt_get() under
+ * rts_rwlock held as reader.
+ *
+ * If rts_opt_get() returns a negative length the request is handed to
+ * ip_get_options() instead, writing directly into the caller's buffer.
+ * Otherwise at most *optlen bytes are copied out and *optlen is updated
+ * to the number of bytes copied.
+ *
+ * Returns 0 on success or an errno value.
+ */
+static int
+rts_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
+ void *optvalp, socklen_t *optlen, cred_t *cr)
+{
+ conn_t *connp = (conn_t *)proto_handle;
+ rts_t *rts = connp->conn_rts;
+ int error;
+ t_uscalar_t max_optbuf_len;
+ void *optvalp_buf;
+ int len;
+
+ error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
+ rts_opt_obj.odb_opt_des_arr,
+ rts_opt_obj.odb_opt_arr_cnt,
+ rts_opt_obj.odb_topmost_tpiprovider,
+ B_FALSE, B_TRUE, cr);
+ if (error != 0) {
+ if (error < 0)
+ error = proto_tlitosyserr(-error);
+ return (error);
+ }
+
+ optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
+ rw_enter(&rts->rts_rwlock, RW_READER);
+ len = rts_opt_get(connp, level, option_name, optvalp_buf);
+ rw_exit(&rts->rts_rwlock);
+
+ if (len < 0) {
+ /*
+ * Pass on to IP
+ */
+ error = ip_get_options(connp, level, option_name,
+ optvalp, optlen, cr);
+ } else {
+ /*
+ * update optlen and copy option value
+ */
+ t_uscalar_t size = MIN(len, *optlen);
+ bcopy(optvalp_buf, optvalp, size);
+ bcopy(&size, optlen, sizeof (size));
+ error = 0;
+ }
+
+ /* The scratch buffer is released on both paths. */
+ kmem_free(optvalp_buf, max_optbuf_len);
+ return (error);
+}
+
+/*
+ * Socket downcall for setsockopt().
+ *
+ * The request is validated against the rts option table with
+ * proto_opt_check(); a negative result from that check is converted with
+ * proto_tlitosyserr() before being returned. The option is then applied
+ * through rts_opt_set() in SETFN_OPTCOM_NEGOTIATE context with rts_rwlock
+ * held as writer.
+ *
+ * The caller's buffer is passed as both the input and the output value
+ * (with const cast away) and the local optlen doubles as the output
+ * length, so nothing is reported back to the caller beyond the error.
+ *
+ * Returns 0 on success or an errno value.
+ */
+static int
+rts_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
+ const void *optvalp, socklen_t optlen, cred_t *cr)
+{
+ conn_t *connp = (conn_t *)proto_handle;
+ rts_t *rts = connp->conn_rts;
+ int error;
+
+ error = proto_opt_check(level, option_name, optlen, NULL,
+ rts_opt_obj.odb_opt_des_arr,
+ rts_opt_obj.odb_opt_arr_cnt,
+ rts_opt_obj.odb_topmost_tpiprovider,
+ B_TRUE, B_FALSE, cr);
+
+ if (error != 0) {
+ if (error < 0)
+ error = proto_tlitosyserr(-error);
+ return (error);
+ }
+
+ rw_enter(&rts->rts_rwlock, RW_WRITER);
+ error = rts_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name,
+ optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp,
+ NULL, cr);
+ rw_exit(&rts->rts_rwlock);
+
+ ASSERT(error >= 0);
+
+ return (error);
+}
+
+/*
+ * Socket downcall for sending one routing message.
+ *
+ * The message must be M_DATA and at least sizeof (rt_msghdr_t) long
+ * (pulled up if necessary). Its rtm_pid is stamped with the caller's
+ * pid, it is wrapped in an ioctl message by rts_ioctl_alloc(), and the
+ * wrapper is handed to ip_rts_request_common().
+ *
+ * Only one request may be outstanding per socket: RTS_REQ_PENDING
+ * serializes senders on rts_send_cv, and RTS_REQ_INPROG is cleared by
+ * rts_input() (which signals rts_io_cv) when the reply arrives.
+ *
+ * Returns 0 or an errno value:
+ * EINVAL the message is shorter than a routing header,
+ * ENOMEM the ioctl wrapper could not be allocated,
+ * EINTR the wait for a previous request was interrupted,
+ * otherwise the result of the request itself.
+ * The message is consumed on every path. msg and cr are unused.
+ */
+/* ARGSUSED */
+static int
+rts_send(sock_lower_handle_t proto_handle, mblk_t *mp,
+ struct nmsghdr *msg, cred_t *cr)
+{
+ mblk_t *mp1;
+ conn_t *connp = (conn_t *)proto_handle;
+ rts_t *rts = connp->conn_rts;
+ rt_msghdr_t *rtm;
+ int error;
+
+ ASSERT(DB_TYPE(mp) == M_DATA);
+ /*
+ * The semantics of the routing socket is such that the rtm_pid
+ * field is automatically filled in during requests with the
+ * current process' pid. We do this here (where we still have
+ * user context) after checking we have at least a message the
+ * size of a routing message header.
+ */
+ if ((mp->b_wptr - mp->b_rptr) < sizeof (rt_msghdr_t)) {
+ if (!pullupmsg(mp, sizeof (rt_msghdr_t))) {
+ rts->rts_error = EINVAL;
+ freemsg(mp);
+ return (rts->rts_error);
+ }
+ }
+ rtm = (rt_msghdr_t *)mp->b_rptr;
+ rtm->rtm_pid = curproc->p_pid;
+
+ mp1 = rts_ioctl_alloc(mp, DB_CRED(mp));
+ if (mp1 == NULL) {
+ ASSERT(rts != NULL);
+ freemsg(mp);
+ return (ENOMEM);
+ }
+
+ /*
+ * Allow only one outstanding request(ioctl) at any given time
+ */
+ mutex_enter(&rts->rts_send_mutex);
+ while (rts->rts_flag & RTS_REQ_PENDING) {
+ int ret;
+
+ ret = cv_wait_sig(&rts->rts_send_cv, &rts->rts_send_mutex);
+ if (ret <= 0) {
+ mutex_exit(&rts->rts_send_mutex);
+ /*
+ * Free the ioctl wrapper, not just the data: freeing
+ * only mp would leak mp1.
+ * NOTE(review): assumes rts_ioctl_alloc() links mp
+ * beneath mp1 (mp1 is the only message passed on to
+ * IP below) -- confirm.
+ */
+ freemsg(mp1);
+ return (EINTR);
+ }
+ }
+
+ rts->rts_flag |= RTS_REQ_PENDING;
+
+ rts->rts_flag |= RTS_REQ_INPROG;
+
+ mutex_exit(&rts->rts_send_mutex);
+
+ /*
+ * NOTE(review): no matching release of this reference is visible in
+ * this function; presumably the request path drops it -- confirm.
+ */
+ CONN_INC_REF(connp);
+
+ error = ip_rts_request_common(rts->rts_connp->conn_wq, mp1, connp,
+ DB_CREDDEF(mp, connp->conn_cred));
+
+ mutex_enter(&rts->rts_send_mutex);
+ if (error == EINPROGRESS) {
+ ASSERT(rts->rts_flag & RTS_REQ_INPROG);
+ /*
+ * Once the request has been issued we wait for
+ * completion. The flag is re-tested after every wakeup
+ * so that a wakeup which did not come from rts_input()
+ * cannot end the wait early, and the final status is
+ * always taken from rts_error rather than leaving
+ * EINPROGRESS in place.
+ */
+ while (rts->rts_flag & RTS_REQ_INPROG)
+ cv_wait(&rts->rts_io_cv, &rts->rts_send_mutex);
+ error = rts->rts_error;
+ }
+
+ ASSERT((error != 0) || !(rts->rts_flag & RTS_REQ_INPROG));
+ ASSERT(MUTEX_HELD(&rts->rts_send_mutex));
+
+ /* Let the next sender waiting on rts_send_cv proceed. */
+ rts->rts_flag &= ~(RTS_REQ_PENDING | RTS_REQ_INPROG);
+ cv_signal(&rts->rts_send_cv);
+ mutex_exit(&rts->rts_send_mutex);
+ return (error);
+}
+
+/*
+ * Protocol create entry point for non-STREAMS routing sockets.
+ *
+ * Only family AF_ROUTE with type SOCK_RAW and proto 0, AF_INET or
+ * AF_INET6 is accepted; anything else fails with EPROTONOSUPPORT.
+ * A conn_t is obtained from rts_open(), marked IPCL_NONSTR, given the
+ * per-stack default watermarks and an IP helper stream, and has
+ * CONN_INCIPIENT cleared.
+ *
+ * On success returns the conn_t as the lower handle, sets *errorp to 0,
+ * *smodep to SM_ATOMIC and *sock_downcalls to the rts downcall vector.
+ * On failure returns NULL with *errorp set; if the helper stream could
+ * not be created the conn_t is torn down again through rts_close().
+ */
+/* ARGSUSED */
+sock_lower_handle_t
+rts_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
+ uint_t *smodep, int *errorp, int flags, cred_t *credp)
+{
+ conn_t *connp;
+ rts_t *rts;
+ rts_stack_t *rtss;
+
+ if (family != AF_ROUTE || type != SOCK_RAW ||
+ (proto != 0 && proto != AF_INET && proto != AF_INET6)) {
+ *errorp = EPROTONOSUPPORT;
+ return (NULL);
+ }
+
+ connp = rts_open(flags, credp);
+ ASSERT(connp != NULL);
+ connp->conn_flags |= IPCL_NONSTR;
+
+ rts = connp->conn_rts;
+ rtss = rts->rts_rtss;
+
+ rts->rts_xmit_hiwat = rtss->rtss_xmit_hiwat;
+ rts->rts_xmit_lowat = rtss->rtss_xmit_lowat;
+ rts->rts_recv_hiwat = rtss->rtss_recv_hiwat;
+ rts->rts_recv_lowat = rts_mod_info.mi_lowat;
+
+ ASSERT(rtss->rtss_ldi_ident != NULL);
+
+ *errorp = ip_create_helper_stream(connp, rtss->rtss_ldi_ident);
+ if (*errorp != 0) {
+#ifdef DEBUG
+ cmn_err(CE_CONT, "rts_create: create of IP helper stream"
+ " failed\n");
+#endif
+ (void) rts_close((sock_lower_handle_t)connp, 0, credp);
+ return (NULL);
+ }
+
+ mutex_enter(&connp->conn_lock);
+ connp->conn_state_flags &= ~CONN_INCIPIENT;
+ mutex_exit(&connp->conn_lock);
+
+ *errorp = 0;
+ *smodep = SM_ATOMIC;
+ *sock_downcalls = &sock_rts_downcalls;
+ return ((sock_lower_handle_t)connp);
+}
+
+/*
+ * Socket downcall that attaches the upper (socket layer) handle and
+ * upcall vector to the conn_t created by rts_create().
+ *
+ * After recording the upcalls it pushes the protocol properties
+ * (write offset, receive high/low watermarks, max block and packet size
+ * limits) to the socket layer, reports the socket as connected, and
+ * registers the conn_t with IP as a routing socket client.
+ * flags and cr are unused.
+ */
+/* ARGSUSED */
+void
+rts_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle,
+ sock_upcalls_t *sock_upcalls, int flags, cred_t *cr)
+{
+ conn_t *connp = (conn_t *)proto_handle;
+ rts_t *rts = connp->conn_rts;
+ rts_stack_t *rtss = rts->rts_rtss;
+ struct sock_proto_props sopp;
+
+ connp->conn_upcalls = sock_upcalls;
+ connp->conn_upper_handle = sock_handle;
+
+ sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
+ SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ;
+ sopp.sopp_wroff = 0;
+ sopp.sopp_rxhiwat = rtss->rtss_recv_hiwat;
+ sopp.sopp_rxlowat = rts_mod_info.mi_lowat;
+ sopp.sopp_maxblk = INFPSZ;
+ sopp.sopp_maxpsz = rts_mod_info.mi_maxpsz;
+ /* A module minimum packet size of 1 is reported as 0. */
+ sopp.sopp_minpsz = (rts_mod_info.mi_minpsz == 1) ? 0 :
+ rts_mod_info.mi_minpsz;
+
+ (*connp->conn_upcalls->su_set_proto_props)
+ (connp->conn_upper_handle, &sopp);
+
+ /*
+ * We treat it as already connected for routing socket.
+ */
+ (*connp->conn_upcalls->su_connected)
+ (connp->conn_upper_handle, 0, NULL, -1);
+
+ /*
+ * Indicate the down IP module that this is a routing socket
+ * client by sending an RTS IOCTL without any user data. Although
+ * this is just a notification message (without any real routing
+ * request), we pass in any credential for correctness sake.
+ */
+ ip_rts_register(connp);
+}
+
+/*
+ * Socket downcall for close(). Hands the conn_t to rts_common_close()
+ * with a NULL queue, since a non-STREAMS socket has no queue of its own
+ * to pass. flags and cr are unused. Returns the value produced by
+ * rts_common_close().
+ */
+/* ARGSUSED */
+int
+rts_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
+{
+ conn_t *connp = (conn_t *)proto_handle;
+
+ ASSERT(connp != NULL && IPCL_IS_RTS(connp));
+ return (rts_common_close(NULL, connp));
+}
+
+/*
+ * Socket downcall for shutdown(). Nothing is changed in the protocol
+ * state; the socket layer is simply told, through the su_opctl upcall,
+ * which directions are shut down. Any "how" other than SHUT_RD shuts the
+ * send side and any "how" other than SHUT_WR shuts the receive side, so
+ * a value that is neither (e.g. SHUT_RDWR) shuts both.
+ * cr is unused. Always returns 0.
+ */
+/* ARGSUSED */
+int
+rts_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
+{
+ conn_t *connp = (conn_t *)proto_handle;
+
+ /* shut down the send side */
+ if (how != SHUT_RD)
+ (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
+ SOCK_OPCTL_SHUT_SEND, 0);
+ /* shut down the recv side */
+ if (how != SHUT_WR)
+ (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
+ SOCK_OPCTL_SHUT_RECV, 0);
+ return (0);
+}
+
+/*
+ * Socket downcall that clears the receive-side flow control indication.
+ * conn_flow_cntrld is reset under rts_recv_mutex, the same lock
+ * rts_input() holds when it sets the flag.
+ */
+void
+rts_clr_flowctrl(sock_lower_handle_t proto_handle)
+{
+ conn_t *connp = (conn_t *)proto_handle;
+ rts_t *rts = connp->conn_rts;
+
+ mutex_enter(&rts->rts_recv_mutex);
+ connp->conn_flow_cntrld = B_FALSE;
+ mutex_exit(&rts->rts_recv_mutex);
+}
+
+/*
+ * Socket downcall for ioctl() on a non-STREAMS routing socket.
+ *
+ * ND_SET, ND_GET, TI_GETPEERNAME and TI_GETMYNAME are rejected with
+ * EINVAL (and logged on DEBUG kernels). Every other command is
+ * forwarded to IP through the conn_t's helper stream with ldi_ioctl(),
+ * and its result is returned.
+ */
+int
+rts_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
+ int mode, int32_t *rvalp, cred_t *cr)
+{
+ conn_t *connp = (conn_t *)proto_handle;
+ int error;
+
+ switch (cmd) {
+ case ND_SET:
+ case ND_GET:
+ case TI_GETPEERNAME:
+ case TI_GETMYNAME:
+#ifdef DEBUG
+ cmn_err(CE_CONT, "rts_ioctl cmd 0x%x on non streams"
+ " socket\n", cmd);
+#endif
+ error = EINVAL;
+ break;
+ default:
+ /*
+ * Pass on to IP using helper stream
+ */
+ error = ldi_ioctl(
+ connp->conn_helper_info->ip_helper_stream_handle,
+ cmd, arg, mode, cr, rvalp);
+ break;
+ }
+
+ return (error);
+}
+
+/*
+ * Downcall vector handed to the socket layer by rts_create(). The
+ * initializer is positional; the three NULL entries are operations for
+ * which routing sockets supply no handler.
+ * NOTE(review): the entry order must match the member order of
+ * sock_downcalls_t, which is declared outside this file -- confirm when
+ * adding or moving entries.
+ */
+sock_downcalls_t sock_rts_downcalls = {
+ rts_activate,
+ rts_accept,
+ rts_bind,
+ rts_listen,
+ rts_connect,
+ rts_getpeername,
+ rts_getsockname,
+ rts_getsockopt,
+ rts_setsockopt,
+ rts_send,
+ NULL,
+ NULL,
+ NULL,
+ rts_shutdown,
+ rts_clr_flowctrl,
+ rts_ioctl,
+ rts_close
+};
diff --git a/usr/src/uts/common/inet/ip/rts_opt_data.c b/usr/src/uts/common/inet/ip/rts_opt_data.c
index f815cf086c..bac0eabdc4 100644
--- a/usr/src/uts/common/inet/ip/rts_opt_data.c
+++ b/usr/src/uts/common/inet/ip/rts_opt_data.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/stream.h>
#define _SUN_TPI_VERSION 2
@@ -40,14 +38,7 @@
#include <netinet/tcp.h>
#include <netinet/ip_mroute.h>
#include <inet/optcom.h>
-
-extern int rts_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name,
- uchar_t *ptr);
-extern int rts_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name,
- uchar_t *ptr);
-extern int rts_opt_set(queue_t *q, uint_t optset_context, int level,
- int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
- uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk);
+#include <inet/rts_impl.h>
/*
* Table of all known options handled on a RTS protocol stack.
@@ -102,8 +93,8 @@ uint_t rts_max_optsize; /* initialized in _init() */
optdb_obj_t rts_opt_obj = {
rts_opt_default, /* RTS default value function pointer */
- rts_opt_get, /* RTS get function pointer */
- rts_opt_set, /* RTS set function pointer */
+ rts_tpi_opt_get, /* RTS get function pointer */
+ rts_tpi_opt_set, /* RTS set function pointer */
B_TRUE, /* RTS is tpi provider */
RTS_OPT_ARR_CNT, /* RTS option database count of entries */
rts_opt_arr, /* RTS option database */
diff --git a/usr/src/uts/common/inet/ip/rtsddi.c b/usr/src/uts/common/inet/ip/rtsddi.c
index 27704da503..482c53ab5c 100644
--- a/usr/src/uts/common/inet/ip/rtsddi.c
+++ b/usr/src/uts/common/inet/ip/rtsddi.c
@@ -28,10 +28,22 @@
#include <sys/modctl.h>
#include <inet/common.h>
#include <inet/ip.h>
+#include <inet/rts_impl.h>
+#include <sys/strsubr.h>
+#include <sys/socketvar.h>
+
+#include <netinet/in.h>
+#include <netinet/ip6.h>
+
+#include <inet/common.h>
+#include <inet/ip.h>
+
#define INET_NAME "rts"
#define INET_DEVSTRTAB rtsinfo
#define INET_DEVDESC "PF_ROUTE socket STREAMS driver"
+#define INET_SOCKDESC "PF_ROUTE socket module"
+#define INET_SOCK_PROTO_CREATE_FUNC (*rts_create)
#define INET_DEVMINOR 0
#define INET_DEVMTFLAGS (D_MP|D_MTQPAIR|D_SYNCSTR)
diff --git a/usr/src/uts/common/inet/ip/spdsock.c b/usr/src/uts/common/inet/ip/spdsock.c
index dc2e113505..749db40ee6 100644
--- a/usr/src/uts/common/inet/ip/spdsock.c
+++ b/usr/src/uts/common/inet/ip/spdsock.c
@@ -23,8 +23,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/param.h>
#include <sys/types.h>
#include <sys/stream.h>
@@ -55,6 +53,7 @@
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/mi.h>
+#include <inet/proto_set.h>
#include <inet/nd.h>
#include <inet/ip_if.h>
#include <inet/tun.h>
@@ -3199,7 +3198,7 @@ spdsock_opt_set(queue_t *q, uint_t mgmt_flags, int level, int name,
if (*i1 > spds->spds_max_buf)
return (ENOBUFS);
RD(q)->q_hiwat = *i1;
- (void) mi_set_sth_hiwat(RD(q), *i1);
+ (void) proto_set_rx_hiwat(RD(q), NULL, *i1);
break;
}
break;
@@ -3407,7 +3406,7 @@ spdsock_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
oq->q_lowat = spds->spds_xmit_lowat;
qprocson(q);
- (void) mi_set_sth_hiwat(q, spds->spds_recv_hiwat);
+ (void) proto_set_rx_hiwat(q, NULL, spds->spds_recv_hiwat);
*devp = makedevice(getmajor(*devp), ss->spdsock_minor);
return (0);
diff --git a/usr/src/uts/common/inet/ip6.h b/usr/src/uts/common/inet/ip6.h
index 1dbe8c3dd1..d463c3f6ee 100644
--- a/usr/src/uts/common/inet/ip6.h
+++ b/usr/src/uts/common/inet/ip6.h
@@ -378,9 +378,9 @@ extern void mld_timeout_handler(void *);
extern void pr_addr_dbg(char *, int, const void *);
extern int ip_multirt_apply_membership_v6(int (*fn)(conn_t *, boolean_t,
- const in6_addr_t *, int, mcast_record_t, const in6_addr_t *,
- mblk_t *), ire_t *, conn_t *, boolean_t, const in6_addr_t *,
- mcast_record_t, const in6_addr_t *, mblk_t *);
+ const in6_addr_t *, int, mcast_record_t, const in6_addr_t *, mblk_t *),
+ ire_t *, conn_t *, boolean_t, const in6_addr_t *, mcast_record_t,
+ const in6_addr_t *, mblk_t *);
extern void ip_newroute_ipif_v6(queue_t *, mblk_t *, ipif_t *,
in6_addr_t, int, zoneid_t);
extern void ip_newroute_v6(queue_t *, mblk_t *, const in6_addr_t *,
@@ -391,6 +391,11 @@ extern size_t ip6_get_src_preferences(conn_t *, uint32_t *);
extern int ip6_set_src_preferences(conn_t *, uint32_t);
extern int ip6_set_pktinfo(cred_t *, conn_t *, struct in6_pktinfo *,
mblk_t *);
+extern int ip_proto_bind_laddr_v6(conn_t *, mblk_t **, uint8_t,
+ const in6_addr_t *, uint16_t, boolean_t);
+extern int ip_proto_bind_connected_v6(conn_t *, mblk_t **,
+ uint8_t, in6_addr_t *, uint16_t, const in6_addr_t *, ip6_pkt_t *,
+ uint16_t, boolean_t, boolean_t);
#endif /* _KERNEL */
diff --git a/usr/src/uts/common/inet/ip_if.h b/usr/src/uts/common/inet/ip_if.h
index c0a6c51696..c5982de059 100644
--- a/usr/src/uts/common/inet/ip_if.h
+++ b/usr/src/uts/common/inet/ip_if.h
@@ -234,8 +234,11 @@ extern ipif_t *ipif_getby_indexes(uint_t, uint_t, boolean_t, ip_stack_t *);
extern void ipif_init(ip_stack_t *);
extern ipif_t *ipif_lookup_addr(ipaddr_t, ill_t *, zoneid_t, queue_t *,
mblk_t *, ipsq_func_t, int *, ip_stack_t *);
+extern boolean_t ip_addr_exists(ipaddr_t, zoneid_t, ip_stack_t *);
extern ipif_t *ipif_lookup_addr_v6(const in6_addr_t *, ill_t *, zoneid_t,
queue_t *, mblk_t *, ipsq_func_t, int *, ip_stack_t *);
+extern boolean_t ip_addr_exists_v6(const in6_addr_t *, zoneid_t,
+ ip_stack_t *);
extern zoneid_t ipif_lookup_addr_zoneid(ipaddr_t, ill_t *, ip_stack_t *);
extern zoneid_t ipif_lookup_addr_zoneid_v6(const in6_addr_t *, ill_t *,
ip_stack_t *);
diff --git a/usr/src/uts/common/inet/ip_impl.h b/usr/src/uts/common/inet/ip_impl.h
index f7a9b8ff58..dae62ab499 100644
--- a/usr/src/uts/common/inet/ip_impl.h
+++ b/usr/src/uts/common/inet/ip_impl.h
@@ -44,6 +44,8 @@ extern "C" {
#define IP_MOD_ID 5701
+#define INET_NAME "ip"
+
#ifdef _BIG_ENDIAN
#define IP_HDR_CSUM_TTL_ADJUST 256
#define IP_TCP_CSUM_COMP IPPROTO_TCP
@@ -546,6 +548,22 @@ extern zoneid_t ip_get_zoneid_v4(ipaddr_t, mblk_t *, ip_stack_t *, zoneid_t);
extern zoneid_t ip_get_zoneid_v6(in6_addr_t *, mblk_t *, const ill_t *,
ip_stack_t *, zoneid_t);
+/*
+ * flag passed in by IP based protocols to get a private ip stream with
+ * no conn_t. Note this flag has the same value as SO_FALLBACK
+ */
+#define IP_HELPER_STR SO_FALLBACK
+
+#define IP_MOD_MINPSZ 1
+#define IP_MOD_MAXPSZ INFPSZ
+#define IP_MOD_HIWAT 65536
+#define IP_MOD_LOWAT 1024
+
+#define DEV_IP "/devices/pseudo/ip@0:ip"
+#define DEV_IP6 "/devices/pseudo/ip6@0:ip6"
+
+extern struct kmem_cache *ip_helper_stream_cache;
+
#endif /* _KERNEL */
#ifdef __cplusplus
diff --git a/usr/src/uts/common/inet/ip_rts.h b/usr/src/uts/common/inet/ip_rts.h
index a8d3971192..70b33e0278 100644
--- a/usr/src/uts/common/inet/ip_rts.h
+++ b/usr/src/uts/common/inet/ip_rts.h
@@ -19,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _INET_IP_RTS_H
#define _INET_IP_RTS_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -72,8 +70,9 @@ extern void rts_fill_msg_v6(int, int, const in6_addr_t *,
extern size_t rts_header_msg_size(int);
-extern void rts_queue_input(mblk_t *, queue_t *, sa_family_t,
- ip_stack_t *);
+extern void rts_queue_input(mblk_t *, conn_t *, sa_family_t, ip_stack_t *);
+
+extern int ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *, cred_t *);
#endif /* _KERNEL */
#ifdef __cplusplus
diff --git a/usr/src/uts/common/inet/ip_stack.h b/usr/src/uts/common/inet/ip_stack.h
index d0c3953374..3c53e1a3d3 100644
--- a/usr/src/uts/common/inet/ip_stack.h
+++ b/usr/src/uts/common/inet/ip_stack.h
@@ -425,6 +425,8 @@ struct ip_stack {
kmutex_t ips_ipobs_cb_lock;
uint_t ips_ipobs_cb_nwalkers;
kcondvar_t ips_ipobs_cb_cv;
+
+ struct __ldi_ident *ips_ldi_ident;
};
typedef struct ip_stack ip_stack_t;
diff --git a/usr/src/uts/common/inet/ipclassifier.h b/usr/src/uts/common/inet/ipclassifier.h
index 4665549c69..39cdddb7c4 100644
--- a/usr/src/uts/common/inet/ipclassifier.h
+++ b/usr/src/uts/common/inet/ipclassifier.h
@@ -37,6 +37,9 @@ extern "C" {
#include <inet/ip6.h>
#include <netinet/in.h> /* for IPPROTO_* constants */
#include <sys/sdt.h>
+#include <sys/socket_proto.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
typedef void (*edesc_spf)(void *, mblk_t *, void *, int);
typedef void (*edesc_rpf)(void *, mblk_t *, void *);
@@ -80,6 +83,8 @@ typedef void (*edesc_rpf)(void *, mblk_t *, void *);
#define IPCL_RTSCONN 0x00000020 /* From rts_conn_cache */
#define IPCL_ISV6 0x00000040 /* AF_INET6 */
#define IPCL_IPTUN 0x00000080 /* Has "tun" plumbed above it */
+#define IPCL_NONSTR 0x00001000 /* A non-STREAMS socket */
+#define IPCL_IN_SQUEUE 0x10000000 /* Waiting squeue to finish */
/* Conn Masks */
#define IPCL_TCP (IPCL_TCP4|IPCL_TCP6)
@@ -136,6 +141,8 @@ typedef void (*edesc_rpf)(void *, mblk_t *, void *);
(connp)->conn_ulp == IPPROTO_IPV6) && \
((connp)->conn_flags & IPCL_IPTUN))
+#define IPCL_IS_NONSTR(connp) ((connp)->conn_flags & IPCL_NONSTR)
+
typedef struct connf_s connf_t;
typedef struct
@@ -145,6 +152,21 @@ typedef struct
pc_t ctb_stack[CONN_STACK_DEPTH];
} conn_trace_t;
+typedef struct ip_helper_minor_info_s {
+ dev_t ip_minfo_dev; /* Device */
+ vmem_t *ip_minfo_arena; /* Arena */
+} ip_helper_minfo_t;
+
+/*
+ * ip helper stream info
+ */
+typedef struct ip_helper_stream_info_s {
+ ldi_handle_t ip_helper_stream_handle;
+ queue_t *ip_helper_stream_rq;
+ queue_t *ip_helper_stream_wq;
+ ip_helper_minfo_t *ip_helper_stream_minfo;
+} ip_helper_stream_info_t;
+
/*
* The initial fields in the conn_t are setup by the kmem_cache constructor,
* and are preserved when it is freed. Fields after that are bzero'ed when
@@ -236,6 +258,7 @@ struct conn_s {
queue_t *conn_wq; /* Write queue */
dev_t conn_dev; /* Minor number */
vmem_t *conn_minor_arena; /* Minor arena */
+ ip_helper_stream_info_t *conn_helper_info;
cred_t *conn_cred; /* Credentials */
connf_t *conn_g_fanout; /* Global Hash bucket head */
@@ -300,6 +323,11 @@ struct conn_s {
#define conn_nexthop_v4 V4_PART_OF_V6(conn_nexthop_v6)
cred_t *conn_peercred; /* Peer credentials, if any */
+ kcondvar_t conn_sq_cv; /* For non-STREAMS socket IO */
+ kthread_t *conn_sq_caller; /* Caller of squeue sync ops */
+ sock_upcalls_t *conn_upcalls; /* Upcalls to sockfs */
+ sock_upper_handle_t conn_upper_handle; /* Upper handle: sonode * */
+
unsigned int
conn_ulp_labeled : 1, /* ULP label is synced */
conn_mlp_type : 2, /* mlp_type_t; tsol/tndb.h */
@@ -308,6 +336,8 @@ struct conn_s {
conn_anon_port : 1, /* user bound anonymously */
conn_mac_exempt : 1, /* unlabeled with loose MAC */
conn_spare : 26;
+
+ boolean_t conn_flow_cntrld;
netstack_t *conn_netstack; /* Corresponds to a netstack_hold */
#ifdef CONN_DEBUG
#define CONN_TRACE_MAX 10
@@ -582,6 +612,14 @@ conn_t *ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *, ipha_t *, tcph_t *,
ip_stack_t *);
conn_t *ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *, ip6_t *, tcph_t *,
ip_stack_t *);
+
+extern int ip_create_helper_stream(conn_t *connp, ldi_ident_t li);
+extern void ip_close_helper_stream(conn_t *connp);
+
+extern int ip_get_options(conn_t *, int, int, void *, t_uscalar_t *, cred_t *);
+extern int ip_set_options(conn_t *, int, int, const void *, t_uscalar_t,
+ cred_t *);
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/common/inet/mi.c b/usr/src/uts/common/inet/mi.c
index a8848a3499..f88fe3709b 100644
--- a/usr/src/uts/common/inet/mi.c
+++ b/usr/src/uts/common/inet/mi.c
@@ -24,8 +24,6 @@
*/
/* Copyright (c) 1990 Mentat Inc. */
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <inet/common.h> /* for various inet/mi.h and inet/nd.h needs */
#include <sys/stream.h>
@@ -46,6 +44,9 @@
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/kobj.h>
+#include <sys/stropts.h>
+#include <sys/strsubr.h>
+#include <inet/proto_set.h>
#define ISDIGIT(ch) ((ch) >= '0' && (ch) <= '9')
#define ISUPPER(ch) ((ch) >= 'A' && (ch) <= 'Z')
@@ -64,7 +65,7 @@
* allocation strategy is changed.
*/
-typedef struct stroptions *STROPTP;
+typedef struct stroptions *STROPTP;
typedef union T_primitives *TPRIMP;
/* Timer block states. */
@@ -903,93 +904,6 @@ mi_offset_paramc(mblk_t *mp, size_t offset, size_t len)
return (NULL);
}
-
-boolean_t
-mi_set_sth_hiwat(queue_t *q, size_t size)
-{
- MBLKP mp;
- STROPTP stropt;
-
- if (!(mp = allocb(sizeof (*stropt), BPRI_LO)))
- return (B_FALSE);
- mp->b_datap->db_type = M_SETOPTS;
- mp->b_wptr += sizeof (*stropt);
- stropt = (STROPTP)mp->b_rptr;
- stropt->so_flags = SO_HIWAT;
- stropt->so_hiwat = size;
- putnext(q, mp);
- return (B_TRUE);
-}
-
-boolean_t
-mi_set_sth_lowat(queue_t *q, size_t size)
-{
- MBLKP mp;
- STROPTP stropt;
-
- if (!(mp = allocb(sizeof (*stropt), BPRI_LO)))
- return (B_FALSE);
- mp->b_datap->db_type = M_SETOPTS;
- mp->b_wptr += sizeof (*stropt);
- stropt = (STROPTP)mp->b_rptr;
- stropt->so_flags = SO_LOWAT;
- stropt->so_lowat = size;
- putnext(q, mp);
- return (B_TRUE);
-}
-
-/* ARGSUSED */
-boolean_t
-mi_set_sth_maxblk(queue_t *q, ssize_t size)
-{
- MBLKP mp;
- STROPTP stropt;
-
- if (!(mp = allocb(sizeof (*stropt), BPRI_LO)))
- return (B_FALSE);
- mp->b_datap->db_type = M_SETOPTS;
- mp->b_wptr += sizeof (*stropt);
- stropt = (STROPTP)mp->b_rptr;
- stropt->so_flags = SO_MAXBLK;
- stropt->so_maxblk = size;
- putnext(q, mp);
- return (B_TRUE);
-}
-
-boolean_t
-mi_set_sth_copyopt(queue_t *q, int copyopt)
-{
- MBLKP mp;
- STROPTP stropt;
-
- if (!(mp = allocb(sizeof (*stropt), BPRI_LO)))
- return (B_FALSE);
- mp->b_datap->db_type = M_SETOPTS;
- mp->b_wptr += sizeof (*stropt);
- stropt = (STROPTP)mp->b_rptr;
- stropt->so_flags = SO_COPYOPT;
- stropt->so_copyopt = (ushort_t)copyopt;
- putnext(q, mp);
- return (B_TRUE);
-}
-
-boolean_t
-mi_set_sth_wroff(queue_t *q, size_t size)
-{
- MBLKP mp;
- STROPTP stropt;
-
- if (!(mp = allocb(sizeof (*stropt), BPRI_LO)))
- return (B_FALSE);
- mp->b_datap->db_type = M_SETOPTS;
- mp->b_wptr += sizeof (*stropt);
- stropt = (STROPTP)mp->b_rptr;
- stropt->so_flags = SO_WROFF;
- stropt->so_wroff = (ushort_t)size;
- putnext(q, mp);
- return (B_TRUE);
-}
-
int
mi_sprintf(char *buf, char *fmt, ...)
{
diff --git a/usr/src/uts/common/inet/mi.h b/usr/src/uts/common/inet/mi.h
index 6cae6a1acf..53608ca316 100644
--- a/usr/src/uts/common/inet/mi.h
+++ b/usr/src/uts/common/inet/mi.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -28,8 +27,6 @@
#ifndef _INET_MI_H
#define _INET_MI_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -39,6 +36,7 @@ extern "C" {
#include <sys/types.h>
#include <sys/vmem.h>
#include <sys/varargs.h>
+#include <netinet/in.h>
#define MI_MIN_DEV INET_MIN_DEV /* minimum minor device number */
#define MI_COPY_IN 1
@@ -137,13 +135,6 @@ extern int mi_open_link(void **mi_head, IDP ptr, dev_t *devp, int flag,
extern uint8_t *mi_offset_param(mblk_t *mp, size_t offset, size_t len);
extern uint8_t *mi_offset_paramc(mblk_t *mp, size_t offset, size_t len);
-
-extern boolean_t mi_set_sth_hiwat(queue_t *q, size_t size);
-extern boolean_t mi_set_sth_lowat(queue_t *q, size_t size);
-extern boolean_t mi_set_sth_maxblk(queue_t *q, ssize_t size);
-extern boolean_t mi_set_sth_copyopt(queue_t *q, int copyopt);
-extern boolean_t mi_set_sth_wroff(queue_t *q, size_t size);
-
/*PRINTFLIKE2*/
extern int mi_sprintf(char *buf, char *fmt, ...)
__KPRINTFLIKE(2);
diff --git a/usr/src/uts/common/inet/optcom.c b/usr/src/uts/common/inet/optcom.c
index 3de4044e58..f241599426 100644
--- a/usr/src/uts/common/inet/optcom.c
+++ b/usr/src/uts/common/inet/optcom.c
@@ -19,13 +19,11 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* This file contains common code for handling Options Management requests.
*/
@@ -38,6 +36,7 @@
#define _SUN_TPI_VERSION 2
#include <sys/tihdr.h>
#include <sys/socket.h>
+#include <sys/socketvar.h>
#include <sys/ddi.h>
#include <sys/debug.h> /* for ASSERT */
#include <sys/policy.h>
@@ -52,6 +51,8 @@
#include "optcom.h"
#include <inet/optcom.h>
+#include <inet/ipclassifier.h>
+#include <inet/proto_set.h>
/*
* Function prototypes
@@ -69,7 +70,6 @@ static void do_opt_current(queue_t *, struct T_opthdr *, uchar_t **,
static int do_opt_check_or_negotiate(queue_t *q, struct T_opthdr *reqopt,
uint_t optset_context, uchar_t **resptrp, t_uscalar_t *worst_statusp,
cred_t *, optdb_obj_t *dbobjp, mblk_t *first_mp);
-static opdes_t *opt_chk_lookup(t_uscalar_t, t_uscalar_t, opdes_t *, uint_t);
static boolean_t opt_level_valid(t_uscalar_t, optlevel_t *, uint_t);
static size_t opt_level_allopts_lengths(t_uscalar_t, opdes_t *, uint_t);
static boolean_t opt_length_ok(opdes_t *, struct T_opthdr *);
@@ -186,6 +186,9 @@ optcom_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
* the sq framework arranges to restart this operation and passes control to
* the restart function ip_restart_optmgmt() which in turn calls
* svr4_optcom_req() or tpi_optcom_req() to restart the option processing.
+ *
+ * XXX Remove the asynchronous behavior of svr4_optcom_req() and
+ * tpi_optcom_req().
*/
int
svr4_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp,
@@ -214,6 +217,7 @@ svr4_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp,
boolean_t pass_to_next = B_FALSE;
struct T_optmgmt_ack *toa;
struct T_optmgmt_req *tor;
+ int error;
/*
* Allocate M_CTL and prepend to the packet for restarting this
@@ -409,85 +413,17 @@ no_mem:;
if (opt->name == T_ALLOPT)
goto bad_opt;
- /* Find the option in the opt_arr. */
- if ((optd = opt_chk_lookup(opt->level, opt->name,
- opt_arr, opt_arr_cnt)) == NULL) {
- /*
- * Not found, that is a bad thing if
- * the caller is a tpi provider
- */
- if (topmost_tpiprovider)
- goto bad_opt;
- else
- continue; /* skip unmodified */
- }
-
- /* Additional checks dependent on operation. */
- switch (tor->MGMT_flags) {
- case T_NEGOTIATE:
- if (!OA_WRITE_OR_EXECUTE(optd, cr)) {
- /* can't negotiate option */
- if (!(OA_MATCHED_PRIV(optd, cr)) &&
- OA_WX_ANYPRIV(optd)) {
- /*
- * not privileged but privilege
- * will help negotiate option.
- */
- optcom_err_ack(q, mp, TACCES, 0);
- return (0);
- } else
- goto bad_opt;
- }
- /*
- * Verify size for options
- * Note: For retaining compatibility with historical
- * behavior, variable lengths options will have their
- * length verified in the setfn() processing.
- * In order to be compatible with SunOS 4.X we return
- * EINVAL errors for bad lengths.
- */
- if (!(optd->opdes_props & OP_VARLEN)) {
- /* fixed length - size must match */
- if (opt->len != optd->opdes_size) {
- optcom_err_ack(q, mp, TSYSERR, EINVAL);
- return (0);
- }
- }
- break;
-
- case T_CHECK:
- if (!OA_RWX_ANYPRIV(optd))
- /* any of "rwx" permission but not not none */
- goto bad_opt;
- /*
- * XXX Since T_CURRENT was not there in TLI and the
- * official TLI inspired TPI standard, getsockopt()
- * API uses T_CHECK (for T_CURRENT semantics)
- * The following fallthru makes sense because of its
- * historical use as semantic equivalent to T_CURRENT.
- */
- /* FALLTHRU */
- case T_CURRENT:
- if (!OA_READ_PERMISSION(optd, cr)) {
- /* can't read option value */
- if (!(OA_MATCHED_PRIV(optd, cr)) &&
- OA_R_ANYPRIV(optd)) {
- /*
- * not privileged but privilege
- * will help in reading option value.
- */
- optcom_err_ack(q, mp, TACCES, 0);
- return (0);
- } else
- goto bad_opt;
- }
- break;
-
- default:
- optcom_err_ack(q, mp, TBADFLAG, 0);
+ error = proto_opt_check(opt->level, opt->name, opt->len, NULL,
+ opt_arr, opt_arr_cnt, topmost_tpiprovider,
+ tor->MGMT_flags == T_NEGOTIATE, tor->MGMT_flags == T_CHECK,
+ cr);
+ if (error < 0) {
+ optcom_err_ack(q, mp, -error, 0);
+ return (0);
+ } else if (error > 0) {
+ optcom_err_ack(q, mp, TSYSERR, error);
return (0);
}
- /* We liked it. Keep going. */
} /* end for loop scanning option buffer */
/* Now complete the operation as required. */
@@ -609,7 +545,7 @@ restart:
* non-fatal by svr4_optcom_req() and are
* returned by setfn() when it is passed an
* option it does not handle. Since the option
- * passed opt_chk_lookup(), it is implied that
+ * passed proto_opt_lookup(), it is implied that
* it is valid but was either handled upstream
* or will be handled downstream.
*/
@@ -892,7 +828,7 @@ process_topthdrs_first_pass(mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp,
/* Find the option in the opt_arr. */
if (opt->name != T_ALLOPT) {
- optd = opt_chk_lookup(opt->level, opt->name,
+ optd = proto_opt_lookup(opt->level, opt->name,
opt_arr, opt_arr_cnt);
if (optd == NULL) {
/*
@@ -972,7 +908,7 @@ process_topthdrs_first_pass(mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp,
case T_CURRENT:
/*
- * The opt_chk_lookup() routine call above approved of
+ * The proto_opt_lookup() routine call above approved of
* this option so we can work on the status for it
* based on the permissions for the operation. (This
* can override any status for it set at higher levels)
@@ -1044,7 +980,7 @@ process_topthdrs_first_pass(mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp,
}
}
/*
- * The opt_chk_lookup() routine above() approved of
+ * The proto_opt_lookup() routine above approved of
* this option so we can work on the status for it based
* on the permissions for the operation. (This can
* override anything set at a higher level).
@@ -1309,7 +1245,7 @@ do_opt_default(queue_t *q, struct T_opthdr *reqopt, uchar_t **resptrp,
/*
* lookup the option in the table and fill default value
*/
- optd = opt_chk_lookup(reqopt->level, reqopt->name,
+ optd = proto_opt_lookup(reqopt->level, reqopt->name,
opt_arr, opt_arr_cnt);
if (optd == NULL) {
@@ -1609,8 +1545,7 @@ do_opt_current(queue_t *q, struct T_opthdr *reqopt, uchar_t **resptrp,
}
}
-
-
+/* ARGSUSED */
static int
do_opt_check_or_negotiate(queue_t *q, struct T_opthdr *reqopt,
uint_t optset_context, uchar_t **resptrp, t_uscalar_t *worst_statusp,
@@ -1819,7 +1754,6 @@ do_opt_check_or_negotiate(queue_t *q, struct T_opthdr *reqopt,
* Then delete "ignored" options from option buffer and return success.
*
*/
-
int
tpi_optcom_buf(queue_t *q, mblk_t *mp, t_scalar_t *opt_lenp,
t_scalar_t opt_offset, cred_t *cr, optdb_obj_t *dbobjp,
@@ -1890,7 +1824,7 @@ tpi_optcom_buf(queue_t *q, mblk_t *mp, t_scalar_t *opt_lenp,
}
/* Find the option in the opt_arr. */
- optd = opt_chk_lookup(opt->level, opt->name,
+ optd = proto_opt_lookup(opt->level, opt->name,
opt_arr, opt_arr_cnt);
if (optd == NULL) {
@@ -2043,21 +1977,6 @@ error_ret:
return (error);
}
-static opdes_t *
-opt_chk_lookup(t_uscalar_t level, t_uscalar_t name, opdes_t *opt_arr,
- uint_t opt_arr_cnt)
-{
- opdes_t *optd;
-
- for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt];
- optd++) {
- if (level == (uint_t)optd->opdes_level &&
- name == (uint_t)optd->opdes_name)
- return (optd);
- }
- return (NULL);
-}
-
static boolean_t
opt_level_valid(t_uscalar_t level, optlevel_t *valid_level_arr,
uint_t valid_level_arr_cnt)
@@ -2287,3 +2206,68 @@ optcom_pkt_set(uchar_t *invalp, uint_t inlen, boolean_t sticky,
*optlenp = inlen + reservelen;
return (0);
}
+
+/*
+ * Apply every ancillary-data (cmsg) option in `control' to `connp' through
+ * opt_set_fn(), after validating it against the option table in `dbobjp'.
+ * Returns 0 on success, EOPNOTSUPP if `control' holds no valid cmsghdr,
+ * EINVAL or EACCES if an option is rejected here, or the positive errno
+ * returned by opt_set_fn().
+ */
+int
+process_auxiliary_options(conn_t *connp, void *control, t_uscalar_t controllen,
+    void *optbuf, optdb_obj_t *dbobjp, int (*opt_set_fn)(conn_t *, uint_t, int,
+    int, uint_t, uchar_t *, uint_t *, uchar_t *, void *, cred_t *))
+{
+	struct cmsghdr *cmsg;
+	opdes_t *optd;
+	t_uscalar_t outlen;
+	int error = EOPNOTSUPP;
+	t_uscalar_t len;
+	uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt;
+	opdes_t *opt_arr = dbobjp->odb_opt_des_arr;
+
+	for (cmsg = (struct cmsghdr *)control;
+	    CMSG_VALID(cmsg, control, (uintptr_t)control + controllen);
+	    cmsg = CMSG_NEXT(cmsg)) {
+		len = (t_uscalar_t)CMSG_CONTENTLEN(cmsg);
+		/* Find the option in the opt_arr. */
+		optd = proto_opt_lookup(cmsg->cmsg_level, cmsg->cmsg_type,
+		    opt_arr, opt_arr_cnt);
+		if (optd == NULL)
+			return (EINVAL);
+		if (OA_READONLY_PERMISSION(optd, connp->conn_cred))
+			return (EACCES);
+		if (OA_MATCHED_PRIV(optd, connp->conn_cred)) {
+			/* Privileged case: access checks are still done. */
+			if (!OA_WX_ANYPRIV(optd))
+				return (EACCES);
+		} else {
+			/*
+			 * Non-privileged case: fail, rather than follow the
+			 * XTI "ignore" semantics for permission problems.
+			 */
+			if (!OA_WX_NOPRIV(optd))
+				return (EACCES);
+		}
+		/*
+		 * Callee may not store *outlenp on every path (e.g. a negative
+		 * return); make sure the test below never reads garbage.
+		 */
+		outlen = 0;
+		error = opt_set_fn(connp, SETFN_UD_NEGOTIATE, optd->opdes_level,
+		    optd->opdes_name, len, (uchar_t *)CMSG_CONTENT(cmsg),
+		    &outlen, (uchar_t *)CMSG_CONTENT(cmsg), (void *)optbuf,
+		    connp->conn_cred);
+		if (error > 0)
+			return (error);
+		if (outlen > len)
+			return (EINVAL);
+		/*
+		 * A negative value means the protocol wants the option passed
+		 * on to IP.  We do not pass auxiliary options to IP.
+		 */
+		error = 0;
+	}
+	return (error);
+}
diff --git a/usr/src/uts/common/inet/optcom.h b/usr/src/uts/common/inet/optcom.h
index 1d2d1cb09d..07cb7cf946 100644
--- a/usr/src/uts/common/inet/optcom.h
+++ b/usr/src/uts/common/inet/optcom.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -27,14 +27,13 @@
#ifndef _INET_OPTCOM_H
#define _INET_OPTCOM_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
#if defined(_KERNEL) && defined(__STDC__)
+#include <inet/ipclassifier.h>
/* Options Description Structure */
typedef struct opdes_s {
t_uscalar_t opdes_name; /* option name */
@@ -139,6 +138,10 @@ typedef struct opdes_s {
#define OA_NO_PERMISSION(x, c) (OA_MATCHED_PRIV((x), (c)) ? \
((x)->opdes_access_priv == 0) : ((x)->opdes_access_nopriv == 0))
+#define PASS_OPT_TO_IP(connp) \
+ if (IPCL_IS_NONSTR(connp)) \
+ return (-EINVAL)
+
/*
* Other properties set in opdes_props field.
*/
@@ -217,6 +220,10 @@ extern t_uscalar_t optcom_max_optsize(opdes_t *, uint_t);
extern int optcom_pkt_set(uchar_t *, uint_t, boolean_t, uchar_t **, uint_t *,
uint_t);
+extern int process_auxiliary_options(conn_t *, void *, t_uscalar_t,
+ void *, optdb_obj_t *, int (*)(conn_t *, uint_t, int, int, uint_t,
+ uchar_t *, uint_t *, uchar_t *, void *, cred_t *));
+
#endif /* defined(_KERNEL) && defined(__STDC__) */
#ifdef __cplusplus
diff --git a/usr/src/uts/common/inet/proto_set.c b/usr/src/uts/common/inet/proto_set.c
new file mode 100644
index 0000000000..45f07d2ed3
--- /dev/null
+++ b/usr/src/uts/common/inet/proto_set.c
@@ -0,0 +1,440 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <inet/common.h>
+#include <sys/stream.h>
+#include <sys/stropts.h>
+#include <sys/strsun.h>
+#include <sys/sysmacros.h>
+#include <sys/stropts.h>
+#include <sys/strsubr.h>
+#include <sys/tpicommon.h>
+#include <sys/socket_proto.h>
+#include <sys/policy.h>
+#include <inet/optcom.h>
+#include <inet/ipclassifier.h>
+
+/* Set receive high-water mark via sockfs upcall, else M_SETOPTS on q. */
+boolean_t
+proto_set_rx_hiwat(queue_t *q, conn_t *connp, size_t size)
+{
+	struct stroptions *sop;
+	MBLKP optmp;
+
+	if (connp != NULL && IPCL_IS_NONSTR(connp)) {
+		struct sock_proto_props props;
+
+		props.sopp_flags = SOCKOPT_RCVHIWAT;
+		props.sopp_rxhiwat = size;
+		(*connp->conn_upcalls->su_set_proto_props)
+		    (connp->conn_upper_handle, &props);
+		return (B_TRUE);
+	}
+	if ((optmp = allocb(sizeof (*sop), BPRI_LO)) == NULL)
+		return (B_FALSE);
+	optmp->b_datap->db_type = M_SETOPTS;
+	optmp->b_wptr += sizeof (*sop);
+	sop = (struct stroptions *)optmp->b_rptr;
+	sop->so_flags = SO_HIWAT;
+	sop->so_hiwat = size;
+	putnext(q, optmp);
+	return (B_TRUE);
+}
+
+/* Set receive low-water mark via sockfs upcall, else M_SETOPTS on q. */
+boolean_t
+proto_set_rx_lowat(queue_t *q, conn_t *connp, size_t size)
+{
+	struct stroptions *sop;
+	MBLKP optmp;
+
+	if (connp != NULL && IPCL_IS_NONSTR(connp)) {
+		struct sock_proto_props props;
+
+		props.sopp_flags = SOCKOPT_RCVLOWAT;
+		props.sopp_rxlowat = size;
+		(*connp->conn_upcalls->su_set_proto_props)
+		    (connp->conn_upper_handle, &props);
+		return (B_TRUE);
+	}
+	if ((optmp = allocb(sizeof (*sop), BPRI_LO)) == NULL)
+		return (B_FALSE);
+	optmp->b_datap->db_type = M_SETOPTS;
+	optmp->b_wptr += sizeof (*sop);
+	sop = (struct stroptions *)optmp->b_rptr;
+	sop->so_flags = SO_LOWAT;
+	sop->so_lowat = size;
+	putnext(q, optmp);
+	return (B_TRUE);
+}
+
+/*
+ * Set maximum packet size.  This is the maximum amount of data the protocol
+ * wants to be given at any time.  Larger data needs to be broken into
+ * multiples of the maximum packet size and given to the protocol one at a time.
+ */
+boolean_t
+proto_set_maxpsz(queue_t *q, conn_t *connp, size_t size)
+{
+	struct stdata *stp;
+	queue_t *wq;
+
+	if (connp != NULL && IPCL_IS_NONSTR(connp)) {
+		struct sock_proto_props props;
+
+		props.sopp_flags = SOCKOPT_MAXPSZ;
+		props.sopp_maxpsz = size;
+		(*connp->conn_upcalls->su_set_proto_props)
+		    (connp->conn_upper_handle, &props);
+		return (B_TRUE);
+	}
+
+	/*
+	 * At this point a queue parameter may not be changed while a
+	 * multiplexor is sitting on top.
+	 */
+	stp = STREAM(q);
+	if (stp == NULL || (stp->sd_flag & STPLEX))
+		return (B_FALSE);
+
+	claimstr(stp->sd_wrq);
+	wq = stp->sd_wrq->q_next;
+	ASSERT(wq != NULL);
+	(void) strqset(wq, QMAXPSZ, 0, size);
+	releasestr(stp->sd_wrq);
+	return (B_TRUE);
+}
+
+/* Set the transmit maximum block size via sockfs upcall, else M_SETOPTS. */
+/* ARGSUSED */
+boolean_t
+proto_set_tx_maxblk(queue_t *q, conn_t *connp, ssize_t size)
+{
+	struct stroptions *sop;
+	MBLKP optmp;
+
+	if (connp != NULL && IPCL_IS_NONSTR(connp)) {
+		struct sock_proto_props props;
+		props.sopp_flags = SOCKOPT_MAXBLK;
+		props.sopp_maxblk = size;
+		(*connp->conn_upcalls->su_set_proto_props)
+		    (connp->conn_upper_handle, &props);
+		return (B_TRUE);
+	}
+	if ((optmp = allocb(sizeof (*sop), BPRI_LO)) == NULL)
+		return (B_FALSE);
+	optmp->b_datap->db_type = M_SETOPTS;
+	optmp->b_wptr += sizeof (*sop);
+	sop = (struct stroptions *)optmp->b_rptr;
+	sop->so_flags = SO_MAXBLK;
+	sop->so_maxblk = size;
+	putnext(q, optmp);
+	return (B_TRUE);
+}
+
+/* Set the transmit copy option via sockfs upcall, else M_SETOPTS on q. */
+boolean_t
+proto_set_tx_copyopt(queue_t *q, conn_t *connp, int copyopt)
+{
+	struct stroptions *sop;
+	MBLKP optmp;
+
+	if (connp != NULL && IPCL_IS_NONSTR(connp)) {
+		struct sock_proto_props props;
+		props.sopp_flags = SOCKOPT_ZCOPY;
+		props.sopp_zcopyflag = (ushort_t)copyopt;
+		(*connp->conn_upcalls->su_set_proto_props)
+		    (connp->conn_upper_handle, &props);
+		return (B_TRUE);
+	}
+	if ((optmp = allocb(sizeof (*sop), BPRI_LO)) == NULL)
+		return (B_FALSE);
+	optmp->b_datap->db_type = M_SETOPTS;
+	optmp->b_wptr += sizeof (*sop);
+	sop = (struct stroptions *)optmp->b_rptr;
+	sop->so_flags = SO_COPYOPT;
+	sop->so_copyopt = (ushort_t)copyopt;
+	putnext(q, optmp);
+	return (B_TRUE);
+}
+
+/* Set the transmit write offset via sockfs upcall, else M_SETOPTS on q. */
+boolean_t
+proto_set_tx_wroff(queue_t *q, conn_t *connp, size_t size)
+{
+	struct stroptions *sop;
+	MBLKP optmp;
+
+	if (connp != NULL && IPCL_IS_NONSTR(connp)) {
+		struct sock_proto_props props;
+
+		props.sopp_flags = SOCKOPT_WROFF;
+		props.sopp_wroff = size;
+		/* XXX workaround for CR6757374 */
+		if (connp->conn_upper_handle != NULL)
+			(*connp->conn_upcalls->su_set_proto_props)
+			    (connp->conn_upper_handle, &props);
+		return (B_TRUE);
+	}
+	if ((optmp = allocb(sizeof (*sop), BPRI_LO)) == NULL)
+		return (B_FALSE);
+	optmp->b_datap->db_type = M_SETOPTS;
+	optmp->b_wptr += sizeof (*sop);
+	sop = (struct stroptions *)optmp->b_rptr;
+	sop->so_flags = SO_WROFF;
+	sop->so_wroff = (ushort_t)size;
+	putnext(q, optmp);
+	return (B_TRUE);
+}
+
+/*
+ * Turn OOBINLINE processing on the socket on or off through the
+ * set_proto_props upcall.  The conn must be a non-STREAMS socket.
+ */
+void
+proto_set_rx_oob_opt(conn_t *connp, boolean_t onoff)
+{
+	struct sock_proto_props props;
+
+	ASSERT(IPCL_IS_NONSTR(connp));
+	props.sopp_oobinline = onoff;
+	props.sopp_flags = SOCKOPT_OOBINLINE;
+	(*connp->conn_upcalls->su_set_proto_props)
+	    (connp->conn_upper_handle, &props);
+}
+
+/*
+ * Map a TLI(/XTI) error code to the closest system errno, as best we can.
+ */
+static const int tli_errs[] = {
+	0,		/* no error */
+	EADDRNOTAVAIL,	/* TBADADDR */
+	ENOPROTOOPT,	/* TBADOPT */
+	EACCES,		/* TACCES */
+	EBADF,		/* TBADF */
+	EADDRNOTAVAIL,	/* TNOADDR */
+	EPROTO,		/* TOUTSTATE */
+	ECONNABORTED,	/* TBADSEQ */
+	0,		/* TSYSERR - will never get */
+	EPROTO,		/* TLOOK - should never be sent by transport */
+	EMSGSIZE,	/* TBADDATA */
+	EMSGSIZE,	/* TBUFOVFLW */
+	EPROTO,		/* TFLOW */
+	EWOULDBLOCK,	/* TNODATA */
+	EPROTO,		/* TNODIS */
+	EPROTO,		/* TNOUDERR */
+	EINVAL,		/* TBADFLAG */
+	EPROTO,		/* TNOREL */
+	EOPNOTSUPP,	/* TNOTSUPPORT */
+	EPROTO,		/* TSTATECHNG */
+	/* following represent error namespace expansion with XTI */
+	EPROTO,		/* TNOSTRUCTYPE - never sent by transport */
+	EPROTO,		/* TBADNAME - never sent by transport */
+	EPROTO,		/* TBADQLEN - never sent by transport */
+	EADDRINUSE,	/* TADDRBUSY */
+	EBADF,		/* TINDOUT */
+	EBADF,		/* TPROVMISMATCH */
+	EBADF,		/* TRESQLEN */
+	EBADF,		/* TRESADDR */
+	EPROTO,		/* TQFULL - never sent by transport */
+	EPROTO,		/* TPROTO */
+};
+
+int
+proto_tlitosyserr(int terr)
+{
+	ASSERT(terr != TSYSERR);
+	/* Anything that does not index into the table becomes EPROTO. */
+	if (terr < (sizeof (tli_errs) / sizeof (tli_errs[0])))
+		return (tli_errs[terr]);
+	return (EPROTO);
+}
+
+/*
+ * Verify that an address is suitable for connect/sendmsg on a socket of the
+ * given family and that it is aligned properly.  Being generic, this does
+ * not test for a zero port, since some protocols like icmp do not require a
+ * port.
+ *
+ * Returns 0 if the address is acceptable, EAFNOSUPPORT if its sa_family does
+ * not match `family', and EINVAL otherwise.
+ */
+int
+proto_verify_ip_addr(int family, const struct sockaddr *name, socklen_t namelen)
+{
+	/* A missing or misaligned address can never be valid. */
+	if (name == NULL || !OK_32PTR((char *)name))
+		return (EINVAL);
+
+	if (family == AF_INET) {
+		/* The family is checked first, then the exact length. */
+		if (name->sa_family != AF_INET)
+			return (EAFNOSUPPORT);
+		if (namelen != (socklen_t)sizeof (struct sockaddr_in))
+			return (EINVAL);
+	} else if (family == AF_INET6) {
+#ifdef DEBUG
+		struct sockaddr_in6 *sin6;
+#endif /* DEBUG */
+
+		/* Same order of checks as for AF_INET. */
+		if (name->sa_family != AF_INET6)
+			return (EAFNOSUPPORT);
+		if (namelen != (socklen_t)sizeof (struct sockaddr_in6))
+			return (EINVAL);
+#ifdef DEBUG
+		/*
+		 * Verify that apps don't forget to clear sin6_scope_id etc:
+		 * warn about a non-zero scope id on a non-link-scope address.
+		 */
+		sin6 = (struct sockaddr_in6 *)name;
+		if (sin6->sin6_scope_id != 0 &&
+		    !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
+			zcmn_err(getzoneid(), CE_WARN,
+			    "connect/send* with uninitialized sin6_scope_id "
+			    "(%d) on socket. Pid = %d\n",
+			    (int)sin6->sin6_scope_id, (int)curproc->p_pid);
+		}
+#endif /* DEBUG */
+	} else {
+		/* Only AF_INET and AF_INET6 are handled here. */
+		return (EINVAL);
+	}
+
+	return (0);
+}
+
+/*
+ * Find the descriptor for option (level, name) among the opt_arr_cnt
+ * entries of opt_arr.  Return NULL if there isn't a match.
+ */
+opdes_t *
+proto_opt_lookup(t_uscalar_t level, t_uscalar_t name, opdes_t *opt_arr,
+    uint_t opt_arr_cnt)
+{
+	uint_t i;
+
+	for (i = 0; i < opt_arr_cnt; i++) {
+		opdes_t *cur = &opt_arr[i];
+		if ((uint_t)cur->opdes_level == level &&
+		    (uint_t)cur->opdes_name == name)
+			return (cur);
+	}
+	return (NULL);
+}
+
+/*
+ * Look up an option in opt_arr and do permission and length checking.
+ * `negotiate' and `check' select the operation and must not both be set.
+ * If max_len is not NULL it is set to the option's opdes_size when the
+ * option is found and passes the checks.
+ *
+ * Returns 0 if there is no error (for callers that are not the topmost TPI
+ * provider, not finding the option is not an error), a negated TPI error
+ * (-TBADOPT or -TACCES), or EINVAL when a fixed-length option has a bad length.
+ */
+int
+proto_opt_check(int level, int name, int len, t_uscalar_t *max_len,
+ opdes_t *opt_arr, uint_t opt_arr_cnt, boolean_t topmost_tpiprovider,
+ boolean_t negotiate, boolean_t check, cred_t *cr)
+{
+ opdes_t *optd;
+
+ /* Find the option in the opt_arr. */
+ if ((optd = proto_opt_lookup(level, name, opt_arr, opt_arr_cnt)) ==
+ NULL) {
+ /*
+ * Not found, that is a bad thing if
+ * the caller is a tpi provider
+ */
+ if (topmost_tpiprovider)
+ return (-TBADOPT);
+ else
+ return (0); /* skip unmodified */
+ }
+
+ /* Additional checks dependent on operation. */
+ if (negotiate) {
+ /* Cannot be true at the same time */
+ ASSERT(check == B_FALSE);
+
+ if (!OA_WRITE_OR_EXECUTE(optd, cr)) {
+ /* can't negotiate option */
+ if (!(OA_MATCHED_PRIV(optd, cr)) &&
+ OA_WX_ANYPRIV(optd)) {
+ /*
+ * not privileged but privilege
+ * will help negotiate option.
+ */
+ return (-TACCES);
+ } else {
+ return (-TBADOPT);
+ }
+ }
+ /*
+ * Verify size for options
+ * Note: For retaining compatibility with historical
+ * behavior, variable lengths options will have their
+ * length verified in the setfn() processing.
+ * In order to be compatible with SunOS 4.X we return
+ * EINVAL errors for bad lengths.
+ */
+ if (!(optd->opdes_props & OP_VARLEN)) {
+ /* fixed length - size must match */
+ if (len != optd->opdes_size) {
+ return (EINVAL);
+ }
+ }
+ } else {
+ if (check) {
+ if (!OA_RWX_ANYPRIV(optd))
+ /* any of "rwx" permission but not none */
+ return (-TBADOPT);
+ }
+ /*
+ * T_CHECK is historically used by getsockopt() with T_CURRENT
+ * semantics (T_CURRENT did not exist in TLI), so the read
+ * permission test below applies to both check and current requests.
+ */
+ if (!OA_READ_PERMISSION(optd, cr)) {
+ /* can't read option value */
+ if (!(OA_MATCHED_PRIV(optd, cr)) &&
+ OA_R_ANYPRIV(optd)) {
+ /*
+ * not privileged but privilege
+ * will help in reading option value.
+ */
+ return (-TACCES);
+ } else {
+ return (-TBADOPT);
+ }
+ }
+ }
+ if (max_len != NULL)
+ *max_len = optd->opdes_size;
+
+ /* We liked it. Keep going. */
+ return (0);
+}
diff --git a/usr/src/uts/common/inet/proto_set.h b/usr/src/uts/common/inet/proto_set.h
new file mode 100644
index 0000000000..8e714c7c05
--- /dev/null
+++ b/usr/src/uts/common/inet/proto_set.h
@@ -0,0 +1,58 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _INET_PROTO_SET_H
+#define _INET_PROTO_SET_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+#include <sys/socket_proto.h>
+#include <inet/optcom.h>
+#include <inet/ipclassifier.h>
+
+/*
+ * Property setters.  Judging by their names these adjust receive-side
+ * (rx) and transmit-side (tx) settings of a connection; all but
+ * proto_set_rx_oob_opt() take a queue_t and a conn_s and return boolean_t.
+ * NOTE(review): what the boolean_t result signifies is not stated in
+ * this header -- confirm against the implementation.
+ */
+extern boolean_t proto_set_rx_hiwat(queue_t *, struct conn_s *, size_t);
+extern boolean_t proto_set_rx_lowat(queue_t *, struct conn_s *, size_t);
+extern boolean_t proto_set_maxpsz(queue_t *, struct conn_s *, size_t);
+extern boolean_t proto_set_tx_maxblk(queue_t *, struct conn_s *,
+ ssize_t);
+extern boolean_t proto_set_tx_copyopt(queue_t *, struct conn_s *, int);
+extern boolean_t proto_set_tx_wroff(queue_t *, struct conn_s *, size_t);
+extern void proto_set_rx_oob_opt(struct conn_s *, boolean_t);
+
+/*
+ * Error-code and address helpers.
+ * NOTE(review): proto_tlitosyserr() presumably maps a TLI error number
+ * to a system errno, and proto_verify_ip_addr() presumably validates a
+ * sockaddr of the given length -- confirm both.
+ */
+extern int proto_tlitosyserr(int);
+extern int proto_verify_ip_addr(int, const struct sockaddr *, socklen_t);
+
+/*
+ * Option-descriptor helpers.  Both take an opdes_t pointer followed by a
+ * uint_t (presumably the option table and its entry count -- confirm).
+ * NOTE(review): proto_opt_check() takes three ints and three boolean_t
+ * in a row with no parameter names; check the definition for the order
+ * before calling.
+ */
+extern int proto_opt_check(int, int, int, t_uscalar_t *, opdes_t *,
+ uint_t, boolean_t, boolean_t, boolean_t, cred_t *);
+extern opdes_t *proto_opt_lookup(t_uscalar_t, t_uscalar_t, opdes_t *, uint_t);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _INET_PROTO_SET_H */
diff --git a/usr/src/uts/common/inet/rawip_impl.h b/usr/src/uts/common/inet/rawip_impl.h
index 638cea6c70..f818247b67 100644
--- a/usr/src/uts/common/inet/rawip_impl.h
+++ b/usr/src/uts/common/inet/rawip_impl.h
@@ -27,8 +27,6 @@
#ifndef _RAWIP_IMPL_H
#define _RAWIP_IMPL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -44,6 +42,7 @@ extern "C" {
#include <inet/common.h>
#include <inet/ip.h>
+#include <inet/optcom.h>
/* Named Dispatch Parameter Management Structure */
typedef struct icmpparam_s {
@@ -63,7 +62,9 @@ struct icmp_stack {
icmpparam_t *is_param_arr; /* ndd variable table */
kstat_t *is_ksp; /* kstats */
mib2_rawip_t is_rawip_mib; /* SNMP fixed size info */
+ ldi_ident_t is_ldi_ident;
};
+
typedef struct icmp_stack icmp_stack_t;
/* Internal icmp control structure, one per open stream */
@@ -76,7 +77,7 @@ typedef struct icmp_s {
uint_t icmp_state; /* TPI state */
in6_addr_t icmp_v6src; /* Source address of this stream */
in6_addr_t icmp_bound_v6src; /* Explicitely bound to address */
- in6_addr_t icmp_v6dst; /* Connected destination */
+ sin6_t icmp_v6dst; /* Connected destination */
/*
* IP format that packets transmitted from this struct should use.
* Value can be IP4_VERSION or IPV6_VERSION.
@@ -87,7 +88,6 @@ typedef struct icmp_s {
sa_family_t icmp_family; /* Family from socket() call */
/* Following protected by icmp_rwlock */
- uint32_t icmp_flowinfo; /* Connected flow id and tclass */
uint32_t icmp_max_hdr_len; /* For write offset in stream head */
uint_t icmp_proto;
uint_t icmp_ip_snd_options_len; /* Len of IPv4 options */
@@ -144,6 +144,15 @@ typedef struct icmp_s {
uint_t icmp_label_len_v6; /* sec. part of sticky opt */
in6_addr_t icmp_v6lastdst; /* most recent destination */
icmp_stack_t *icmp_is; /* Stack instance */
+ size_t icmp_xmit_hiwat;
+ size_t icmp_xmit_lowat;
+ size_t icmp_recv_hiwat;
+ size_t icmp_recv_lowat;
+ int icmp_delayed_error;
+ kmutex_t icmp_recv_lock;
+ mblk_t *icmp_fallback_queue_head;
+ mblk_t *icmp_fallback_queue_tail;
+ struct sockaddr_storage icmp_delayed_addr;
} icmp_t;
/*
@@ -155,10 +164,16 @@ extern optdb_obj_t icmp_opt_obj;
extern uint_t icmp_max_optsize;
extern mblk_t *icmp_snmp_get(queue_t *q, mblk_t *mpctl);
-extern void rawip_resume_bind(conn_t *, mblk_t *);
-extern void icmp_ddi_init(void);
-extern void icmp_ddi_destroy(void);
+extern void icmp_ddi_g_init(void);
+extern void icmp_ddi_g_destroy(void);
+
+extern sock_lower_handle_t rawip_create(int, int, int, sock_downcalls_t **,
+ uint_t *, int *, int, cred_t *);
+extern void rawip_fallback(sock_lower_handle_t, queue_t *, boolean_t,
+ so_proto_quiesced_cb_t);
+
+extern sock_downcalls_t sock_rawip_downcalls;
#endif /* _KERNEL */
diff --git a/usr/src/uts/common/inet/rts_impl.h b/usr/src/uts/common/inet/rts_impl.h
index f89d1ec82c..de7cd8970b 100644
--- a/usr/src/uts/common/inet/rts_impl.h
+++ b/usr/src/uts/common/inet/rts_impl.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -27,8 +27,6 @@
#ifndef _RTS_IMPL_H
#define _RTS_IMPL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -44,6 +42,7 @@ extern "C" {
#include <inet/common.h>
#include <inet/ip.h>
+#include <inet/optcom.h>
/* Named Dispatch Parameter Management Structure */
typedef struct rtsparam_s {
@@ -61,6 +60,8 @@ struct rts_stack {
caddr_t rtss_g_nd;
rtsparam_t *rtss_params;
+
+ ldi_ident_t rtss_ldi_ident;
};
typedef struct rts_stack rts_stack_t;
@@ -84,10 +85,25 @@ typedef struct rts_s {
/* Written to only once at the time of opening the endpoint */
conn_t *rts_connp;
+
+ /* Outbound flow control */
+ size_t rts_xmit_hiwat;
+ size_t rts_xmit_lowat;
+
+ /* Inbound flow control */
+ size_t rts_recv_hiwat;
+ size_t rts_recv_lowat;
+
+ kmutex_t rts_send_mutex;
+ kmutex_t rts_recv_mutex;
+ kcondvar_t rts_send_cv;
+ kcondvar_t rts_io_cv;
} rts_t;
#define RTS_WPUT_PENDING 0x1 /* Waiting for write-side to complete */
+#define RTS_REQ_PENDING 0x1 /* For direct sockets */
#define RTS_WRW_PENDING 0x2 /* Routing socket write in progress */
+#define RTS_REQ_INPROG 0x2 /* For direct sockets */
/*
* Object to represent database of options to search passed to
@@ -98,8 +114,19 @@ typedef struct rts_s {
extern optdb_obj_t rts_opt_obj;
extern uint_t rts_max_optsize;
-extern void rts_ddi_init(void);
-extern void rts_ddi_destroy(void);
+extern void rts_ddi_g_init(void);
+extern void rts_ddi_g_destroy(void);
+
+extern int rts_tpi_opt_get(queue_t *, t_scalar_t, t_scalar_t, uchar_t *);
+extern int rts_tpi_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *,
+ uint_t *, uchar_t *, void *, cred_t *, mblk_t *);
+extern int rts_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name,
+ uchar_t *ptr);
+
+extern sock_lower_handle_t rts_create(int, int, int, sock_downcalls_t **,
+ uint_t *, int *, int, cred_t *);
+
+extern sock_downcalls_t sock_rts_downcalls;
#endif /* _KERNEL */
diff --git a/usr/src/uts/common/inet/sctp/sctp.c b/usr/src/uts/common/inet/sctp/sctp.c
index f76612f04f..1dc96a687b 100644
--- a/usr/src/uts/common/inet/sctp/sctp.c
+++ b/usr/src/uts/common/inet/sctp/sctp.c
@@ -279,13 +279,13 @@ sctp_clean_death(sctp_t *sctp, int err)
if (sctp->sctp_xmit_head || sctp->sctp_xmit_unsent) {
sctp_regift_xmitlist(sctp);
}
- if (sctp->sctp_ulp_disconnected(sctp->sctp_ulpd, err)) {
+ if (sctp->sctp_ulp_disconnected(sctp->sctp_ulpd, 0, err)) {
/*
* Socket is gone, detach.
*/
sctp->sctp_detached = B_TRUE;
sctp->sctp_ulpd = NULL;
- bzero(&sctp->sctp_upcalls, sizeof (sctp_upcalls_t));
+ sctp->sctp_upcalls = NULL;
}
}
@@ -447,7 +447,7 @@ sctp_close(sctp_t *sctp)
RUN_SCTP(sctp);
sctp->sctp_detached = 1;
sctp->sctp_ulpd = NULL;
- bzero(&sctp->sctp_upcalls, sizeof (sctp_upcalls_t));
+ sctp->sctp_upcalls = NULL;
bzero(&sctp->sctp_events, sizeof (sctp->sctp_events));
/* If the graceful shutdown has not been completed, just return. */
@@ -1341,8 +1341,8 @@ sctp_icmp_error_ipv6(sctp_t *sctp, mblk_t *mp)
* If parent pointer is passed in, inherit settings from it.
*/
sctp_t *
-sctp_create(void *sctp_ulpd, sctp_t *parent, int family, int flags,
- const sctp_upcalls_t *sctp_upcalls, sctp_sockbuf_limits_t *sbl,
+sctp_create(void *ulpd, sctp_t *parent, int family, int flags,
+ sock_upcalls_t *upcalls, sctp_sockbuf_limits_t *sbl,
cred_t *credp)
{
sctp_t *sctp, *psctp;
@@ -1507,12 +1507,11 @@ sctp_create(void *sctp_ulpd, sctp_t *parent, int family, int flags,
sctp->sctp_adv_pap = sctp->sctp_lastack_rxd;
/* Information required by upper layer */
- if (sctp_ulpd != NULL) {
- sctp->sctp_ulpd = sctp_ulpd;
+ if (ulpd != NULL) {
+ sctp->sctp_ulpd = ulpd;
- ASSERT(sctp_upcalls != NULL);
- bcopy(sctp_upcalls, &sctp->sctp_upcalls,
- sizeof (sctp_upcalls_t));
+ ASSERT(upcalls != NULL);
+ sctp->sctp_upcalls = upcalls;
ASSERT(sbl != NULL);
/* Fill in the socket buffer limits for sctpsockfs */
sbl->sbl_txlowat = sctp->sctp_xmit_lowater;
@@ -1520,8 +1519,8 @@ sctp_create(void *sctp_ulpd, sctp_t *parent, int family, int flags,
sbl->sbl_rxbuf = sctp->sctp_rwnd;
sbl->sbl_rxlowat = SCTP_RECV_LOWATER;
}
- /* If no sctp_ulpd, must be creating the default sctp */
- ASSERT(sctp_ulpd != NULL || sctps->sctps_gsctp == NULL);
+ /* If no ulpd, must be creating the default sctp */
+ ASSERT(ulpd != NULL || sctps->sctps_gsctp == NULL);
/* Insert this in the global list. */
SCTP_LINK(sctp, sctps);
diff --git a/usr/src/uts/common/inet/sctp/sctp_bind.c b/usr/src/uts/common/inet/sctp/sctp_bind.c
index 2091d91ab5..dfb70fc202 100644
--- a/usr/src/uts/common/inet/sctp/sctp_bind.c
+++ b/usr/src/uts/common/inet/sctp/sctp_bind.c
@@ -24,8 +24,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/stream.h>
@@ -174,12 +172,16 @@ sctp_bind(sctp_t *sctp, struct sockaddr *sa, socklen_t len)
int err = 0;
ASSERT(sctp != NULL);
- ASSERT(sa);
RUN_SCTP(sctp);
- if (sctp->sctp_state > SCTPS_BOUND ||
- (sctp->sctp_connp->conn_state_flags & CONN_CLOSING)) {
+ if ((sctp->sctp_state >= SCTPS_BOUND) ||
+ (sctp->sctp_connp->conn_state_flags & CONN_CLOSING) ||
+ (sa == NULL || len == 0)) {
+ /*
+ * Multiple binds not allowed for any SCTP socket
+ * Also binding with null address is not supported.
+ */
err = EINVAL;
goto done;
}
diff --git a/usr/src/uts/common/inet/sctp/sctp_common.c b/usr/src/uts/common/inet/sctp/sctp_common.c
index 548a326806..10aff2af34 100644
--- a/usr/src/uts/common/inet/sctp/sctp_common.c
+++ b/usr/src/uts/common/inet/sctp/sctp_common.c
@@ -398,6 +398,8 @@ void
sctp_set_ulp_prop(sctp_t *sctp)
{
int hdrlen;
+ struct sock_proto_props sopp;
+
sctp_stack_t *sctps = sctp->sctp_sctps;
if (sctp->sctp_current->isv4) {
@@ -408,9 +410,12 @@ sctp_set_ulp_prop(sctp_t *sctp)
ASSERT(sctp->sctp_ulpd);
ASSERT(sctp->sctp_current->sfa_pmss == sctp->sctp_mss);
- sctp->sctp_ulp_prop(sctp->sctp_ulpd,
- sctps->sctps_wroff_xtra + hdrlen + sizeof (sctp_data_hdr_t),
- sctp->sctp_mss - sizeof (sctp_data_hdr_t));
+ bzero(&sopp, sizeof (sopp));
+ sopp.sopp_flags = SOCKOPT_MAXBLK|SOCKOPT_WROFF;
+ sopp.sopp_wroff = sctps->sctps_wroff_xtra + hdrlen +
+ sizeof (sctp_data_hdr_t);
+ sopp.sopp_maxblk = sctp->sctp_mss - sizeof (sctp_data_hdr_t);
+ sctp->sctp_ulp_prop(sctp->sctp_ulpd, &sopp);
}
void
diff --git a/usr/src/uts/common/inet/sctp/sctp_conn.c b/usr/src/uts/common/inet/sctp/sctp_conn.c
index 716abc13bc..b4a9b56fdd 100644
--- a/usr/src/uts/common/inet/sctp/sctp_conn.c
+++ b/usr/src/uts/common/inet/sctp/sctp_conn.c
@@ -20,12 +20,10 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/stream.h>
@@ -152,8 +150,11 @@ sctp_accept_comm(sctp_t *listener, sctp_t *acceptor, mblk_t *cr_pkt,
acceptor->sctp_rwnd = listener->sctp_rwnd;
acceptor->sctp_irwnd = acceptor->sctp_rwnd;
acceptor->sctp_pd_point = acceptor->sctp_rwnd;
+ acceptor->sctp_upcalls = listener->sctp_upcalls;
+#if 0
bcopy(&listener->sctp_upcalls, &acceptor->sctp_upcalls,
sizeof (sctp_upcalls_t));
+#endif
return (0);
}
@@ -169,6 +170,7 @@ sctp_conn_request(sctp_t *sctp, mblk_t *mp, uint_t ifindex, uint_t ip_hdr_len,
int err;
conn_t *connp, *econnp;
sctp_stack_t *sctps;
+ struct sock_proto_props sopp;
/*
* No need to check for duplicate as this is the listener
@@ -292,22 +294,25 @@ sctp_conn_request(sctp_t *sctp, mblk_t *mp, uint_t ifindex, uint_t ip_hdr_len,
/* Connection established, so send up the conn_ind */
if ((eager->sctp_ulpd = sctp->sctp_ulp_newconn(sctp->sctp_ulpd,
- eager)) == NULL) {
+ (sock_lower_handle_t)eager, NULL, NULL, 0,
+ &eager->sctp_upcalls)) == NULL) {
sctp_close_eager(eager);
BUMP_MIB(&sctps->sctps_mib, sctpListenDrop);
return (NULL);
}
ASSERT(SCTP_IS_DETACHED(eager));
eager->sctp_detached = B_FALSE;
+ bzero(&sopp, sizeof (sopp));
+ sopp.sopp_flags = SOCKOPT_MAXBLK|SOCKOPT_WROFF;
+ sopp.sopp_maxblk = strmsgsz;
if (eager->sctp_family == AF_INET) {
- eager->sctp_ulp_prop(eager->sctp_ulpd,
- sctps->sctps_wroff_xtra + sizeof (sctp_data_hdr_t) +
- sctp->sctp_hdr_len, strmsgsz);
+ sopp.sopp_wroff = sctps->sctps_wroff_xtra +
+ sizeof (sctp_data_hdr_t) + sctp->sctp_hdr_len;
} else {
- eager->sctp_ulp_prop(eager->sctp_ulpd,
- sctps->sctps_wroff_xtra + sizeof (sctp_data_hdr_t) +
- sctp->sctp_hdr6_len, strmsgsz);
+ sopp.sopp_wroff = sctps->sctps_wroff_xtra +
+ sizeof (sctp_data_hdr_t) + sctp->sctp_hdr6_len;
}
+ eager->sctp_ulp_prop(eager->sctp_ulpd, &sopp);
return (eager);
}
@@ -333,6 +338,7 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen)
int err;
sctp_faddr_t *cur_fp;
sctp_stack_t *sctps = sctp->sctp_sctps;
+ struct sock_proto_props sopp;
/*
* Determine packet type based on type of address passed in
@@ -599,9 +605,11 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen)
BUMP_LOCAL(sctp->sctp_opkts);
notify_ulp:
- sctp->sctp_ulp_prop(sctp->sctp_ulpd,
- sctps->sctps_wroff_xtra + hdrlen + sizeof (sctp_data_hdr_t),
- 0);
+ bzero(&sopp, sizeof (sopp));
+ sopp.sopp_flags = SOCKOPT_WROFF;
+ sopp.sopp_wroff = sctps->sctps_wroff_xtra + hdrlen +
+ sizeof (sctp_data_hdr_t);
+ sctp->sctp_ulp_prop(sctp->sctp_ulpd, &sopp);
return (0);
default:
diff --git a/usr/src/uts/common/inet/sctp/sctp_cookie.c b/usr/src/uts/common/inet/sctp/sctp_cookie.c
index 93184bcd27..e089a901d3 100644
--- a/usr/src/uts/common/inet/sctp/sctp_cookie.c
+++ b/usr/src/uts/common/inet/sctp/sctp_cookie.c
@@ -1049,10 +1049,8 @@ sctp_send_cookie_echo(sctp_t *sctp, sctp_chunk_hdr_t *iackch, mblk_t *iackmp)
* unsent, since there won't be any sent-unack'ed
* here.
*/
- if (!SCTP_IS_DETACHED(sctp)) {
- sctp->sctp_ulp_xmitted(sctp->sctp_ulpd,
- sctp->sctp_unsent);
- }
+ if (!SCTP_IS_DETACHED(sctp))
+ SCTP_TXQ_UPDATE(sctp);
}
if (sctp->sctp_xmit_unsent == NULL)
sctp->sctp_xmit_unsent_tail = NULL;
diff --git a/usr/src/uts/common/inet/sctp/sctp_impl.h b/usr/src/uts/common/inet/sctp/sctp_impl.h
index 5f41226bf3..089edc3835 100644
--- a/usr/src/uts/common/inet/sctp/sctp_impl.h
+++ b/usr/src/uts/common/inet/sctp/sctp_impl.h
@@ -608,16 +608,16 @@ typedef struct sctp_s {
kcondvar_t sctp_cv;
boolean_t sctp_running;
- void *sctp_ulpd; /* SCTP upper layer desc. */
+#define sctp_ulpd sctp_connp->conn_upper_handle
+#define sctp_upcalls sctp_connp->conn_upcalls
- struct sctp_upcalls_s sctp_upcalls; /* upcalls for sctp_ulpd */
-#define sctp_ulp_newconn sctp_upcalls.su_newconn
-#define sctp_ulp_connected sctp_upcalls.su_connected
-#define sctp_ulp_disconnected sctp_upcalls.su_disconnected
-#define sctp_ulp_disconnecting sctp_upcalls.su_disconnecting
-#define sctp_ulp_recv sctp_upcalls.su_recv
-#define sctp_ulp_xmitted sctp_upcalls.su_xmitted
-#define sctp_ulp_prop sctp_upcalls.su_properties
+#define sctp_ulp_newconn sctp_upcalls->su_newconn
+#define sctp_ulp_connected sctp_upcalls->su_connected
+#define sctp_ulp_disconnected sctp_upcalls->su_disconnected
+#define sctp_ulp_opctl sctp_upcalls->su_opctl
+#define sctp_ulp_recv sctp_upcalls->su_recv
+#define sctp_ulp_xmitted sctp_upcalls->su_txq_full
+#define sctp_ulp_prop sctp_upcalls->su_set_proto_props
int32_t sctp_state;
@@ -768,8 +768,9 @@ typedef struct sctp_s {
sctp_rexmitting : 1, /* SCTP is retransmitting */
sctp_zero_win_probe : 1, /* doing zero win probe */
+ sctp_txq_full : 1, /* the tx queue is full */
sctp_ulp_discon_done : 1, /* ulp_disconnecting done */
- sctp_dummy : 7;
+ sctp_dummy : 6;
} sctp_bits;
struct {
uint32_t
@@ -809,6 +810,7 @@ typedef struct sctp_s {
#define sctp_linklocal sctp_bits.sctp_linklocal
#define sctp_rexmitting sctp_bits.sctp_rexmitting
#define sctp_zero_win_probe sctp_bits.sctp_zero_win_probe
+#define sctp_txq_full sctp_bits.sctp_txq_full
#define sctp_ulp_discon_done sctp_bits.sctp_ulp_discon_done
#define sctp_recvsndrcvinfo sctp_events.sctp_recvsndrcvinfo
@@ -935,6 +937,15 @@ typedef struct sctp_s {
uint32_t sctp_err_len; /* Total error chunks length */
} sctp_t;
+/*
+ * Amount of data queued for transmission: unsent plus sent-but-unacked.
+ */
+#define SCTP_TXQ_LEN(sctp) ((sctp)->sctp_unsent + (sctp)->sctp_unacked)
+
+/*
+ * If the tx queue was flagged full and the queued data has dropped to
+ * sctp_xmit_lowater or below, clear the flag and notify the upper layer
+ * (sctp_ulp_xmitted with B_FALSE) that the queue is no longer full.
+ *
+ * The body is wrapped in do { } while (0) so that the macro expands to
+ * exactly one statement: a bare "if" here would capture the "else" of an
+ * un-braced if/else at the call site.  Invoke it with a trailing
+ * semicolon.  The argument is evaluated more than once, so it must not
+ * have side effects.  The macro does not test SCTP_IS_DETACHED().
+ */
+#define SCTP_TXQ_UPDATE(sctp) \
+ do { \
+ if ((sctp)->sctp_txq_full && SCTP_TXQ_LEN(sctp) <= \
+ (sctp)->sctp_xmit_lowater) { \
+ (sctp)->sctp_txq_full = 0; \
+ (sctp)->sctp_ulp_xmitted((sctp)->sctp_ulpd, \
+ B_FALSE); \
+ } \
+ } while (0)
+
#endif /* (defined(_KERNEL) || defined(_KMEMUSER)) */
extern void sctp_ack_timer(sctp_t *);
diff --git a/usr/src/uts/common/inet/sctp/sctp_input.c b/usr/src/uts/common/inet/sctp/sctp_input.c
index 71a85ad04e..87c79eedff 100644
--- a/usr/src/uts/common/inet/sctp/sctp_input.c
+++ b/usr/src/uts/common/inet/sctp/sctp_input.c
@@ -24,8 +24,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/stream.h>
@@ -1192,6 +1190,7 @@ sctp_data_chunk(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *mp, mblk_t **dups,
boolean_t tpfinished = B_TRUE;
int32_t new_rwnd;
sctp_stack_t *sctps = sctp->sctp_sctps;
+ int error;
/* The following are used multiple times, so we inline them */
#define SCTP_ACK_IT(sctp, tsn) \
@@ -1292,8 +1291,8 @@ sctp_data_chunk(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *mp, mblk_t **dups,
oftsn = sctp->sctp_ftsn;
if (isfrag) {
- int error = 0;
+ error = 0;
/* fragmented data chunk */
dmp->b_rptr = (uchar_t *)dc;
if (ubit) {
@@ -1408,13 +1407,18 @@ sctp_data_chunk(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *mp, mblk_t **dups,
sctp->sctp_rxqueued -= dlen;
if (can_deliver) {
+
dmp->b_rptr = (uchar_t *)(dc + 1);
if (sctp_input_add_ancillary(sctp, &dmp, dc, fp, ipp) == 0) {
dprint(1, ("sctp_data_chunk: delivering %lu bytes\n",
msgdsize(dmp)));
sctp->sctp_rwnd -= dlen;
+ /*
+ * Override b_flag for SCTP sockfs internal use
+ */
+ dmp->b_flag = tpfinished ? 0 : SCTP_PARTIAL_DATA;
new_rwnd = sctp->sctp_ulp_recv(sctp->sctp_ulpd, dmp,
- tpfinished ? 0 : SCTP_PARTIAL_DATA);
+ msgdsize(dmp), 0, &error, NULL);
if (new_rwnd > sctp->sctp_rwnd) {
sctp->sctp_rwnd = new_rwnd;
}
@@ -1492,8 +1496,13 @@ sctp_data_chunk(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *mp, mblk_t **dups,
dprint(1, ("sctp_data_chunk: delivering %lu "
"bytes\n", msgdsize(dmp)));
sctp->sctp_rwnd -= dlen;
+ /*
+ * Override b_flag for SCTP sockfs internal use
+ */
+ dmp->b_flag = tpfinished ?
+ 0 : SCTP_PARTIAL_DATA;
new_rwnd = sctp->sctp_ulp_recv(sctp->sctp_ulpd,
- dmp, tpfinished ? 0 : SCTP_PARTIAL_DATA);
+ dmp, msgdsize(dmp), 0, &error, NULL);
if (new_rwnd > sctp->sctp_rwnd) {
sctp->sctp_rwnd = new_rwnd;
}
@@ -1806,10 +1815,8 @@ sctp_check_abandoned_msg(sctp_t *sctp, mblk_t *meta)
* Update ULP the amount of queued data, which is
* sent-unack'ed + unsent.
*/
- if (!SCTP_IS_DETACHED(sctp)) {
- sctp->sctp_ulp_xmitted(sctp->sctp_ulpd,
- sctp->sctp_unacked + sctp->sctp_unsent);
- }
+ if (!SCTP_IS_DETACHED(sctp))
+ SCTP_TXQ_UPDATE(sctp);
return (0);
}
return (-1);
@@ -1922,10 +1929,8 @@ cum_ack_done:
* Update ULP the amount of queued data, which is
* sent-unack'ed + unsent.
*/
- if (!SCTP_IS_DETACHED(sctp)) {
- sctp->sctp_ulp_xmitted(sctp->sctp_ulpd,
- sctp->sctp_unacked + sctp->sctp_unsent);
- }
+ if (!SCTP_IS_DETACHED(sctp))
+ SCTP_TXQ_UPDATE(sctp);
/* Time to send a shutdown? */
if (sctp->sctp_state == SCTPS_SHUTDOWN_PENDING) {
@@ -2141,6 +2146,7 @@ sctp_process_forward_tsn(sctp_t *sctp, sctp_chunk_hdr_t *ch, sctp_faddr_t *fp,
}
if (can_deliver) {
int32_t nrwnd;
+ int error;
dmp->b_rptr = (uchar_t *)(dc + 1);
dmp->b_next = NULL;
@@ -2149,8 +2155,15 @@ sctp_process_forward_tsn(sctp_t *sctp, sctp_chunk_hdr_t *ch, sctp_faddr_t *fp,
&dmp, dc, fp, ipp) == 0) {
sctp->sctp_rxqueued -= dlen;
sctp->sctp_rwnd -= dlen;
+ /*
+ * Override b_flag for SCTP sockfs
+ * internal use
+ */
+
+ dmp->b_flag = 0;
nrwnd = sctp->sctp_ulp_recv(
- sctp->sctp_ulpd, dmp, 0);
+ sctp->sctp_ulpd, dmp, msgdsize(dmp),
+ 0, &error, NULL);
if (nrwnd > sctp->sctp_rwnd)
sctp->sctp_rwnd = nrwnd;
} else {
@@ -3947,7 +3960,7 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp)
sctp_stop_faddr_timers(sctp);
if (!SCTP_IS_DETACHED(sctp)) {
sctp->sctp_ulp_connected(
- sctp->sctp_ulpd);
+ sctp->sctp_ulpd, 0, NULL, -1);
sctp_set_ulp_prop(sctp);
}
sctp->sctp_state = SCTPS_ESTABLISHED;
@@ -3983,7 +3996,7 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp)
case CHUNK_COOKIE_ACK:
if (!SCTP_IS_DETACHED(sctp)) {
sctp->sctp_ulp_connected(
- sctp->sctp_ulpd);
+ sctp->sctp_ulpd, 0, NULL, -1);
sctp_set_ulp_prop(sctp);
}
if (sctp->sctp_unacked == 0)
@@ -4020,7 +4033,7 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp)
if (!SCTP_IS_DETACHED(sctp)) {
sctp->sctp_ulp_connected(
- sctp->sctp_ulpd);
+ sctp->sctp_ulpd, 0, NULL, -1);
sctp_set_ulp_prop(sctp);
}
if (sctp->sctp_unacked == 0)
diff --git a/usr/src/uts/common/inet/sctp/sctp_notify.c b/usr/src/uts/common/inet/sctp/sctp_notify.c
index f516154ce6..3ede878954 100644
--- a/usr/src/uts/common/inet/sctp/sctp_notify.c
+++ b/usr/src/uts/common/inet/sctp/sctp_notify.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/stream.h>
@@ -39,9 +37,12 @@
#include <netinet/sctp.h>
#include <inet/common.h>
+#include <inet/ipclassifier.h>
#include <inet/ip.h>
+
#include "sctp_impl.h"
+/* ARGSUSED */
static void
sctp_notify(sctp_t *sctp, mblk_t *emp, size_t len)
{
@@ -49,6 +50,7 @@ sctp_notify(sctp_t *sctp, mblk_t *emp, size_t len)
mblk_t *mp;
sctp_faddr_t *fp;
int32_t rwnd = 0;
+ int error;
if ((mp = allocb(sizeof (*tudi) + sizeof (void *) +
sizeof (struct sockaddr_in6), BPRI_HI)) == NULL) {
@@ -108,7 +110,13 @@ sctp_notify(sctp_t *sctp, mblk_t *emp, size_t len)
ASSERT(len == rwnd);
#endif
- rwnd = sctp->sctp_ulp_recv(sctp->sctp_ulpd, mp, SCTP_NOTIFICATION);
+ /*
+ * Override b_flag for SCTP sockfs internal use
+ */
+ mp->b_flag = (short)SCTP_NOTIFICATION;
+
+ rwnd = sctp->sctp_ulp_recv(sctp->sctp_ulpd, mp, msgdsize(mp), 0,
+ &error, NULL);
if (rwnd > sctp->sctp_rwnd) {
sctp->sctp_rwnd = rwnd;
}
diff --git a/usr/src/uts/common/inet/sctp/sctp_opt_data.c b/usr/src/uts/common/inet/sctp/sctp_opt_data.c
index c24c81c01f..b3921cf6ad 100644
--- a/usr/src/uts/common/inet/sctp/sctp_opt_data.c
+++ b/usr/src/uts/common/inet/sctp/sctp_opt_data.c
@@ -20,12 +20,10 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/stream.h>
#define _SUN_TPI_VERSION 2
@@ -1386,8 +1384,11 @@ sctp_set_opt(sctp_t *sctp, int level, int name, const void *invalp,
}
us = (struct sctp_uc_swap *)invalp;
sctp->sctp_ulpd = us->sus_handle;
+ sctp->sctp_upcalls = us->sus_upcalls;
+#if 0
bcopy(us->sus_upcalls, &sctp->sctp_upcalls,
sizeof (sctp_upcalls_t));
+#endif
break;
}
case SCTP_PRSCTP:
diff --git a/usr/src/uts/common/inet/sctp/sctp_output.c b/usr/src/uts/common/inet/sctp/sctp_output.c
index 8065f1dcf1..938573b1be 100644
--- a/usr/src/uts/common/inet/sctp/sctp_output.c
+++ b/usr/src/uts/common/inet/sctp/sctp_output.c
@@ -288,6 +288,13 @@ sctp_sendmsg(sctp_t *sctp, mblk_t *mp, int flags)
}
sctp->sctp_unsent += msg_len;
BUMP_LOCAL(sctp->sctp_msgcount);
+ /*
+ * Notify sockfs if the tx queue is full.
+ */
+ if (SCTP_TXQ_LEN(sctp) >= sctp->sctp_xmit_hiwater) {
+ sctp->sctp_txq_full = 1;
+ sctp->sctp_ulp_xmitted(sctp->sctp_ulpd, B_TRUE);
+ }
if (sctp->sctp_state == SCTPS_ESTABLISHED)
sctp_output(sctp, UINT_MAX);
process_sendq:
@@ -366,10 +373,8 @@ nextmsg:
* Update ULP the amount of queued data, which is
* sent-unack'ed + unsent.
*/
- if (!SCTP_IS_DETACHED(sctp)) {
- sctp->sctp_ulp_xmitted(sctp->sctp_ulpd,
- sctp->sctp_unacked + sctp->sctp_unsent);
- }
+ if (!SCTP_IS_DETACHED(sctp))
+ SCTP_TXQ_UPDATE(sctp);
sctp_sendfail_event(sctp, mdblk, 0, B_FALSE);
goto try_next;
}
@@ -875,10 +880,8 @@ chunkified:
* Update ULP the amount of queued data, which is
* sent-unack'ed + unsent.
*/
- if (!SCTP_IS_DETACHED(sctp)) {
- sctp->sctp_ulp_xmitted(sctp->sctp_ulpd,
- sctp->sctp_unacked + sctp->sctp_unsent);
- }
+ if (!SCTP_IS_DETACHED(sctp))
+ SCTP_TXQ_UPDATE(sctp);
sctp_sendfail_event(sctp, meta, 0, B_TRUE);
next_msg:
meta = tmp_meta;
@@ -1541,10 +1544,8 @@ ftsn_done:
* Update ULP the amount of queued data, which is
* sent-unack'ed + unsent.
*/
- if (!SCTP_IS_DETACHED(sctp)) {
- sctp->sctp_ulp_xmitted(sctp->sctp_ulpd,
- sctp->sctp_unacked + sctp->sctp_unsent);
- }
+ if (!SCTP_IS_DETACHED(sctp))
+ SCTP_TXQ_UPDATE(sctp);
}
}
diff --git a/usr/src/uts/common/inet/sctp/sctp_shutdown.c b/usr/src/uts/common/inet/sctp/sctp_shutdown.c
index e8311a018f..b58016eb15 100644
--- a/usr/src/uts/common/inet/sctp/sctp_shutdown.c
+++ b/usr/src/uts/common/inet/sctp/sctp_shutdown.c
@@ -20,12 +20,10 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/stream.h>
@@ -165,7 +163,7 @@ sctp_shutdown_received(sctp_t *sctp, sctp_chunk_hdr_t *sch, boolean_t crwsd,
/* Don't allow sending new data */
if (!SCTP_IS_DETACHED(sctp) && !sctp->sctp_ulp_discon_done) {
- sctp->sctp_ulp_disconnecting(sctp->sctp_ulpd);
+ sctp->sctp_ulp_opctl(sctp->sctp_ulpd, SOCK_OPCTL_SHUT_SEND, 0);
sctp->sctp_ulp_discon_done = B_TRUE;
}
diff --git a/usr/src/uts/common/inet/sctp_itf.h b/usr/src/uts/common/inet/sctp_itf.h
index 4a94cab233..eb7597ac0a 100644
--- a/usr/src/uts/common/inet/sctp_itf.h
+++ b/usr/src/uts/common/inet/sctp_itf.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _INET_SCTP_ITF_H
#define _INET_SCTP_ITF_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -54,21 +51,6 @@ extern "C" {
#define SCTP_ITF_VER 1
/*
- * This struct holds all the upcalls the SCTP kernel module will
- * invoke for different events. When calling sctp_create() to create
- * a SCTP handle, the caller must provide this information.
- */
-typedef struct sctp_upcalls_s {
- void * (*su_newconn)(void *parenthandle, void *connind);
- void (*su_connected)(void *handle);
- int (*su_disconnected)(void *handle, int error);
- void (*su_disconnecting)(void *handle);
- int (*su_recv)(void *handle, mblk_t *mp, int flags);
- void (*su_xmitted)(void *handle, int txqueued);
- void (*su_properties)(void *handle, int wroff, size_t maxblk);
-} sctp_upcalls_t;
-
-/*
* This struct holds various flow control limits the caller of
* sctp_create() should observe when interacting with SCTP.
*/
@@ -82,9 +64,10 @@ typedef struct sctp_sockbuf_limits_s {
/*
* Parameter to SCTP_UC_SWAP setsockopt
*/
+struct sock_upcalls_s;
struct sctp_uc_swap {
- void *sus_handle;
- sctp_upcalls_t *sus_upcalls;
+ void *sus_handle;
+ struct sock_upcalls_s *sus_upcalls;
};
struct sctp_s;
@@ -102,7 +85,7 @@ extern void sctp_close(struct sctp_s *conn);
extern int sctp_connect(struct sctp_s *conn, const struct sockaddr *dst,
socklen_t addrlen);
extern struct sctp_s *sctp_create(void *newhandle, struct sctp_s *parent,
- int family, int flags, const sctp_upcalls_t *su,
+ int family, int flags, struct sock_upcalls_s *su,
sctp_sockbuf_limits_t *sbl, cred_t *cr);
extern int sctp_disconnect(struct sctp_s *conn);
extern int sctp_get_opt(struct sctp_s *conn, int level, int opt, void *opts,
diff --git a/usr/src/uts/common/inet/sockmods/sockmod_sctp.c b/usr/src/uts/common/inet/sockmods/sockmod_sctp.c
new file mode 100644
index 0000000000..2600cfa181
--- /dev/null
+++ b/usr/src/uts/common/inet/sockmods/sockmod_sctp.c
@@ -0,0 +1,221 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/sysmacros.h>
+#include <sys/strsubr.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/modctl.h>
+#include <sys/cmn_err.h>
+#include <netinet/sctp.h>
+#include <fs/sockfs/sockcommon.h>
+#include "socksctp.h"
+
+struct sonode *socksctp_create(struct sockparams *, int, int, int,
+ int, int, int *, cred_t *);
+void socksctp_destroy(struct sonode *);
+
+static int socksctp_constructor(void *, void *, int);
+static void socksctp_destructor(void *, void *);
+
+static __smod_priv_t sosctp_priv = {
+ socksctp_create,
+ socksctp_destroy,
+ NULL
+};
+
+static smod_reg_t sinfo = {
+ SOCKMOD_VERSION,
+ "socksctp",
+ SOCK_UC_VERSION,
+ SOCK_DC_VERSION,
+ NULL,
+ &sosctp_priv
+};
+
+kmem_cache_t *sosctp_assoccache;
+static kmem_cache_t *sosctp_sockcache;
+
+/*
+ * Module linkage information for the kernel.
+ */
+static struct modlsockmod modlsockmod = {
+ &mod_sockmodops, "SCTP socket module", &sinfo
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1,
+ &modlsockmod,
+ NULL
+};
+
+/*
+ * Create the two kmem caches used by this module: "sctpsock" for
+ * struct sctp_sonode objects (with socksctp_constructor/destructor) and
+ * "sctp_assoc" for struct sctp_soassoc objects (no constructor).
+ * Always returns 0.
+ */
+static int
+socksctp_init(void)
+{
+ sosctp_sockcache = kmem_cache_create("sctpsock",
+ sizeof (struct sctp_sonode), 0, socksctp_constructor,
+ socksctp_destructor, NULL, NULL, NULL, 0);
+ sosctp_assoccache = kmem_cache_create("sctp_assoc",
+ sizeof (struct sctp_soassoc), 0, NULL, NULL, NULL, NULL, NULL, 0);
+ return (0);
+}
+
+/*
+ * Destroy both kmem caches created by socksctp_init().
+ */
+static void
+socksctp_fini(void)
+{
+ kmem_cache_destroy(sosctp_sockcache);
+ kmem_cache_destroy(sosctp_assoccache);
+}
+
+/*
+ * kmem cache constructor for sctp_sonode objects: tags the object as
+ * SOSCTP_SOCKET and then runs sonode_constructor() on the embedded
+ * sonode, returning its result.
+ */
+/*ARGSUSED*/
+static int
+socksctp_constructor(void *buf, void *cdrarg, int kmflags)
+{
+	struct sctp_sonode *sctp_so = buf;
+
+	sctp_so->ss_type = SOSCTP_SOCKET;
+
+	return (sonode_constructor(&sctp_so->ss_so, cdrarg, kmflags));
+}
+
+/*
+ * kmem cache destructor for sctp_sonode objects: hands the embedded
+ * sonode to sonode_destructor().
+ */
+/*ARGSUSED*/
+static void
+socksctp_destructor(void *buf, void *cdrarg)
+{
+	struct sctp_sonode *sctp_so = buf;
+
+	sonode_destructor(&sctp_so->ss_so, cdrarg);
+}
+
+/*
+ * Allocate and initialise the sonode for a new SCTP socket.
+ *
+ * Only AF_INET/AF_INET6 sockets of type SOCK_STREAM or SOCK_SEQPACKET are
+ * accepted, and a SOV_STREAM version request is refused. On failure NULL
+ * is returned and *errorp is set to EINVAL or ENOMEM; *errorp is left
+ * untouched on success.
+ */
+/* ARGSUSED */
+struct sonode *
+socksctp_create(struct sockparams *sp, int family, int type, int protocol,
+    int version, int sflags, int *errorp, cred_t *cr)
+{
+	struct sctp_sonode *sctp_so;
+	struct sonode *so;
+	int kmflags;
+
+	kmflags = (sflags & SOCKET_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
+
+	if (version == SOV_STREAM) {
+		*errorp = EINVAL;
+		return (NULL);
+	}
+
+	/* Reject every family/type combination this module does not serve. */
+	if ((family != AF_INET && family != AF_INET6) ||
+	    (type != SOCK_STREAM && type != SOCK_SEQPACKET)) {
+		*errorp = EINVAL;
+		return (NULL);
+	}
+
+	sctp_so = kmem_cache_alloc(sosctp_sockcache, kmflags);
+	if (sctp_so == NULL) {
+		*errorp = ENOMEM;
+		return (NULL);
+	}
+	so = &sctp_so->ss_so;
+
+	/* A fresh socket starts without any associations. */
+	sctp_so->ss_assocs = NULL;
+	sctp_so->ss_assoccnt = 0;
+	sctp_so->ss_maxassoc = 0;
+
+	if (type != SOCK_STREAM) {
+		sonode_init(so, sp, family, type, protocol,
+		    &sosctp_seq_sonodeops);
+		ASSERT(type == SOCK_SEQPACKET);
+		/* The result of the initial grow is deliberately ignored. */
+		mutex_enter(&so->so_lock);
+		(void) sosctp_aid_grow(sctp_so, 1, kmflags);
+		mutex_exit(&so->so_lock);
+	} else {
+		sonode_init(so, sp, family, type, protocol,
+		    &sosctp_sonodeops);
+	}
+
+	/* SOV_DEFAULT is resolved to the system-wide default version. */
+	so->so_version = (short)((version == SOV_DEFAULT) ?
+	    so_default_version : version);
+
+	dprint(2, ("sosctp_create: %p domain %d type %d\n", (void *)so, family,
+	    type));
+
+	return (so);
+}
+
+/*
+ * Tear down an SCTP socket: run sosctp_fini() with the caller's
+ * credentials, then return the enclosing sctp_sonode to its kmem cache.
+ */
+void
+socksctp_destroy(struct sonode *so)
+{
+	ASSERT((so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) &&
+	    so->so_protocol == IPPROTO_SCTP);
+
+	sosctp_fini(so, CRED());
+
+	kmem_cache_free(sosctp_sockcache, SOTOSSO(so));
+}
+
+/*
+ * Module load entry point: create the kmem caches, then register the
+ * module. The caches are destroyed again if mod_install() fails.
+ */
+int
+_init(void)
+{
+	int rc;
+
+	(void) socksctp_init();
+
+	rc = mod_install(&modlinkage);
+	if (rc != 0)
+		socksctp_fini();
+
+	return (rc);
+}
+
+/*
+ * Module unload entry point: the kmem caches are destroyed only once
+ * mod_remove() has succeeded.
+ */
+int
+_fini(void)
+{
+	int rc;
+
+	rc = mod_remove(&modlinkage);
+	if (rc == 0)
+		socksctp_fini();
+
+	return (rc);
+}
+
+/*
+ * Module information entry point; forwards to mod_info().
+ */
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
diff --git a/usr/src/uts/common/inet/sockmods/sockmod_sdp.c b/usr/src/uts/common/inet/sockmods/sockmod_sdp.c
new file mode 100644
index 0000000000..f609cbe069
--- /dev/null
+++ b/usr/src/uts/common/inet/sockmods/sockmod_sdp.c
@@ -0,0 +1,154 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/sysmacros.h>
+#include <sys/strsubr.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/modctl.h>
+#include <sys/cmn_err.h>
+#include <sys/tihdr.h>
+#include <sys/vfs.h>
+#include <fs/sockfs/nl7c.h>
+#include <inet/kssl/ksslapi.h>
+#include <inet/sdp_itf.h>
+#include <fs/sockfs/sockcommon.h>
+#include "socksdp.h"
+
+struct sonode *socksdp_create(struct sockparams *, int, int, int,
+ int, int, int *, cred_t *);
+static void socksdp_destroy(struct sonode *);
+
+static __smod_priv_t sosdp_priv = {
+ socksdp_create,
+ socksdp_destroy,
+ NULL
+};
+
+static smod_reg_t sinfo = {
+ SOCKMOD_VERSION,
+ "socksdp",
+ SOCK_UC_VERSION,
+ SOCK_DC_VERSION,
+ NULL,
+ &sosdp_priv
+};
+
+/*
+ * Module linkage information for the kernel
+ */
+static struct modlsockmod modlsockmod = {
+ &mod_sockmodops, "SDP socket module", &sinfo
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1,
+ &modlsockmod,
+ NULL
+};
+
+/*
+ * Allocate and initialise the sonode for a new SDP socket.
+ *
+ * *errorp is cleared on entry. The request is refused with EOPNOTSUPP on
+ * a labeled system, and with EINVAL for SOV_STREAM or for anything other
+ * than an AF_INET/AF_INET6 SOCK_STREAM socket. ENOMEM is reported when
+ * the sonode cannot be allocated. NULL is returned on every failure.
+ */
+/* ARGSUSED */
+struct sonode *
+socksdp_create(struct sockparams *sp, int family, int type, int protocol,
+    int version, int sflags, int *errorp, cred_t *cr)
+{
+	struct sonode *node;
+	int kmflags;
+
+	kmflags = (sflags & SOCKET_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
+
+	dprint(4, ("Inside sosdp_create: domain:%d proto:%d type:%d",
+	    family, protocol, type));
+
+	*errorp = 0;
+
+	if (is_system_labeled()) {
+		*errorp = EOPNOTSUPP;
+		return (NULL);
+	}
+
+	if (version == SOV_STREAM) {
+		*errorp = EINVAL;
+		return (NULL);
+	}
+
+	/* Only stream sockets over IPv4/IPv6 are served by this module. */
+	if ((family != AF_INET && family != AF_INET6) ||
+	    type != SOCK_STREAM) {
+		*errorp = EINVAL;
+		return (NULL);
+	}
+
+	node = kmem_cache_alloc(socket_cache, kmflags);
+	if (node == NULL) {
+		*errorp = ENOMEM;
+		return (NULL);
+	}
+
+	sonode_init(node, sp, family, type, protocol, &sosdp_sonodeops);
+	node->so_pollev |= SO_POLLEV_ALWAYS;
+
+	dprint(2, ("sosdp_create: %p domain %d type %d\n", (void *)node, family,
+	    type));
+
+	/* SOV_DEFAULT is resolved to the system-wide default version. */
+	node->so_version = (short)((version == SOV_DEFAULT) ?
+	    so_default_version : version);
+
+	return (node);
+}
+
+/*
+ * Free an SDP socket: run sosdp_fini() with the caller's credentials and
+ * return the sonode to socket_cache. The sonode must be using
+ * sosdp_sonodeops.
+ */
+static void
+socksdp_destroy(struct sonode *so)
+{
+ ASSERT(so->so_ops == &sosdp_sonodeops);
+
+ sosdp_fini(so, CRED());
+
+ kmem_cache_free(socket_cache, so);
+}
+
+/*
+ * Module load entry point; this module has no private state to set up,
+ * so it only registers itself via mod_install().
+ */
+int
+_init(void)
+{
+ return (mod_install(&modlinkage));
+}
+
+/*
+ * Module unload entry point; forwards to mod_remove().
+ */
+int
+_fini(void)
+{
+ return (mod_remove(&modlinkage));
+}
+
+/*
+ * Module information entry point; forwards to mod_info().
+ */
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
diff --git a/usr/src/uts/common/inet/sockmods/socksctp.c b/usr/src/uts/common/inet/sockmods/socksctp.c
new file mode 100644
index 0000000000..e013940703
--- /dev/null
+++ b/usr/src/uts/common/inet/sockmods/socksctp.c
@@ -0,0 +1,2105 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <sys/t_lock.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/buf.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/debug.h>
+#include <sys/errno.h>
+#include <sys/stropts.h>
+#include <sys/cmn_err.h>
+#include <sys/sysmacros.h>
+#include <sys/filio.h>
+
+#include <sys/project.h>
+#include <sys/tihdr.h>
+#include <sys/strsubr.h>
+#include <sys/esunddi.h>
+#include <sys/ddi.h>
+
+#include <sys/sockio.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/strsun.h>
+
+#include <netinet/sctp.h>
+#include <inet/sctp_itf.h>
+#include <fs/sockfs/sockcommon.h>
+#include "socksctp.h"
+
+/*
+ * SCTP sockfs sonode operations, 1-1 socket
+ */
+static int sosctp_init(struct sonode *, struct sonode *, struct cred *, int);
+static int sosctp_accept(struct sonode *, int, struct cred *, struct sonode **);
+static int sosctp_bind(struct sonode *, struct sockaddr *, socklen_t, int,
+ struct cred *);
+static int sosctp_listen(struct sonode *, int, struct cred *);
+static int sosctp_connect(struct sonode *, const struct sockaddr *, socklen_t,
+ int, int, struct cred *);
+static int sosctp_recvmsg(struct sonode *, struct nmsghdr *, struct uio *,
+ struct cred *);
+static int sosctp_sendmsg(struct sonode *, struct nmsghdr *, struct uio *,
+ struct cred *);
+static int sosctp_getpeername(struct sonode *, struct sockaddr *, socklen_t *,
+ boolean_t, struct cred *);
+static int sosctp_getsockname(struct sonode *, struct sockaddr *, socklen_t *,
+ struct cred *);
+static int sosctp_shutdown(struct sonode *, int, struct cred *);
+static int sosctp_getsockopt(struct sonode *, int, int, void *, socklen_t *,
+ int, struct cred *);
+static int sosctp_setsockopt(struct sonode *, int, int, const void *,
+ socklen_t, struct cred *);
+static int sosctp_ioctl(struct sonode *, int, intptr_t, int, struct cred *,
+ int32_t *);
+static int sosctp_close(struct sonode *, int, struct cred *);
+void sosctp_fini(struct sonode *, struct cred *);
+
+/*
+ * SCTP sockfs sonode operations, 1-N socket
+ */
+static int sosctp_seq_connect(struct sonode *, const struct sockaddr *,
+ socklen_t, int, int, struct cred *);
+static int sosctp_seq_sendmsg(struct sonode *, struct nmsghdr *, struct uio *,
+ struct cred *);
+
+/*
+ * Socket association upcalls, 1-N socket connection
+ */
+sock_upper_handle_t sctp_assoc_newconn(sock_upper_handle_t,
+ sock_lower_handle_t, sock_downcalls_t *, struct cred *, pid_t,
+ sock_upcalls_t **);
+static void sctp_assoc_connected(sock_upper_handle_t, sock_connid_t,
+ struct cred *, pid_t);
+static int sctp_assoc_disconnected(sock_upper_handle_t, sock_connid_t, int);
+static void sctp_assoc_disconnecting(sock_upper_handle_t, sock_opctl_action_t,
+ uintptr_t arg);
+static ssize_t sctp_assoc_recv(sock_upper_handle_t, mblk_t *, size_t, int,
+ int *, boolean_t *);
+static void sctp_assoc_xmitted(sock_upper_handle_t, boolean_t);
+static void sctp_assoc_properties(sock_upper_handle_t,
+ struct sock_proto_props *);
+
+sonodeops_t sosctp_sonodeops = {
+ sosctp_init, /* sop_init */
+ sosctp_accept, /* sop_accept */
+ sosctp_bind, /* sop_bind */
+ sosctp_listen, /* sop_listen */
+ sosctp_connect, /* sop_connect */
+ sosctp_recvmsg, /* sop_recvmsg */
+ sosctp_sendmsg, /* sop_sendmsg */
+ so_sendmblk_notsupp, /* sop_sendmblk */
+ sosctp_getpeername, /* sop_getpeername */
+ sosctp_getsockname, /* sop_getsockname */
+ sosctp_shutdown, /* sop_shutdown */
+ sosctp_getsockopt, /* sop_getsockopt */
+ sosctp_setsockopt, /* sop_setsockopt */
+ sosctp_ioctl, /* sop_ioctl */
+ so_poll, /* sop_poll */
+ sosctp_close, /* sop_close */
+};
+
+sonodeops_t sosctp_seq_sonodeops = {
+ sosctp_init, /* sop_init */
+ so_accept_notsupp, /* sop_accept */
+ sosctp_bind, /* sop_bind */
+ sosctp_listen, /* sop_listen */
+ sosctp_seq_connect, /* sop_connect */
+ sosctp_recvmsg, /* sop_recvmsg */
+ sosctp_seq_sendmsg, /* sop_sendmsg */
+ so_sendmblk_notsupp, /* sop_sendmblk */
+ so_getpeername_notsupp, /* sop_getpeername */
+ sosctp_getsockname, /* sop_getsockname */
+ so_shutdown_notsupp, /* sop_shutdown */
+ sosctp_getsockopt, /* sop_getsockopt */
+ sosctp_setsockopt, /* sop_setsockopt */
+ sosctp_ioctl, /* sop_ioctl */
+ so_poll, /* sop_poll */
+ sosctp_close, /* sop_close */
+};
+
+/*
+ * Upcall table passed to sctp_create() by sosctp_init() for SOCK_STREAM
+ * sockets. All entries are the generic so_*() socket upcalls.
+ */
+sock_upcalls_t sosctp_sock_upcalls = {
+ so_newconn,
+ so_connected,
+ so_disconnected,
+ so_opctl,
+ so_queue_msg,
+ so_set_prop,
+ so_txq_full,
+ NULL, /* su_signal_oob */
+};
+
+/*
+ * Upcall table passed to sctp_create() by sosctp_init() for
+ * SOCK_SEQPACKET sockets; entries are the per-association
+ * sctp_assoc_*() handlers declared above.
+ *
+ * NOTE(review): this initializer has nine entries whereas
+ * sosctp_sock_upcalls has eight -- confirm the entry count and the
+ * trailing member-name comments against the sock_upcalls_t definition.
+ */
+sock_upcalls_t sosctp_assoc_upcalls = {
+ sctp_assoc_newconn,
+ sctp_assoc_connected,
+ sctp_assoc_disconnected,
+ sctp_assoc_disconnecting,
+ sctp_assoc_recv,
+ sctp_assoc_properties,
+ sctp_assoc_xmitted,
+ NULL, /* su_recv_space */
+ NULL, /* su_signal_oob */
+};
+
+/*
+ * sop_init entry point used by both SCTP sonodeops tables.
+ *
+ * With a parent sonode (passive open) the new socket only inherits state
+ * from the parent and no protocol handle is created here. Without a
+ * parent, an SCTP handle is created via sctp_create() using the upcall
+ * table that matches the socket type, and the socket buffer limits
+ * reported by SCTP are copied into the sonode.
+ *
+ * Returns 0 on success, or ENOMEM if sctp_create() returns NULL.
+ */
+/* ARGSUSED */
+static int
+sosctp_init(struct sonode *so, struct sonode *pso, struct cred *cr, int flags)
+{
+ struct sctp_sonode *ss;
+ struct sctp_sonode *pss;
+ sctp_sockbuf_limits_t sbl;
+ sock_upcalls_t *upcalls;
+
+ ss = SOTOSSO(so);
+
+ if (pso != NULL) {
+ /*
+ * Passive open, just inherit settings from parent. We should
+ * not end up here for SOCK_SEQPACKET type sockets, since no
+ * new sonode is created in that case.
+ */
+ ASSERT(so->so_type == SOCK_STREAM);
+ pss = SOTOSSO(pso);
+
+ /* The parent's lock is held while its state is copied. */
+ mutex_enter(&pso->so_lock);
+ so->so_state |= (SS_ISBOUND | SS_ISCONNECTED |
+ (pso->so_state & SS_ASYNC));
+ sosctp_so_inherit(pss, ss);
+ so->so_proto_props = pso->so_proto_props;
+ so->so_mode = pso->so_mode;
+ mutex_exit(&pso->so_lock);
+
+ return (0);
+ }
+
+ /* Active open: pick the upcall table by socket type. */
+ if (so->so_type == SOCK_STREAM) {
+ upcalls = &sosctp_sock_upcalls;
+ so->so_mode = SM_CONNREQUIRED;
+ } else {
+ ASSERT(so->so_type == SOCK_SEQPACKET);
+ upcalls = &sosctp_assoc_upcalls;
+ }
+ so->so_proto_handle = (sock_lower_handle_t)sctp_create(so, NULL,
+ so->so_family, SCTP_CAN_BLOCK, upcalls, &sbl, cr);
+ if (so->so_proto_handle == NULL)
+ return (ENOMEM);
+
+ /* sbl is read only after sctp_create() succeeded. */
+ so->so_rcvbuf = sbl.sbl_rxbuf;
+ so->so_rcvlowat = sbl.sbl_rxlowat;
+ so->so_sndbuf = sbl.sbl_txbuf;
+ so->so_sndlowat = sbl.sbl_txlowat;
+
+ return (0);
+}
+
+/*
+ * Accept an incoming connection on a listening socket.
+ *
+ * Fails with EINVAL unless SS_ACCEPTCONN is set. Otherwise the result of
+ * so_acceptq_dequeue() is returned, called with the non-blocking bits
+ * (FNONBLOCK/FNDELAY) of fflag; the new sonode is stored in *nsop.
+ */
+/*ARGSUSED*/
+static int
+sosctp_accept(struct sonode *so, int fflag, struct cred *cr,
+    struct sonode **nsop)
+{
+	if (!(so->so_state & SS_ACCEPTCONN))
+		return (EINVAL);
+
+	return (so_acceptq_dequeue(so, (fflag & (FNONBLOCK|FNDELAY)), nsop));
+}
+
+/*
+ * Bind local endpoint.
+ *
+ * Unless _SOBIND_LOCK_HELD is set in flags, so_lock and the SOLOCKED
+ * single-thread lock are taken here and released before returning. With
+ * _SOBIND_LOCK_HELD the caller already holds both and still holds them
+ * on return. In either case so_lock is dropped around the sctp_bind()
+ * call.
+ *
+ * Returns 0 (and sets SS_ISBOUND), EINVAL if SS_CANTSENDMORE is set, or
+ * the error from sctp_bind().
+ */
+/*ARGSUSED*/
+static int
+sosctp_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
+ int flags, struct cred *cr)
+{
+ int error;
+
+ if (!(flags & _SOBIND_LOCK_HELD)) {
+ mutex_enter(&so->so_lock);
+ so_lock_single(so); /* Set SOLOCKED */
+ } else {
+ ASSERT(MUTEX_HELD(&so->so_lock));
+ }
+
+ /*
+ * X/Open requires this check
+ */
+ if (so->so_state & SS_CANTSENDMORE) {
+ error = EINVAL;
+ goto done;
+ }
+
+
+ /*
+ * Protocol module does address family checks.
+ */
+ mutex_exit(&so->so_lock);
+
+ error = sctp_bind((struct sctp_s *)so->so_proto_handle, name, namelen);
+
+ mutex_enter(&so->so_lock);
+ if (error == 0) {
+ so->so_state |= SS_ISBOUND;
+ } else {
+ eprintsoline(so, error);
+ }
+done:
+ if (!(flags & _SOBIND_LOCK_HELD)) {
+ so_unlock_single(so, SOLOCKED);
+ mutex_exit(&so->so_lock);
+ } else {
+ /* If the caller held the lock don't release it here */
+ ASSERT(MUTEX_HELD(&so->so_lock));
+ ASSERT(so->so_flag & SOLOCKED);
+ }
+
+ return (error);
+}
+
+/*
+ * Turn socket into a listen socket.
+ *
+ * Runs under so_lock and SOLOCKED; so_lock is dropped around the
+ * sctp_listen() call. A negative backlog is treated as 0. If the socket
+ * is already listening, only so_backlog is updated.
+ *
+ * Returns 0, EINVAL if the socket is connecting, connected,
+ * disconnecting or shut down in either direction, or the error from
+ * sctp_listen().
+ */
+/* ARGSUSED */
+static int
+sosctp_listen(struct sonode *so, int backlog, struct cred *cr)
+{
+ int error = 0;
+
+ mutex_enter(&so->so_lock);
+ so_lock_single(so);
+
+ /*
+ * If this socket is trying to do connect, or if it has
+ * been connected, disallow.
+ */
+ if (so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED |
+ SS_ISDISCONNECTING | SS_CANTRCVMORE | SS_CANTSENDMORE)) {
+ error = EINVAL;
+ eprintsoline(so, error);
+ goto done;
+ }
+
+ if (backlog < 0) {
+ backlog = 0;
+ }
+
+ /*
+ * If listen() is only called to change backlog, we don't
+ * need to notify protocol module.
+ */
+ if (so->so_state & SS_ACCEPTCONN) {
+ so->so_backlog = backlog;
+ goto done;
+ }
+
+ mutex_exit(&so->so_lock);
+ error = sctp_listen((struct sctp_s *)so->so_proto_handle);
+ mutex_enter(&so->so_lock);
+ if (error == 0) {
+ /* A successful listen also marks the socket as bound. */
+ so->so_state |= (SS_ACCEPTCONN|SS_ISBOUND);
+ so->so_backlog = backlog;
+ } else {
+ eprintsoline(so, error);
+ }
+done:
+ so_unlock_single(so, SOLOCKED);
+ mutex_exit(&so->so_lock);
+
+ return (error);
+}
+
+/*
+ * Active open for a SOCK_STREAM socket.
+ *
+ * Takes so_lock and the SOLOCKED single-thread lock, validates the socket
+ * state, marks the socket as connecting and calls sctp_connect() with
+ * so_lock dropped. If that succeeds, the result of sowaitconnected() is
+ * returned. Every path reaches "done" holding so_lock exactly once, and
+ * "done" releases both SOLOCKED and so_lock.
+ *
+ * Returns 0 or an errno: EISCONN, EALREADY, EOPNOTSUPP (listening
+ * socket), a pending so_error, EINVAL (socket closing/closed, or no
+ * address supplied), or the error from sctp_connect()/sowaitconnected().
+ */
+/*ARGSUSED*/
+static int
+sosctp_connect(struct sonode *so, const struct sockaddr *name,
+    socklen_t namelen, int fflag, int flags, struct cred *cr)
+{
+	int error = 0;
+
+	ASSERT(so->so_type == SOCK_STREAM);
+
+	mutex_enter(&so->so_lock);
+	so_lock_single(so);
+
+	/*
+	 * Can't connect() after listen(), or if the socket is already
+	 * connected.
+	 */
+	if (so->so_state & (SS_ACCEPTCONN|SS_ISCONNECTED|SS_ISCONNECTING)) {
+		if (so->so_state & SS_ISCONNECTED) {
+			error = EISCONN;
+		} else if (so->so_state & SS_ISCONNECTING) {
+			error = EALREADY;
+		} else {
+			error = EOPNOTSUPP;
+		}
+		eprintsoline(so, error);
+		goto done;
+	}
+
+	/*
+	 * Check for failure of an earlier call
+	 */
+	if (so->so_error != 0) {
+		error = sogeterr(so, B_TRUE);
+		eprintsoline(so, error);
+		goto done;
+	}
+
+	/*
+	 * Connection is closing, or closed, don't allow reconnect.
+	 * TCP allows this to proceed, but the socket remains unwriteable.
+	 * BSD returns EINVAL.
+	 */
+	if (so->so_state & (SS_ISDISCONNECTING|SS_CANTRCVMORE|
+	    SS_CANTSENDMORE)) {
+		error = EINVAL;
+		eprintsoline(so, error);
+		goto done;
+	}
+
+	/*
+	 * A destination address is mandatory. so_lock must remain held
+	 * here: the "done" path clears SOLOCKED and drops so_lock itself,
+	 * so releasing the mutex before the jump would release it twice.
+	 */
+	if (name == NULL || namelen == 0) {
+		error = EINVAL;
+		eprintsoline(so, error);
+		goto done;
+	}
+
+	soisconnecting(so);
+	mutex_exit(&so->so_lock);
+
+	error = sctp_connect((struct sctp_s *)so->so_proto_handle,
+	    name, namelen);
+
+	mutex_enter(&so->so_lock);
+	if (error == 0) {
+		/*
+		 * Allow other threads to access the socket
+		 */
+		error = sowaitconnected(so, fflag, 0);
+	}
+done:
+	so_unlock_single(so, SOLOCKED);
+	mutex_exit(&so->so_lock);
+	return (error);
+}
+
+/*
+ * Active open for 1-N sockets, create a new association and
+ * call connect on that.
+ * If the parent hasn't been bound yet (this is the first association),
+ * make it so.
+ *
+ * Runs under so_lock and SOLOCKED. Returns EINVAL when no address is
+ * supplied, otherwise the result of sosctp_assoc_createconn(), with
+ * EHOSTUNREACH mapped to ENETUNREACH when _SOCONNECT_XPG4_2 is set in
+ * flags. The reference on the association handed back by
+ * sosctp_assoc_createconn(), if any, is released before returning.
+ */
+static int
+sosctp_seq_connect(struct sonode *so, const struct sockaddr *name,
+    socklen_t namelen, int fflag, int flags, struct cred *cr)
+{
+	/*
+	 * Start from NULL so that the release below never acts on an
+	 * indeterminate pointer should sosctp_assoc_createconn() fail
+	 * without storing through its out-parameter.
+	 */
+	struct sctp_soassoc *ssa = NULL;
+	struct sctp_sonode *ss;
+	int error;
+
+	ASSERT(so->so_type == SOCK_SEQPACKET);
+
+	mutex_enter(&so->so_lock);
+	so_lock_single(so);
+
+	if (name == NULL || namelen == 0) {
+		error = EINVAL;
+		eprintsoline(so, error);
+		goto done;
+	}
+
+	ss = SOTOSSO(so);
+
+	error = sosctp_assoc_createconn(ss, name, namelen, NULL, 0, fflag,
+	    cr, &ssa);
+	if (error != 0) {
+		if ((error == EHOSTUNREACH) && (flags & _SOCONNECT_XPG4_2)) {
+			error = ENETUNREACH;
+		}
+	}
+	if (ssa != NULL) {
+		SSA_REFRELE(ss, ssa);
+	}
+
+done:
+	so_unlock_single(so, SOLOCKED);
+	mutex_exit(&so->so_lock);
+	return (error);
+}
+
+/*
+ * Receive data.
+ *
+ * Used for both SOCK_STREAM and SOCK_SEQPACKET sockets. One message is
+ * dequeued with so_dequeue_msg(); the source address and ancillary data
+ * carried in its T_unitdata_ind header are copied out into msg, and
+ * msg_flags is set to report MSG_CTRUNC, MSG_NOTIFICATION and MSG_EOR.
+ * On entry msg_flags, msg_controllen and msg_namelen are read as inputs
+ * and then reset to 0.
+ *
+ * Returns ENOTCONN, EOPNOTSUPP (MSG_OOB, or a 1-N socket with no name
+ * buffer), the error from so_lock_read_intr(), or the result of
+ * so_dequeue_msg().
+ */
+/* ARGSUSED */
+static int
+sosctp_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
+ struct cred *cr)
+{
+ struct sctp_sonode *ss = SOTOSSO(so);
+ struct sctp_soassoc *ssa = NULL;
+ int flags, error = 0;
+ struct T_unitdata_ind *tind;
+ /*
+ * NOTE(review): readcnt is never assigned after this initialisation,
+ * so every "readcnt > 0" test below is false and neither sctp_recvd()
+ * call in this function is reachable -- confirm whether the receive
+ * accounting is expected to happen inside so_dequeue_msg() instead.
+ */
+ int len, count, readcnt = 0, rxqueued;
+ socklen_t controllen, namelen;
+ void *opt;
+ mblk_t *mp;
+ rval_t rval;
+
+ /* Remember the caller's buffer sizes, then reset the output fields. */
+ controllen = msg->msg_controllen;
+ namelen = msg->msg_namelen;
+ flags = msg->msg_flags;
+ msg->msg_flags = 0;
+ msg->msg_controllen = 0;
+ msg->msg_namelen = 0;
+
+ if (so->so_type == SOCK_STREAM) {
+ if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|
+ SS_CANTRCVMORE))) {
+ return (ENOTCONN);
+ }
+ } else {
+ /* NOTE: Will come here from vop_read() as well */
+ /* For 1-N socket, recv() cannot be used. */
+ if (namelen == 0)
+ return (EOPNOTSUPP);
+ /*
+ * If there are no associations, and no new connections are
+ * coming in, there's not going to be new messages coming
+ * in either.
+ */
+ if (so->so_rcv_q_head == NULL && ss->ss_assoccnt == 0 &&
+ !(so->so_state & SS_ACCEPTCONN)) {
+ return (ENOTCONN);
+ }
+ }
+
+ /*
+ * out-of-band data not supported.
+ */
+ if (flags & MSG_OOB) {
+ return (EOPNOTSUPP);
+ }
+
+ /*
+ * flag possibilities:
+ *
+ * MSG_PEEK Don't consume data
+ * MSG_WAITALL Wait for full quantity of data (ignored if MSG_PEEK)
+ * MSG_DONTWAIT Non-blocking (same as FNDELAY | FNONBLOCK)
+ *
+ * MSG_WAITALL can return less than the full buffer if either
+ *
+ * 1. we would block and we are non-blocking
+ * 2. a full message cannot be delivered
+ *
+ * Given that we always get a full message from proto below,
+ * MSG_WAITALL is not meaningful.
+ */
+
+ mutex_enter(&so->so_lock);
+
+ /*
+ * Allow just one reader at a time.
+ */
+ error = so_lock_read_intr(so,
+ uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
+ if (error) {
+ mutex_exit(&so->so_lock);
+ return (error);
+ }
+ mutex_exit(&so->so_lock);
+ /* NOTE(review): no goto in this function targets "again". */
+again:
+ error = so_dequeue_msg(so, &mp, uiop, &rval, flags | MSG_DUPCTRL);
+ if (mp != NULL) {
+ if (so->so_type == SOCK_SEQPACKET) {
+ /* The association pointer is read from the start of the dblk. */
+ ssa = *(struct sctp_soassoc **)DB_BASE(mp);
+ }
+
+ tind = (struct T_unitdata_ind *)mp->b_rptr;
+
+ len = tind->SRC_length;
+
+ if (namelen > 0 && len > 0) {
+
+ opt = sogetoff(mp, tind->SRC_offset, len, 1);
+
+ ASSERT(opt != NULL);
+
+ /* presumably released by the caller -- TODO confirm */
+ msg->msg_name = kmem_alloc(len, KM_SLEEP);
+ msg->msg_namelen = len;
+
+ bcopy(opt, msg->msg_name, len);
+ }
+
+ len = tind->OPT_length;
+ if (controllen == 0) {
+ /* Options present but the caller gave no control buffer. */
+ if (len > 0) {
+ msg->msg_flags |= MSG_CTRUNC;
+ }
+ } else if (len > 0) {
+ opt = sogetoff(mp, tind->OPT_offset, len,
+ __TPI_ALIGN_SIZE);
+
+ ASSERT(opt != NULL);
+ sosctp_pack_cmsg(opt, msg, len);
+ }
+
+ if (mp->b_flag & SCTP_NOTIFICATION) {
+ msg->msg_flags |= MSG_NOTIFICATION;
+ }
+
+ if (!(mp->b_flag & SCTP_PARTIAL_DATA))
+ msg->msg_flags |= MSG_EOR;
+ freemsg(mp);
+ }
+ /* NOTE(review): no goto in this function targets "done". */
+done:
+ /*
+ * Determine if we need to update SCTP about the buffer
+ * space. For performance reason, we cannot update SCTP
+ * every time a message is read. The socket buffer low
+ * watermark is used as the threshold.
+ */
+ if (ssa == NULL) {
+ mutex_enter(&so->so_lock);
+ rxqueued = so->so_rcv_queued;
+
+ so->so_rcv_queued = rxqueued - readcnt;
+ count = so->so_rcvbuf - so->so_rcv_queued;
+
+ ASSERT(so->so_rcv_q_head != NULL ||
+ so->so_rcv_head != NULL ||
+ so->so_rcv_queued == 0);
+
+ so_unlock_read(so);
+ mutex_exit(&so->so_lock);
+
+ if (readcnt > 0 && (((count > 0) &&
+ (rxqueued >= so->so_rcvlowat)) ||
+ (so->so_rcv_queued == 0))) {
+ /*
+ * If amount of queued data is higher than watermark,
+ * update SCTP's idea of available buffer space.
+ */
+ sctp_recvd((struct sctp_s *)so->so_proto_handle, count);
+ }
+ } else {
+ mutex_enter(&so->so_lock);
+ rxqueued = ssa->ssa_rcv_queued;
+
+ ssa->ssa_rcv_queued = rxqueued - readcnt;
+ count = so->so_rcvbuf - ssa->ssa_rcv_queued;
+
+ so_unlock_read(so);
+
+ if (readcnt > 0 &&
+ (((count > 0) && (rxqueued >= so->so_rcvlowat)) ||
+ (ssa->ssa_rcv_queued == 0))) {
+ /*
+ * If amount of queued data is higher than watermark,
+ * update SCTP's idea of available buffer space.
+ */
+ mutex_exit(&so->so_lock);
+
+ sctp_recvd((struct sctp_s *)ssa->ssa_conn, count);
+
+ mutex_enter(&so->so_lock);
+ }
+ /*
+ * MOREDATA flag is set if all data could not be copied
+ */
+ if (!(flags & MSG_PEEK) && !(rval.r_val1 & MOREDATA)) {
+ SSA_REFRELE(ss, ssa);
+ }
+ mutex_exit(&so->so_lock);
+ }
+
+ return (error);
+}
+
+/*
+ * Copy "count" bytes of user data from uiop into a chain of newly
+ * allocated mblks linked behind hdr_mp through b_cont.
+ *
+ * Each mblk holds at most blk_size bytes of payload, preceded by wroff
+ * bytes of headroom. When an allocation fails the call returns EAGAIN
+ * if the uio is non-blocking (FNDELAY/FNONBLOCK) or MSG_DONTWAIT is set
+ * in flags; otherwise it waits in strwaitbuf() and retries.
+ *
+ * Returns 0 on success or an errno. On failure the mblks that were
+ * already linked behind hdr_mp stay linked; they are not freed here.
+ */
+int
+sosctp_uiomove(mblk_t *hdr_mp, ssize_t count, ssize_t blk_size, int wroff,
+    struct uio *uiop, int flags, cred_t *cr)
+{
+	mblk_t *tail = hdr_mp;
+	mblk_t *blk;
+	ssize_t chunk;
+	int err;
+
+	/* One iteration per mblk until every byte has been copied in. */
+	for (; count > 0; count -= chunk) {
+		chunk = MIN(count, blk_size);
+
+		/*
+		 * A message may be split up and sent in several packets,
+		 * so every mblk reserves wroff bytes in front of the data
+		 * for whatever SCTP wants to put there.
+		 */
+		for (;;) {
+			blk = allocb_cred(chunk + wroff, cr);
+			if (blk != NULL)
+				break;
+			if ((uiop->uio_fmode & (FNDELAY|FNONBLOCK)) ||
+			    (flags & MSG_DONTWAIT)) {
+				return (EAGAIN);
+			}
+			err = strwaitbuf(chunk + wroff, BPRI_MED);
+			if (err != 0)
+				return (err);
+		}
+
+		blk->b_datap->db_cpid = curproc->p_pid;
+		ASSERT(wroff <= blk->b_datap->db_lim - blk->b_wptr);
+		blk->b_rptr += wroff;
+
+		err = uiomove(blk->b_rptr, chunk, UIO_WRITE, uiop);
+		if (err != 0) {
+			/* This mblk was never linked in, so free it here. */
+			freeb(blk);
+			return (err);
+		}
+		blk->b_wptr = blk->b_rptr + chunk;
+
+		/* Append to the chain and advance the tail. */
+		tail->b_cont = blk;
+		tail = blk;
+	}
+
+	return (0);
+}
+
+/*
+ * Send message.
+ *
+ * SOCK_STREAM only. An SCTP_SNDRCV control message, if present, is
+ * validated; MSG_EOF in its sinfo_flags starts a shutdown instead of
+ * sending data. Otherwise the call waits for send-queue space (unless
+ * non-blocking), connects implicitly via sosctp_connect() when the
+ * socket is neither connecting nor connected, builds a header mblk with
+ * sctp_alloc_hdr(), copies the payload in with sosctp_uiomove() and
+ * passes the chain to sctp_sendmsg().
+ *
+ * Returns 0 or an errno, including EOPNOTSUPP (MSG_OOB), EINVAL (bad
+ * control data), EPIPE, EINTR, EAGAIN, EMSGSIZE and ENOTCONN.
+ */
+static int
+sosctp_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
+ struct cred *cr)
+{
+ struct sctp_sonode *ss = SOTOSSO(so);
+ mblk_t *mctl;
+ struct cmsghdr *cmsg;
+ struct sctp_sndrcvinfo *sinfo;
+ int optlen, flags, fflag;
+ ssize_t count, msglen;
+ int error;
+
+ ASSERT(so->so_type == SOCK_STREAM);
+
+ flags = msg->msg_flags;
+ if (flags & MSG_OOB) {
+ /*
+ * No out-of-band data support.
+ */
+ return (EOPNOTSUPP);
+ }
+
+ if (msg->msg_controllen != 0) {
+ optlen = msg->msg_controllen;
+ cmsg = sosctp_find_cmsg(msg->msg_control, optlen, SCTP_SNDRCV);
+ if (cmsg != NULL) {
+ /* The cmsg must be large enough to hold a full sndrcvinfo. */
+ if (cmsg->cmsg_len <
+ (sizeof (*sinfo) + sizeof (*cmsg))) {
+ eprintsoline(so, EINVAL);
+ return (EINVAL);
+ }
+ sinfo = (struct sctp_sndrcvinfo *)(cmsg + 1);
+
+ /* Both flags should not be set together. */
+ if ((sinfo->sinfo_flags & MSG_EOF) &&
+ (sinfo->sinfo_flags & MSG_ABORT)) {
+ eprintsoline(so, EINVAL);
+ return (EINVAL);
+ }
+
+ /* Initiate a graceful shutdown. */
+ if (sinfo->sinfo_flags & MSG_EOF) {
+ /* Can't include data in MSG_EOF message. */
+ if (uiop->uio_resid != 0) {
+ eprintsoline(so, EINVAL);
+ return (EINVAL);
+ }
+
+ /*
+ * This is the same sequence as done in
+ * shutdown(SHUT_WR).
+ */
+ mutex_enter(&so->so_lock);
+ so_lock_single(so);
+ socantsendmore(so);
+ cv_broadcast(&so->so_snd_cv);
+ so->so_state |= SS_ISDISCONNECTING;
+ mutex_exit(&so->so_lock);
+
+ pollwakeup(&so->so_poll_list, POLLOUT);
+ sctp_recvd((struct sctp_s *)so->so_proto_handle,
+ so->so_rcvbuf);
+ error = sctp_disconnect(
+ (struct sctp_s *)so->so_proto_handle);
+
+ /* SOLOCKED was held across the disconnect; drop it now. */
+ mutex_enter(&so->so_lock);
+ so_unlock_single(so, SOLOCKED);
+ mutex_exit(&so->so_lock);
+ return (error);
+ }
+ }
+ } else {
+ optlen = 0;
+ }
+
+ /* Wait (or fail) until the socket can accept another message. */
+ mutex_enter(&so->so_lock);
+ for (;;) {
+ if (so->so_state & SS_CANTSENDMORE) {
+ mutex_exit(&so->so_lock);
+ return (EPIPE);
+ }
+
+ if (so->so_error != 0) {
+ error = sogeterr(so, B_TRUE);
+ mutex_exit(&so->so_lock);
+ return (error);
+ }
+
+ if (!so->so_snd_qfull)
+ break;
+
+ if (so->so_state & SS_CLOSING) {
+ mutex_exit(&so->so_lock);
+ return (EINTR);
+ }
+ /*
+ * Xmit window full: fail with EAGAIN on a non-blocking
+ * socket, otherwise wait below.
+ */
+ if ((uiop->uio_fmode & (FNDELAY|FNONBLOCK)) ||
+ (flags & MSG_DONTWAIT)) {
+ mutex_exit(&so->so_lock);
+ return (EAGAIN);
+ } else {
+ /*
+ * Wait for space to become available and try again.
+ */
+ error = cv_wait_sig(&so->so_snd_cv, &so->so_lock);
+ if (!error) { /* signal */
+ mutex_exit(&so->so_lock);
+ return (EINTR);
+ }
+ }
+ }
+ msglen = count = uiop->uio_resid;
+
+ /* Don't allow sending a message larger than the send buffer size. */
+ /* XXX Transport module need to enforce this */
+ if (msglen > so->so_sndbuf) {
+ mutex_exit(&so->so_lock);
+ return (EMSGSIZE);
+ }
+
+ /*
+ * Allow piggybacking data on handshake messages (SS_ISCONNECTING).
+ */
+ if (!(so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED))) {
+ /*
+ * We need to check here for listener so that the
+ * same error will be returned as with a TCP socket.
+ * In this case, sosctp_connect() returns EOPNOTSUPP
+ * while a TCP socket returns ENOTCONN instead. Catch it
+ * here to have the same behavior as a TCP socket.
+ *
+ * We also need to make sure that the peer address is
+ * provided before we attempt to do the connect.
+ */
+ if ((so->so_state & SS_ACCEPTCONN) ||
+ msg->msg_name == NULL) {
+ mutex_exit(&so->so_lock);
+ error = ENOTCONN;
+ goto error_nofree;
+ }
+ mutex_exit(&so->so_lock);
+ fflag = uiop->uio_fmode;
+ if (flags & MSG_DONTWAIT) {
+ fflag |= FNDELAY;
+ }
+ /* The multiplication yields _SOCONNECT_XPG4_2 or 0. */
+ error = sosctp_connect(so, msg->msg_name, msg->msg_namelen,
+ fflag, (so->so_version == SOV_XPG4_2) * _SOCONNECT_XPG4_2,
+ cr);
+ if (error) {
+ /*
+ * Check for non-fatal errors, socket connected
+ * while the lock had been lifted.
+ */
+ if (error != EISCONN && error != EALREADY) {
+ goto error_nofree;
+ }
+ error = 0;
+ }
+ } else {
+ mutex_exit(&so->so_lock);
+ }
+
+ /* so_lock is not held from here on. */
+ mctl = sctp_alloc_hdr(msg->msg_name, msg->msg_namelen,
+ msg->msg_control, optlen, SCTP_CAN_BLOCK);
+ if (mctl == NULL) {
+ error = EINTR;
+ goto error_nofree;
+ }
+
+ /* Copy in the message. */
+ if ((error = sosctp_uiomove(mctl, count, ss->ss_wrsize, ss->ss_wroff,
+ uiop, flags, cr)) != 0) {
+ goto error_ret;
+ }
+ error = sctp_sendmsg((struct sctp_s *)so->so_proto_handle, mctl, 0);
+ if (error == 0)
+ return (0);
+
+ /* On failure the header and any attached data mblks are freed here. */
+error_ret:
+ freemsg(mctl);
+error_nofree:
+ mutex_enter(&so->so_lock);
+ if ((error == EPIPE) && (so->so_state & SS_CANTSENDMORE)) {
+ /*
+ * We received shutdown between the time lock was
+ * lifted and call to sctp_sendmsg().
+ */
+ mutex_exit(&so->so_lock);
+ return (EPIPE);
+ }
+ mutex_exit(&so->so_lock);
+ return (error);
+}
+
+/*
+ * Send message on 1-N socket. Connects automatically if there is
+ * no association.
+ *
+ * The association is chosen by sinfo_assoc_id of an SCTP_SNDRCV ancillary
+ * message. When no id is supplied a new association is created towards
+ * msg_name. MSG_EOF in sinfo_flags starts a graceful shutdown of the
+ * association instead of sending data.
+ *
+ * Returns 0 on success or an errno value: EINVAL for malformed requests,
+ * EPIPE when the association cannot send any more, EAGAIN when the send
+ * queue is full and the call is non-blocking, EINTR when interrupted or
+ * when the header allocation fails, EMSGSIZE when the message exceeds the
+ * send buffer, or whatever the lower layers report.
+ */
+static int
+sosctp_seq_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
+ struct cred *cr)
+{
+ struct sctp_sonode *ss;
+ struct sctp_soassoc *ssa;
+ struct cmsghdr *cmsg;
+ struct sctp_sndrcvinfo *sinfo;
+ int aid = 0;
+ mblk_t *mctl;
+ int namelen, optlen, flags;
+ ssize_t count, msglen;
+ int error;
+ uint16_t s_flags = 0;
+
+ ASSERT(so->so_type == SOCK_SEQPACKET);
+
+ /*
+ * There shouldn't be problems with alignment, as the memory for
+ * msg_control was alloced with kmem_alloc.
+ */
+ cmsg = sosctp_find_cmsg(msg->msg_control, msg->msg_controllen,
+ SCTP_SNDRCV);
+ if (cmsg != NULL) {
+ /* The cmsg must be large enough to carry a whole sndrcvinfo. */
+ if (cmsg->cmsg_len < (sizeof (*sinfo) + sizeof (*cmsg))) {
+ eprintsoline(so, EINVAL);
+ return (EINVAL);
+ }
+ sinfo = (struct sctp_sndrcvinfo *)(cmsg + 1);
+ s_flags = sinfo->sinfo_flags;
+ aid = sinfo->sinfo_assoc_id;
+ }
+
+ ss = SOTOSSO(so);
+ namelen = msg->msg_namelen;
+
+ if (msg->msg_controllen > 0) {
+ optlen = msg->msg_controllen;
+ } else {
+ optlen = 0;
+ }
+
+ mutex_enter(&so->so_lock);
+
+ /*
+ * If there is no association id, connect to address specified
+ * in msg_name. Otherwise look up the association using the id.
+ */
+ if (aid == 0) {
+ /*
+ * Connect and shutdown cannot be done together, so check for
+ * MSG_EOF.
+ */
+ if (msg->msg_name == NULL || namelen == 0 ||
+ (s_flags & MSG_EOF)) {
+ error = EINVAL;
+ eprintsoline(so, error);
+ goto done;
+ }
+ flags = uiop->uio_fmode;
+ if (msg->msg_flags & MSG_DONTWAIT) {
+ flags |= FNDELAY;
+ }
+ so_lock_single(so);
+ error = sosctp_assoc_createconn(ss, msg->msg_name, namelen,
+ msg->msg_control, optlen, flags, cr, &ssa);
+ if (error) {
+ if ((so->so_version == SOV_XPG4_2) &&
+ (error == EHOSTUNREACH)) {
+ error = ENETUNREACH;
+ }
+ if (ssa == NULL) {
+ /*
+ * Fatal error during connect(). Bail out.
+ * If ssa exists, it means that the handshake
+ * is in progress.
+ */
+ eprintsoline(so, error);
+ so_unlock_single(so, SOLOCKED);
+ goto done;
+ }
+ /*
+ * All the errors are non-fatal ones, don't return
+ * e.g. EINPROGRESS from sendmsg().
+ */
+ error = 0;
+ }
+ so_unlock_single(so, SOLOCKED);
+ } else {
+ if ((error = sosctp_assoc(ss, aid, &ssa)) != 0) {
+ eprintsoline(so, error);
+ goto done;
+ }
+ }
+
+ /*
+ * Now we have an association.
+ */
+ flags = msg->msg_flags;
+
+ /*
+ * MSG_EOF initiates graceful shutdown.
+ */
+ if (s_flags & MSG_EOF) {
+ if (uiop->uio_resid) {
+ /*
+ * Can't include data in MSG_EOF message.
+ */
+ error = EINVAL;
+ } else {
+ /*
+ * Update the association state while so_lock is
+ * still held; every other access to ssa_state in
+ * this function is made under that lock.
+ */
+ ssa->ssa_state |= SS_ISDISCONNECTING;
+ mutex_exit(&so->so_lock);
+ sctp_recvd((struct sctp_s *)ssa->ssa_conn,
+ so->so_rcvbuf);
+ error = sctp_disconnect((struct sctp_s *)ssa->ssa_conn);
+ mutex_enter(&so->so_lock);
+ }
+ goto refrele;
+ }
+
+ /*
+ * Wait, with so_lock held, until the association can take more
+ * data or until something makes sending impossible.
+ */
+ for (;;) {
+ if (ssa->ssa_state & SS_CANTSENDMORE) {
+ SSA_REFRELE(ss, ssa);
+ mutex_exit(&so->so_lock);
+ return (EPIPE);
+ }
+ if (ssa->ssa_error != 0) {
+ /* Report the pending error once and clear it. */
+ error = ssa->ssa_error;
+ ssa->ssa_error = 0;
+ goto refrele;
+ }
+
+ if (!ssa->ssa_snd_qfull)
+ break;
+
+ if (so->so_state & SS_CLOSING) {
+ error = EINTR;
+ goto refrele;
+ }
+ if ((uiop->uio_fmode & (FNDELAY|FNONBLOCK)) ||
+ (flags & MSG_DONTWAIT)) {
+ error = EAGAIN;
+ goto refrele;
+ } else {
+ /*
+ * Wait for space to become available and try again.
+ */
+ error = cv_wait_sig(&so->so_snd_cv, &so->so_lock);
+ if (!error) { /* signal */
+ error = EINTR;
+ goto refrele;
+ }
+ }
+ }
+
+ msglen = count = uiop->uio_resid;
+
+ /* Don't allow sending a message larger than the send buffer size. */
+ if (msglen > so->so_sndbuf) {
+ error = EMSGSIZE;
+ goto refrele;
+ }
+
+ /*
+ * Update TX buffer usage here so that we can lift the socket lock.
+ */
+ mutex_exit(&so->so_lock);
+
+ mctl = sctp_alloc_hdr(msg->msg_name, namelen, msg->msg_control,
+ optlen, SCTP_CAN_BLOCK);
+ if (mctl == NULL) {
+ error = EINTR;
+ goto lock_rele;
+ }
+
+ /* Copy in the message. */
+ if ((error = sosctp_uiomove(mctl, count, ssa->ssa_wrsize,
+ ssa->ssa_wroff, uiop, flags, cr)) != 0) {
+ goto lock_rele;
+ }
+ error = sctp_sendmsg((struct sctp_s *)ssa->ssa_conn, mctl, 0);
+lock_rele:
+ mutex_enter(&so->so_lock);
+ if (error != 0) {
+ /* The message was not sent, so it is released here. */
+ freemsg(mctl);
+ if ((error == EPIPE) && (ssa->ssa_state & SS_CANTSENDMORE)) {
+ /*
+ * We received shutdown between the time lock was
+ * lifted and call to sctp_sendmsg().
+ */
+ SSA_REFRELE(ss, ssa);
+ mutex_exit(&so->so_lock);
+ return (EPIPE);
+ }
+ }
+
+refrele:
+ SSA_REFRELE(ss, ssa);
+done:
+ mutex_exit(&so->so_lock);
+ return (error);
+}
+
+/*
+ * Report the address of the remote node. The request is handed straight
+ * to SCTP using the socket's protocol handle; the accept and cr arguments
+ * are not used.
+ */
+/* ARGSUSED */
+static int
+sosctp_getpeername(struct sonode *so, struct sockaddr *addr, socklen_t *addrlen,
+ boolean_t accept, struct cred *cr)
+{
+ struct sctp_s *sctp = (struct sctp_s *)so->so_proto_handle;
+
+ return (sctp_getpeername(sctp, addr, addrlen));
+}
+
+/*
+ * Report the local address. The request is handed straight to SCTP using
+ * the socket's protocol handle; the cr argument is not used.
+ */
+/* ARGSUSED */
+static int
+sosctp_getsockname(struct sonode *so, struct sockaddr *addr, socklen_t *addrlen,
+ struct cred *cr)
+{
+ struct sctp_s *sctp = (struct sctp_s *)so->so_proto_handle;
+
+ return (sctp_getsockname(sctp, addr, addrlen));
+}
+
+/*
+ * Called from shutdown().
+ *
+ * Marks the socket as unable to receive and/or send according to `how'
+ * (SHUT_RD, SHUT_WR or SHUT_RDWR), wakes up waiters and pollers for the
+ * directions that were newly shut down and, when the send side was newly
+ * shut down, asks SCTP to disconnect.
+ *
+ * Returns EINVAL for an unknown `how', otherwise the result of
+ * sctp_disconnect() with EWOULDBLOCK mapped to 0 (0 if no disconnect was
+ * needed).
+ */
+/* ARGSUSED */
+static int
+sosctp_shutdown(struct sonode *so, int how, struct cred *cr)
+{
+ uint_t state_change;
+ int wakesig = 0;
+ int error = 0;
+
+ mutex_enter(&so->so_lock);
+ /*
+ * Record the current state and then perform any state changes.
+ * Then use the difference between the old and new states to
+ * determine which needs to be done.
+ */
+ state_change = so->so_state;
+
+ switch (how) {
+ case SHUT_RD:
+ socantrcvmore(so);
+ break;
+ case SHUT_WR:
+ socantsendmore(so);
+ break;
+ case SHUT_RDWR:
+ socantsendmore(so);
+ socantrcvmore(so);
+ break;
+ default:
+ mutex_exit(&so->so_lock);
+ return (EINVAL);
+ }
+
+ /* Keep only the state bits that were turned on by the calls above. */
+ state_change = so->so_state & ~state_change;
+
+ if (state_change & SS_CANTRCVMORE) {
+ if (so->so_rcv_q_head == NULL) {
+ cv_signal(&so->so_rcv_cv);
+ }
+ wakesig = POLLIN|POLLRDNORM;
+
+ socket_sendsig(so, SOCKETSIG_READ);
+ }
+ if (state_change & SS_CANTSENDMORE) {
+ cv_broadcast(&so->so_snd_cv);
+ wakesig |= POLLOUT;
+
+ so->so_state |= SS_ISDISCONNECTING;
+ }
+ mutex_exit(&so->so_lock);
+
+ /* Pollers are notified after so_lock has been dropped. */
+ pollwakeup(&so->so_poll_list, wakesig);
+
+ if (state_change & SS_CANTSENDMORE) {
+ sctp_recvd((struct sctp_s *)so->so_proto_handle, so->so_rcvbuf);
+ error = sctp_disconnect((struct sctp_s *)so->so_proto_handle);
+ }
+
+ /*
+ * HACK: sctp_disconnect() may return EWOULDBLOCK. But this error is
+ * not documented in standard socket API. Catch it here.
+ */
+ if (error == EWOULDBLOCK)
+ error = 0;
+ return (error);
+}
+
+/*
+ * Get socket options.
+ *
+ * Options at the IPPROTO_SCTP level are rejected with EINVAL here; every
+ * other level is passed on to SCTP for the socket's protocol handle.
+ */
+/*ARGSUSED5*/
+static int
+sosctp_getsockopt(struct sonode *so, int level, int option_name,
+ void *optval, socklen_t *optlenp, int flags, struct cred *cr)
+{
+ struct sctp_s *sctp;
+
+ /* SCTP level options should go through ioctl(). */
+ if (level == IPPROTO_SCTP)
+ return (EINVAL);
+
+ sctp = (struct sctp_s *)so->so_proto_handle;
+ return (sctp_get_opt(sctp, level, option_name, optval, optlenp));
+}
+
+/*
+ * Set socket options
+ *
+ * For a SOCK_STREAM socket the option is applied to the socket's own SCTP
+ * handle. For a SOCK_SEQPACKET (1-N) socket the option is applied to one
+ * association when the option value names an association id, to the
+ * socket only for INITMSG and most SOL_SOCKET options, and otherwise to
+ * the socket and every existing association. SO_LINGER is rejected with
+ * EOPNOTSUPP on a 1-N socket.
+ *
+ * Returns 0 or an errno value. When the option is applied to several
+ * endpoints the first error seen is returned.
+ */
+/* ARGSUSED */
+static int
+sosctp_setsockopt(struct sonode *so, int level, int option_name,
+ const void *optval, t_uscalar_t optlen, struct cred *cr)
+{
+ struct sctp_sonode *ss = SOTOSSO(so);
+ struct sctp_soassoc *ssa = NULL;
+ /*
+ * Start from "no association selected" so that the debug print
+ * below never reads an unset value on the SOCK_STREAM path.
+ */
+ sctp_assoc_t id = 0;
+ int error, rc;
+ void *conn = NULL;
+
+ mutex_enter(&so->so_lock);
+
+ /*
+ * For some SCTP level options, one can select the association this
+ * applies to.
+ */
+ if (so->so_type == SOCK_STREAM) {
+ conn = so->so_proto_handle;
+ } else {
+ /*
+ * SOCK_SEQPACKET only
+ */
+ if (level == IPPROTO_SCTP) {
+ switch (option_name) {
+ case SCTP_RTOINFO:
+ case SCTP_ASSOCINFO:
+ case SCTP_SET_PEER_PRIMARY_ADDR:
+ case SCTP_PRIMARY_ADDR:
+ case SCTP_PEER_ADDR_PARAMS:
+ /*
+ * Association ID is the first element
+ * params struct
+ */
+ if (optlen < sizeof (sctp_assoc_t)) {
+ error = EINVAL;
+ eprintsoline(so, error);
+ goto done;
+ }
+ id = *(sctp_assoc_t *)optval;
+ break;
+ case SCTP_DEFAULT_SEND_PARAM:
+ if (optlen != sizeof (struct sctp_sndrcvinfo)) {
+ error = EINVAL;
+ eprintsoline(so, error);
+ goto done;
+ }
+ id = ((struct sctp_sndrcvinfo *)
+ optval)->sinfo_assoc_id;
+ break;
+ case SCTP_INITMSG:
+ /*
+ * Only applies to future associations
+ */
+ conn = so->so_proto_handle;
+ break;
+ default:
+ break;
+ }
+ } else if (level == SOL_SOCKET) {
+ if (option_name == SO_LINGER) {
+ error = EOPNOTSUPP;
+ eprintsoline(so, error);
+ goto done;
+ }
+ /*
+ * These 2 options are applied to all associations.
+ * The other socket level options are only applied
+ * to the socket (not associations).
+ */
+ if ((option_name != SO_RCVBUF) &&
+ (option_name != SO_SNDBUF)) {
+ conn = so->so_proto_handle;
+ }
+ } else {
+ conn = NULL;
+ }
+
+ /*
+ * If association ID was specified, do op on that assoc.
+ * Otherwise set the default setting of a socket.
+ */
+ if (id != 0) {
+ if ((error = sosctp_assoc(ss, id, &ssa)) != 0) {
+ eprintsoline(so, error);
+ goto done;
+ }
+ conn = ssa->ssa_conn;
+ }
+ }
+ dprint(2, ("sosctp_setsockopt %p (%d) - conn %p %d %d id:%d\n",
+ (void *)ss, so->so_type, (void *)conn, level, option_name, id));
+
+ ASSERT(ssa == NULL || (ssa != NULL && conn != NULL));
+ if (conn != NULL) {
+ /* so_lock is dropped around the call into SCTP. */
+ mutex_exit(&so->so_lock);
+ error = sctp_set_opt((struct sctp_s *)conn, level, option_name,
+ optval, optlen);
+ mutex_enter(&so->so_lock);
+ if (ssa != NULL)
+ SSA_REFRELE(ss, ssa);
+ } else {
+ /*
+ * 1-N socket, and we have to apply the operation to ALL
+ * associations. Like with anything of this sort, the
+ * problem is what to do if the operation fails.
+ * Just try to apply the setting to everyone, but store
+ * error number if someone returns such. And since we are
+ * looping through all possible aids, some of them can be
+ * invalid. We just ignore this kind (sosctp_assoc()) of
+ * errors.
+ */
+ sctp_assoc_t aid;
+
+ mutex_exit(&so->so_lock);
+ error = sctp_set_opt((struct sctp_s *)so->so_proto_handle,
+ level, option_name, optval, optlen);
+ mutex_enter(&so->so_lock);
+ for (aid = 1; aid < ss->ss_maxassoc; aid++) {
+ if (sosctp_assoc(ss, aid, &ssa) != 0)
+ continue;
+ mutex_exit(&so->so_lock);
+ rc = sctp_set_opt((struct sctp_s *)ssa->ssa_conn, level,
+ option_name, optval, optlen);
+ mutex_enter(&so->so_lock);
+ SSA_REFRELE(ss, ssa);
+ /* Remember only the first failure. */
+ if (error == 0) {
+ error = rc;
+ }
+ }
+ }
+done:
+ mutex_exit(&so->so_lock);
+ return (error);
+}
+
+/*
+ * ioctl entry point for SCTP sockets.
+ *
+ * Handles the generic socket ioctls FIONBIO, FIOASYNC, SIOCSPGRP/FIOSETOWN,
+ * SIOCGPGRP/FIOGETOWN, FIONREAD and SIOCATMARK, and the SCTP specific
+ * SIOCSCTPGOPT (get option), SIOCSCTPSOPT (set option) and SIOCSCTPPEELOFF
+ * (move one association of a 1-N socket to a new SOCK_STREAM socket and
+ * return its file descriptor through arg).
+ *
+ * Returns 0 or an errno value; unknown commands yield EINVAL.
+ */
+/*ARGSUSED*/
+static int
+sosctp_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
+ struct cred *cr, int32_t *rvalp)
+{
+ struct sctp_sonode *ss;
+ int32_t value;
+ int error;
+ int intval;
+ pid_t pid;
+ struct sctp_soassoc *ssa;
+ void *conn;
+ void *buf;
+ STRUCT_DECL(sctpopt, opt);
+ uint32_t optlen;
+ int buflen;
+
+ ss = SOTOSSO(so);
+
+ /* handle socket specific ioctls */
+ switch (cmd) {
+ case FIONBIO:
+ if (so_copyin((void *)arg, &value, sizeof (int32_t),
+ (mode & (int)FKIOCTL))) {
+ return (EFAULT);
+ }
+ mutex_enter(&so->so_lock);
+ if (value) {
+ so->so_state |= SS_NDELAY;
+ } else {
+ so->so_state &= ~SS_NDELAY;
+ }
+ mutex_exit(&so->so_lock);
+ return (0);
+
+ case FIOASYNC:
+ if (so_copyin((void *)arg, &value, sizeof (int32_t),
+ (mode & (int)FKIOCTL))) {
+ return (EFAULT);
+ }
+ mutex_enter(&so->so_lock);
+
+ if (value) {
+ /* Turn on SIGIO */
+ so->so_state |= SS_ASYNC;
+ } else {
+ /* Turn off SIGIO */
+ so->so_state &= ~SS_ASYNC;
+ }
+ mutex_exit(&so->so_lock);
+ return (0);
+
+ case SIOCSPGRP:
+ case FIOSETOWN:
+ if (so_copyin((void *)arg, &pid, sizeof (pid_t),
+ (mode & (int)FKIOCTL))) {
+ return (EFAULT);
+ }
+ mutex_enter(&so->so_lock);
+
+ /* Nothing to do when the owner is unchanged. */
+ error = (pid != so->so_pgrp) ? socket_chgpgrp(so, pid) : 0;
+ mutex_exit(&so->so_lock);
+ return (error);
+
+ case SIOCGPGRP:
+ case FIOGETOWN:
+ if (so_copyout(&so->so_pgrp, (void *)arg,
+ sizeof (pid_t), (mode & (int)FKIOCTL)))
+ return (EFAULT);
+ return (0);
+
+ case FIONREAD:
+ /* XXX: Cannot be used unless standard buffer is used */
+ /*
+ * Return number of bytes of data in all data messages
+ * in queue in "arg".
+ * For stream socket, amount of available data.
+ * For sock_dgram, # of available bytes + addresses.
+ */
+ intval = (so->so_state & SS_ACCEPTCONN) ? 0 :
+ MIN(so->so_rcv_queued, INT_MAX);
+ if (so_copyout(&intval, (void *)arg, sizeof (intval),
+ (mode & (int)FKIOCTL)))
+ return (EFAULT);
+ return (0);
+ case SIOCATMARK:
+ /*
+ * No support for urgent data.
+ */
+ intval = 0;
+
+ if (so_copyout(&intval, (void *)arg, sizeof (int),
+ (mode & (int)FKIOCTL)))
+ return (EFAULT);
+ return (0);
+ case SIOCSCTPGOPT:
+ STRUCT_INIT(opt, mode);
+
+ if (so_copyin((void *)arg, STRUCT_BUF(opt), STRUCT_SIZE(opt),
+ (mode & (int)FKIOCTL))) {
+ return (EFAULT);
+ }
+ if ((optlen = STRUCT_FGET(opt, sopt_len)) > SO_MAXARGSIZE)
+ return (EINVAL);
+
+ /*
+ * Find the correct sctp_t based on whether it is 1-N socket
+ * or not.
+ */
+ intval = STRUCT_FGET(opt, sopt_aid);
+ mutex_enter(&so->so_lock);
+ if ((so->so_type == SOCK_SEQPACKET) && intval) {
+ if ((error = sosctp_assoc(ss, intval, &ssa)) != 0) {
+ mutex_exit(&so->so_lock);
+ return (error);
+ }
+ conn = ssa->ssa_conn;
+ ASSERT(conn != NULL);
+ } else {
+ conn = so->so_proto_handle;
+ ssa = NULL;
+ }
+ mutex_exit(&so->so_lock);
+
+ /* Copyin the option buffer and then call sctp_get_opt(). */
+ buflen = optlen;
+ /* Let's allocate a buffer enough to hold an int */
+ if (buflen < sizeof (uint32_t))
+ buflen = sizeof (uint32_t);
+ buf = kmem_alloc(buflen, KM_SLEEP);
+ if (so_copyin(STRUCT_FGETP(opt, sopt_val), buf, optlen,
+ (mode & (int)FKIOCTL))) {
+ if (ssa != NULL) {
+ mutex_enter(&so->so_lock);
+ SSA_REFRELE(ss, ssa);
+ mutex_exit(&so->so_lock);
+ }
+ kmem_free(buf, buflen);
+ return (EFAULT);
+ }
+ /* The option level has to be IPPROTO_SCTP */
+ error = sctp_get_opt((struct sctp_s *)conn, IPPROTO_SCTP,
+ STRUCT_FGET(opt, sopt_name), buf, &optlen);
+ if (ssa != NULL) {
+ mutex_enter(&so->so_lock);
+ SSA_REFRELE(ss, ssa);
+ mutex_exit(&so->so_lock);
+ }
+ /* Never copy out more than the buffer holds. */
+ optlen = MIN(buflen, optlen);
+ /* No error, copyout the result with the correct buf len. */
+ if (error == 0) {
+ STRUCT_FSET(opt, sopt_len, optlen);
+ if (so_copyout(STRUCT_BUF(opt), (void *)arg,
+ STRUCT_SIZE(opt), (mode & (int)FKIOCTL))) {
+ error = EFAULT;
+ } else if (so_copyout(buf, STRUCT_FGETP(opt, sopt_val),
+ optlen, (mode & (int)FKIOCTL))) {
+ error = EFAULT;
+ }
+ }
+ kmem_free(buf, buflen);
+ return (error);
+
+ case SIOCSCTPSOPT:
+ STRUCT_INIT(opt, mode);
+
+ if (so_copyin((void *)arg, STRUCT_BUF(opt), STRUCT_SIZE(opt),
+ (mode & (int)FKIOCTL))) {
+ return (EFAULT);
+ }
+ if ((optlen = STRUCT_FGET(opt, sopt_len)) > SO_MAXARGSIZE)
+ return (EINVAL);
+
+ /*
+ * Find the correct sctp_t based on whether it is 1-N socket
+ * or not.
+ */
+ intval = STRUCT_FGET(opt, sopt_aid);
+ mutex_enter(&so->so_lock);
+ if (intval != 0) {
+ if ((error = sosctp_assoc(ss, intval, &ssa)) != 0) {
+ mutex_exit(&so->so_lock);
+ return (error);
+ }
+ conn = ssa->ssa_conn;
+ ASSERT(conn != NULL);
+ } else {
+ conn = so->so_proto_handle;
+ ssa = NULL;
+ }
+ mutex_exit(&so->so_lock);
+
+ /* Copyin the option buffer and then call sctp_set_opt(). */
+ buf = kmem_alloc(optlen, KM_SLEEP);
+ if (so_copyin(STRUCT_FGETP(opt, sopt_val), buf, optlen,
+ (mode & (int)FKIOCTL))) {
+ if (ssa != NULL) {
+ mutex_enter(&so->so_lock);
+ SSA_REFRELE(ss, ssa);
+ mutex_exit(&so->so_lock);
+ }
+ /*
+ * The buffer was allocated with optlen bytes, so it
+ * must be freed with that size (not the association
+ * id held in intval).
+ */
+ kmem_free(buf, optlen);
+ return (EFAULT);
+ }
+ /* The option level has to be IPPROTO_SCTP */
+ error = sctp_set_opt((struct sctp_s *)conn, IPPROTO_SCTP,
+ STRUCT_FGET(opt, sopt_name), buf, optlen);
+ if (ssa) {
+ mutex_enter(&so->so_lock);
+ SSA_REFRELE(ss, ssa);
+ mutex_exit(&so->so_lock);
+ }
+ kmem_free(buf, optlen);
+ return (error);
+
+ case SIOCSCTPPEELOFF: {
+ struct sonode *nso;
+ struct sctp_uc_swap us;
+ int nfd;
+ struct file *nfp;
+ struct vnode *nvp = NULL;
+ struct sockparams *sp;
+
+ dprint(2, ("sctppeeloff %p\n", (void *)ss));
+
+ if (so->so_type != SOCK_SEQPACKET) {
+ return (EOPNOTSUPP);
+ }
+ if (so_copyin((void *)arg, &intval, sizeof (intval),
+ (mode & (int)FKIOCTL))) {
+ return (EFAULT);
+ }
+ if (intval == 0) {
+ return (EINVAL);
+ }
+
+ /*
+ * Find sockparams. This is different from parent's entry,
+ * as the socket type is different.
+ */
+ error = solookup(so->so_family, SOCK_STREAM, so->so_protocol,
+ &sp);
+ if (error != 0) {
+ /*
+ * Without sockparams the new socket cannot be
+ * created; sp must not be used after a failed lookup.
+ */
+ eprintsoline(so, error);
+ return (error);
+ }
+
+ /*
+ * Allocate the user fd.
+ */
+ if ((nfd = ufalloc(0)) == -1) {
+ eprintsoline(so, EMFILE);
+ return (EMFILE);
+ }
+
+ /*
+ * Copy the fd out.
+ */
+ if (so_copyout(&nfd, (void *)arg, sizeof (nfd),
+ (mode & (int)FKIOCTL))) {
+ error = EFAULT;
+ goto err;
+ }
+ mutex_enter(&so->so_lock);
+
+ /*
+ * Don't use sosctp_assoc() in order to peel off disconnected
+ * associations.
+ */
+ ssa = ((uint32_t)intval >= ss->ss_maxassoc) ? NULL :
+ ss->ss_assocs[intval].ssi_assoc;
+ if (ssa == NULL) {
+ mutex_exit(&so->so_lock);
+ error = EINVAL;
+ goto err;
+ }
+ SSA_REFHOLD(ssa);
+
+ nso = socksctp_create(sp, so->so_family, SOCK_STREAM,
+ so->so_protocol, so->so_version, SOCKET_NOSLEEP,
+ &error, cr);
+ if (nso == NULL) {
+ SSA_REFRELE(ss, ssa);
+ mutex_exit(&so->so_lock);
+ goto err;
+ }
+ /* cannot fail, only inheriting properties */
+ (void) sosctp_init(nso, so, CRED(), 0);
+ nvp = SOTOV(nso);
+ so_lock_single(so);
+ mutex_exit(&so->so_lock);
+ us.sus_handle = SOTOSSO(nso);
+ us.sus_upcalls = &sosctp_sock_upcalls;
+
+ /*
+ * Upcalls to new socket are blocked for the duration of
+ * downcall.
+ */
+ mutex_enter(&nso->so_lock);
+
+ error = sctp_set_opt((struct sctp_s *)ssa->ssa_conn,
+ IPPROTO_SCTP, SCTP_UC_SWAP, &us, sizeof (us));
+ if (error) {
+ goto peelerr;
+ }
+ error = falloc(nvp, FWRITE|FREAD, &nfp, NULL);
+ if (error) {
+ goto peelerr;
+ }
+
+ /*
+ * fill in the entries that falloc reserved
+ */
+ nfp->f_vnode = nvp;
+ mutex_exit(&nfp->f_tlock);
+ setf(nfd, nfp);
+
+ mutex_enter(&so->so_lock);
+
+ sosctp_assoc_move(ss, SOTOSSO(nso), ssa);
+
+ mutex_exit(&nso->so_lock);
+
+ /* The connection now belongs to the new socket. */
+ ssa->ssa_conn = NULL;
+ sosctp_assoc_free(ss, ssa);
+
+ so_unlock_single(so, SOLOCKED);
+ mutex_exit(&so->so_lock);
+
+ return (0);
+
+err:
+ /* Give back the reserved file descriptor. */
+ setf(nfd, NULL);
+ eprintsoline(so, error);
+ return (error);
+
+peelerr:
+ /* Undo the partially set up new socket and drop the hold. */
+ mutex_exit(&nso->so_lock);
+ mutex_enter(&so->so_lock);
+ ASSERT(nso->so_count == 1);
+ nso->so_count = 0;
+ so_unlock_single(so, SOLOCKED);
+ SSA_REFRELE(ss, ssa);
+ mutex_exit(&so->so_lock);
+
+ setf(nfd, NULL);
+ ASSERT(nvp->v_count == 1);
+ socket_destroy(nso);
+ eprintsoline(so, error);
+ return (error);
+ }
+ default:
+ return (EINVAL);
+ }
+}
+
+/*
+ * Starts shutdown of the socket's own SCTP endpoint and of every
+ * association present in the association array. The flag and cr
+ * arguments are not used. Always returns 0; errors from
+ * sctp_disconnect() are deliberately ignored.
+ */
+/*ARGSUSED*/
+static int
+sosctp_close(struct sonode *so, int flag, struct cred *cr)
+{
+ struct sctp_sonode *ss;
+ struct sctp_sa_id *ssi;
+ struct sctp_soassoc *ssa;
+ int32_t i;
+
+ ss = SOTOSSO(so);
+
+ /*
+ * Initiate connection shutdown. Update SCTP's receive
+ * window.
+ */
+ sctp_recvd((struct sctp_s *)so->so_proto_handle,
+ so->so_rcvbuf - so->so_rcv_queued);
+ (void) sctp_disconnect((struct sctp_s *)so->so_proto_handle);
+
+ /*
+ * New associations can't come in, but old ones might get
+ * closed in upcall. Protect against that by taking a reference
+ * on the association.
+ */
+ mutex_enter(&so->so_lock);
+ ssi = ss->ss_assocs;
+ for (i = 0; i < ss->ss_maxassoc; i++, ssi++) {
+ if ((ssa = ssi->ssi_assoc) != NULL) {
+ SSA_REFHOLD(ssa);
+ sosctp_assoc_isdisconnected(ssa, 0);
+ /* so_lock is not held across the calls into SCTP. */
+ mutex_exit(&so->so_lock);
+
+ sctp_recvd((struct sctp_s *)ssa->ssa_conn,
+ so->so_rcvbuf - ssa->ssa_rcv_queued);
+ (void) sctp_disconnect((struct sctp_s *)ssa->ssa_conn);
+
+ mutex_enter(&so->so_lock);
+ SSA_REFRELE(ss, ssa);
+ }
+ }
+ mutex_exit(&so->so_lock);
+
+ return (0);
+}
+
+/*
+ * Closes incoming connections which were never accepted, frees
+ * resources.
+ *
+ * In order: flushes the receive queue and the accept queue, closes the
+ * SCTP connection of every remaining association and frees the
+ * association, frees the association array, closes the socket's own SCTP
+ * handle and finally calls sonode_fini(). The cr argument is not used.
+ */
+/* ARGSUSED */
+void
+sosctp_fini(struct sonode *so, struct cred *cr)
+{
+ struct sctp_sonode *ss;
+ struct sctp_sa_id *ssi;
+ struct sctp_soassoc *ssa;
+ int32_t i;
+
+ ss = SOTOSSO(so);
+
+ ASSERT(so->so_ops == &sosctp_sonodeops ||
+ so->so_ops == &sosctp_seq_sonodeops);
+
+ /* We are the sole owner of so now */
+ mutex_enter(&so->so_lock);
+
+ so_rcv_flush(so);
+
+ /* Free all pending connections */
+ so_acceptq_flush(so);
+
+ ssi = ss->ss_assocs;
+ for (i = 0; i < ss->ss_maxassoc; i++, ssi++) {
+ if ((ssa = ssi->ssi_assoc) != NULL) {
+ /* Hold the association while so_lock is dropped. */
+ SSA_REFHOLD(ssa);
+ mutex_exit(&so->so_lock);
+
+ sctp_close((struct sctp_s *)ssa->ssa_conn);
+
+ mutex_enter(&so->so_lock);
+ /*
+ * The connection is gone; clear the pointer before
+ * freeing the association.
+ * NOTE(review): presumably sosctp_assoc_free() consumes
+ * the hold taken above - confirm against its definition.
+ */
+ ssa->ssa_conn = NULL;
+ sosctp_assoc_free(ss, ssa);
+ }
+ }
+ if (ss->ss_assocs != NULL) {
+ ASSERT(ss->ss_assoccnt == 0);
+ kmem_free(ss->ss_assocs,
+ ss->ss_maxassoc * sizeof (struct sctp_sa_id));
+ }
+ mutex_exit(&so->so_lock);
+
+ if (so->so_proto_handle)
+ sctp_close((struct sctp_s *)so->so_proto_handle);
+ so->so_proto_handle = NULL;
+
+ sonode_fini(so);
+}
+
+/*
+ * Upcalls from SCTP
+ */
+
+/*
+ * This is the upcall function for 1-N (SOCK_SEQPACKET) socket when a new
+ * association is created. Note that the first argument (handle) is of type
+ * sctp_sonode *, which is the one changed to a listener for new
+ * associations. All the other upcalls for 1-N socket take sctp_soassoc *
+ * as handle. The only exception is the su_properties upcall, which
+ * can take both types as handle.
+ *
+ * On success the new association is entered into the listener's
+ * association array, *ucp is set to the association upcalls and the
+ * association is returned as the upper handle. Returns NULL when the
+ * association array cannot be grown or the association cannot be
+ * allocated (both are attempted with KM_NOSLEEP).
+ */
+/* ARGSUSED */
+sock_upper_handle_t
+sctp_assoc_newconn(sock_upper_handle_t parenthandle,
+ sock_lower_handle_t connind, sock_downcalls_t *dc,
+ struct cred *peer_cred, pid_t peer_cpid, sock_upcalls_t **ucp)
+{
+ struct sonode *lso = (struct sonode *)parenthandle;
+ struct sctp_sonode *lss = SOTOSSO(lso);
+ struct sctp_soassoc *ssa;
+ sctp_assoc_t id;
+
+ ASSERT(lss->ss_type == SOSCTP_SOCKET);
+ ASSERT(lso->so_state & SS_ACCEPTCONN);
+ ASSERT(lso->so_proto_handle != NULL); /* closed conn */
+ ASSERT(lso->so_type == SOCK_SEQPACKET);
+
+ mutex_enter(&lso->so_lock);
+
+ if ((id = sosctp_aid_get(lss)) == -1) {
+ /*
+ * Array not large enough; increase size.
+ */
+ if (sosctp_aid_grow(lss, lss->ss_maxassoc, KM_NOSLEEP) < 0) {
+ mutex_exit(&lso->so_lock);
+ return (NULL);
+ }
+ id = sosctp_aid_get(lss);
+ ASSERT(id != -1);
+ }
+
+ /*
+ * Create soassoc for this connection
+ */
+ ssa = sosctp_assoc_create(lss, KM_NOSLEEP);
+ if (ssa == NULL) {
+ /* The id has not been reserved yet, so nothing to undo. */
+ mutex_exit(&lso->so_lock);
+ return (NULL);
+ }
+ sosctp_aid_reserve(lss, id, 1);
+ lss->ss_assocs[id].ssi_assoc = ssa;
+ ++lss->ss_assoccnt;
+ ssa->ssa_id = id;
+ ssa->ssa_conn = (struct sctp_s *)connind;
+ ssa->ssa_state = (SS_ISBOUND | SS_ISCONNECTED);
+ /* The association starts with the listener's write properties. */
+ ssa->ssa_wroff = lss->ss_wroff;
+ ssa->ssa_wrsize = lss->ss_wrsize;
+
+ mutex_exit(&lso->so_lock);
+
+ *ucp = &sosctp_assoc_upcalls;
+
+ return ((sock_upper_handle_t)ssa);
+}
+
+/*
+ * Upcall from SCTP for a 1-N socket: the association given by handle has
+ * become connected. The association state is updated under the owning
+ * socket's lock. The id, peer_cred and peer_cpid arguments are not used.
+ */
+/* ARGSUSED */
+static void
+sctp_assoc_connected(sock_upper_handle_t handle, sock_connid_t id,
+ struct cred *peer_cred, pid_t peer_cpid)
+{
+ struct sctp_soassoc *assoc = (struct sctp_soassoc *)handle;
+ struct sctp_sonode *ss = assoc->ssa_sonode;
+
+ ASSERT(ss->ss_so.so_type == SOCK_SEQPACKET);
+ ASSERT(assoc->ssa_conn);
+
+ mutex_enter(&ss->ss_so.so_lock);
+ sosctp_assoc_isconnected(assoc);
+ mutex_exit(&ss->ss_so.so_lock);
+}
+
+/*
+ * Upcall from SCTP for a 1-N socket: the association given by handle has
+ * been disconnected, with `error' as the reason. Marks the association
+ * disconnected, drops one reference on it and wakes all threads waiting
+ * on the socket's send condition variable.
+ *
+ * Returns 1 when the reference dropped here was the only one left (the
+ * association's connection pointer is cleared in that case), 0 otherwise.
+ * The id argument is not used.
+ */
+/* ARGSUSED */
+static int
+sctp_assoc_disconnected(sock_upper_handle_t handle, sock_connid_t id, int error)
+{
+ struct sctp_soassoc *ssa = (struct sctp_soassoc *)handle;
+ struct sonode *so = &ssa->ssa_sonode->ss_so;
+ int ret;
+
+ ASSERT(so->so_type == SOCK_SEQPACKET);
+ ASSERT(ssa->ssa_conn != NULL);
+
+ mutex_enter(&so->so_lock);
+ sosctp_assoc_isdisconnected(ssa, error);
+ /* Decide the return value before the reference is released. */
+ if (ssa->ssa_refcnt == 1) {
+ ret = 1;
+ ssa->ssa_conn = NULL;
+ } else {
+ ret = 0;
+ }
+ /* ssa must not be touched after this release. */
+ SSA_REFRELE(SOTOSSO(so), ssa);
+
+ cv_broadcast(&so->so_snd_cv);
+
+ mutex_exit(&so->so_lock);
+
+ return (ret);
+}
+
+/*
+ * Upcall from SCTP for a 1-N socket: the association given by handle is
+ * shutting down its send side. Only SOCK_OPCTL_SHUT_SEND is expected as
+ * action; arg is not used. The association state is updated under the
+ * owning socket's lock.
+ */
+/* ARGSUSED */
+static void
+sctp_assoc_disconnecting(sock_upper_handle_t handle, sock_opctl_action_t action,
+ uintptr_t arg)
+{
+ struct sctp_soassoc *assoc = (struct sctp_soassoc *)handle;
+ struct sctp_sonode *ss = assoc->ssa_sonode;
+
+ ASSERT(ss->ss_so.so_type == SOCK_SEQPACKET);
+ ASSERT(assoc->ssa_conn != NULL);
+ ASSERT(action == SOCK_OPCTL_SHUT_SEND);
+
+ mutex_enter(&ss->ss_so.so_lock);
+ sosctp_assoc_isdisconnecting(assoc);
+ mutex_exit(&ss->ss_so.so_lock);
+}
+
+/*
+ * Upcall from SCTP for a 1-N socket: a message (data or notification)
+ * has arrived on the association given by handle.
+ *
+ * The association id is written into the message (into the notification
+ * body, or into an SCTP_SNDRCV ancillary item if one is present), a
+ * reference on the association is taken and a pointer to the association
+ * is stored at the start of the message's data buffer. The message is
+ * then queued on the socket with so_queue_msg(), whose return value is
+ * returned. *errorp is cleared on entry; forcepush is not used.
+ */
+/* ARGSUSED */
+static ssize_t
+sctp_assoc_recv(sock_upper_handle_t handle, mblk_t *mp, size_t len, int flags,
+ int *errorp, boolean_t *forcepush)
+{
+ struct sctp_soassoc *ssa = (struct sctp_soassoc *)handle;
+ struct sctp_sonode *ss = ssa->ssa_sonode;
+ struct sonode *so = &ss->ss_so;
+ struct T_unitdata_ind *tind;
+ mblk_t *mp2;
+ union sctp_notification *sn;
+ struct sctp_sndrcvinfo *sinfo;
+
+ ASSERT(ssa->ssa_type == SOSCTP_ASSOC);
+ ASSERT(so->so_type == SOCK_SEQPACKET);
+ ASSERT(ssa->ssa_conn != NULL); /* closed conn */
+ ASSERT(mp != NULL);
+
+ ASSERT(errorp != NULL);
+ *errorp = 0;
+
+ /*
+ * Should be getting T_unitdata_ind's only.
+ * Must have address as part of packet.
+ */
+ tind = (struct T_unitdata_ind *)mp->b_rptr;
+ ASSERT((DB_TYPE(mp) == M_PROTO) &&
+ (tind->PRIM_type == T_UNITDATA_IND));
+ ASSERT(tind->SRC_length);
+
+ mutex_enter(&so->so_lock);
+
+ /*
+ * Override b_flag for SCTP sockfs internal use
+ */
+ mp->b_flag = (short)flags;
+
+ /*
+ * For notify messages, need to fill in association id.
+ * For data messages, sndrcvinfo could be in ancillary data.
+ */
+ if (flags & SCTP_NOTIFICATION) {
+ /* The notification body is in the continuation block. */
+ mp2 = mp->b_cont;
+ sn = (union sctp_notification *)mp2->b_rptr;
+ switch (sn->sn_header.sn_type) {
+ case SCTP_ASSOC_CHANGE:
+ sn->sn_assoc_change.sac_assoc_id = ssa->ssa_id;
+ break;
+ case SCTP_PEER_ADDR_CHANGE:
+ sn->sn_paddr_change.spc_assoc_id = ssa->ssa_id;
+ break;
+ case SCTP_REMOTE_ERROR:
+ sn->sn_remote_error.sre_assoc_id = ssa->ssa_id;
+ break;
+ case SCTP_SEND_FAILED:
+ sn->sn_send_failed.ssf_assoc_id = ssa->ssa_id;
+ break;
+ case SCTP_SHUTDOWN_EVENT:
+ sn->sn_shutdown_event.sse_assoc_id = ssa->ssa_id;
+ break;
+ case SCTP_ADAPTATION_INDICATION:
+ sn->sn_adaptation_event.sai_assoc_id = ssa->ssa_id;
+ break;
+ case SCTP_PARTIAL_DELIVERY_EVENT:
+ sn->sn_pdapi_event.pdapi_assoc_id = ssa->ssa_id;
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+ } else {
+ if (tind->OPT_length > 0) {
+ struct cmsghdr *cmsg;
+ char *cend;
+
+ /*
+ * Walk the ancillary items looking for SCTP_SNDRCV;
+ * stop at the first item that would run past the end
+ * of the option area or that has a zero length.
+ */
+ cmsg = (struct cmsghdr *)
+ ((uchar_t *)mp->b_rptr + tind->OPT_offset);
+ cend = (char *)cmsg + tind->OPT_length;
+ for (;;) {
+ if ((char *)(cmsg + 1) > cend ||
+ ((char *)cmsg + cmsg->cmsg_len) > cend) {
+ break;
+ }
+ if ((cmsg->cmsg_level == IPPROTO_SCTP) &&
+ (cmsg->cmsg_type == SCTP_SNDRCV)) {
+ sinfo = (struct sctp_sndrcvinfo *)
+ (cmsg + 1);
+ sinfo->sinfo_assoc_id = ssa->ssa_id;
+ break;
+ }
+ if (cmsg->cmsg_len > 0) {
+ cmsg = (struct cmsghdr *)
+ ((uchar_t *)cmsg + cmsg->cmsg_len);
+ } else {
+ break;
+ }
+ }
+ }
+ }
+
+ /*
+ * SCTP has reserved space in the header for storing a pointer.
+ * Put the pointer to association there, and queue the data.
+ */
+ SSA_REFHOLD(ssa);
+ ASSERT((mp->b_rptr - DB_BASE(mp)) >= sizeof (ssa));
+ *(struct sctp_soassoc **)DB_BASE(mp) = ssa;
+
+ mutex_exit(&so->so_lock);
+
+ return (so_queue_msg((sock_upper_handle_t)so, mp, len, 0, errorp,
+ NULL));
+}
+
+/*
+ * Upcall from SCTP for a 1-N socket reporting the transmit queue state of
+ * the association given by handle. Records whether the queue is full and
+ * wakes every thread waiting on the socket's send condition variable.
+ */
+static void
+sctp_assoc_xmitted(sock_upper_handle_t handle, boolean_t qfull)
+{
+ struct sctp_soassoc *assoc = (struct sctp_soassoc *)handle;
+ struct sonode *so = &assoc->ssa_sonode->ss_so;
+
+ ASSERT(assoc->ssa_type == SOSCTP_ASSOC);
+ ASSERT(so->so_type == SOCK_SEQPACKET);
+ ASSERT(assoc->ssa_conn != NULL);
+
+ mutex_enter(&so->so_lock);
+ assoc->ssa_snd_qfull = qfull;
+ /* Wake blocked writers so they re-check the queue state. */
+ cv_broadcast(&so->so_snd_cv);
+ mutex_exit(&so->so_lock);
+}
+
+/*
+ * Upcall from SCTP carrying protocol properties. The handle is either an
+ * association or the sctp_sonode itself; the type field read through the
+ * association pointer tells which. A non-zero write offset and a non-zero
+ * maximum block size are stored in whichever object the handle refers to;
+ * zero values leave the current setting untouched.
+ */
+static void
+sctp_assoc_properties(sock_upper_handle_t handle,
+ struct sock_proto_props *soppp)
+{
+ struct sctp_soassoc *assoc = (struct sctp_soassoc *)handle;
+ struct sctp_sonode *ss;
+ int is_assoc = (assoc->ssa_type == SOSCTP_ASSOC);
+
+ /* Locate the socket that owns the lock protecting the fields. */
+ if (is_assoc)
+ ss = assoc->ssa_sonode;
+ else
+ ss = (struct sctp_sonode *)handle;
+
+ mutex_enter(&ss->ss_so.so_lock);
+
+ /* Only change them if they're set. */
+ if (soppp->sopp_wroff != 0) {
+ if (is_assoc)
+ assoc->ssa_wroff = soppp->sopp_wroff;
+ else
+ ss->ss_wroff = soppp->sopp_wroff;
+ }
+ if (soppp->sopp_maxblk != 0) {
+ if (is_assoc)
+ assoc->ssa_wrsize = soppp->sopp_maxblk;
+ else
+ ss->ss_wrsize = soppp->sopp_maxblk;
+ }
+
+ mutex_exit(&ss->ss_so.so_lock);
+}
diff --git a/usr/src/uts/common/fs/sockfs/socksctp.h b/usr/src/uts/common/inet/sockmods/socksctp.h
index dfbd818e40..55d56df7ae 100644
--- a/usr/src/uts/common/fs/sockfs/socksctp.h
+++ b/usr/src/uts/common/inet/sockmods/socksctp.h
@@ -26,8 +26,6 @@
#ifndef _SOCKSCTP_H_
#define _SOCKSCTP_H_
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -47,15 +45,8 @@ struct sctp_sonode {
sctp_assoc_t ss_maxassoc; /* assoc array size for 1-N */
sctp_assoc_t ss_assoccnt; /* current # of assocs */
struct sctp_sa_id *ss_assocs; /* assoc array for 1-N */
- kcondvar_t ss_txdata_cv; /* wait TX window to open */
- int ss_wroff;
- size_t ss_wrsize;
- int ss_txqueued; /* queued tx bytes */
- kcondvar_t ss_rxdata_cv; /* for waiting RX data */
- mblk_t *ss_rxdata; /* queued rx data */
- mblk_t **ss_rxtail; /* ptr to last message */
- int ss_rxqueued; /* queued rx bytes/# of conn */
- struct pollhead ss_poll_list;
+#define ss_wroff ss_so.so_proto_props.sopp_wroff
+#define ss_wrsize ss_so.so_proto_props.sopp_maxblk
};
/*
@@ -69,14 +60,13 @@ struct sctp_soassoc {
struct sctp_s *ssa_conn; /* opaque ptr passed to SCTP */
uint_t ssa_state; /* same as so_state */
int ssa_error; /* same as so_error */
- int ssa_txqueued; /* queued tx bytes */
+ boolean_t ssa_snd_qfull;
int ssa_wroff;
size_t ssa_wrsize;
- int ssa_rxqueued; /* queued rx bytes/# of conn */
+ int ssa_rcv_queued; /* queued rx bytes/# of conn */
};
/* 1-N socket association cache defined in socksctp.c */
-extern kmem_cache_t *sosctp_assoccache;
/*
* Association array element.
@@ -91,18 +81,14 @@ struct sctp_sa_id {
struct sctp_soassoc *ssi_assoc;
};
-extern sctp_upcalls_t sosctp_sock_upcalls;
-extern sctp_upcalls_t sosctp_assoc_upcalls;
-extern struct vnodeops *socksctp_vnodeops;
-extern const fs_operation_def_t socksctp_vnodeops_template[];
-
-extern void sosctp_free(struct sonode *so);
-extern int sosctp_chgpgrp(struct sctp_sonode *ss, pid_t pid);
-extern void sosctp_sendsig(struct sctp_sonode *ss, int event);
+extern sonodeops_t sosctp_sonodeops;
+extern sonodeops_t sosctp_seq_sonodeops;
+extern sock_upcalls_t sosctp_sock_upcalls;
+extern sock_upcalls_t sosctp_assoc_upcalls;
-extern int sosctp_bind(struct sonode *so, struct sockaddr *name,
- socklen_t namelen, int flags);
-extern int sosctp_recvmsg(struct sonode *, struct nmsghdr *, struct uio *);
+extern struct sonode *socksctp_create(struct sockparams *, int, int,
+ int, int, int, int *, cred_t *);
+extern void sosctp_fini(struct sonode *, struct cred *);
extern int sosctp_aid_grow(struct sctp_sonode *ss, sctp_assoc_t maxid,
int kmflags);
extern sctp_assoc_t sosctp_aid_get(struct sctp_sonode *ss);
@@ -119,7 +105,7 @@ extern struct sctp_soassoc *sosctp_assoc_create(struct sctp_sonode *ss,
extern void sosctp_assoc_free(struct sctp_sonode *ss, struct sctp_soassoc *ssa);
extern int sosctp_assoc_createconn(struct sctp_sonode *ss,
const struct sockaddr *name, socklen_t namelen,
- const uchar_t *control, socklen_t controllen, int fflag,
+ const uchar_t *control, socklen_t controllen, int fflag, struct cred *,
struct sctp_soassoc **ssap);
extern void sosctp_assoc_move(struct sctp_sonode *ss, struct sctp_sonode *nss,
struct sctp_soassoc *ssa);
@@ -165,12 +151,6 @@ extern int sosctp_uiomove(mblk_t *hdr_mp, ssize_t count, ssize_t blk_size,
} \
}
-/*
- * Event flags to sosctp_sendsig().
- */
-#define SCTPSIG_WRITE 0x1
-#define SCTPSIG_READ 0x2
-
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/common/fs/sockfs/socksctpsubr.c b/usr/src/uts/common/inet/sockmods/socksctpsubr.c
index e741bd29f7..fab1a4534d 100644
--- a/usr/src/uts/common/fs/sockfs/socksctpsubr.c
+++ b/usr/src/uts/common/inet/sockmods/socksctpsubr.c
@@ -24,8 +24,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
@@ -36,9 +34,6 @@
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
-#include <sys/vfs.h>
-#include <sys/vfs_opreg.h>
-
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/strsun.h>
@@ -46,8 +41,10 @@
#include <netinet/sctp.h>
#include <inet/sctp_itf.h>
+#include <fs/sockfs/sockcommon.h>
#include "socksctp.h"
+extern kmem_cache_t *sosctp_assoccache;
/*
* Find a free association id. See os/fio.c file descriptor allocator
* for description of the algorithm.
@@ -178,8 +175,10 @@ sosctp_assoc_create(struct sctp_sonode *ss, int kmflag)
ssa->ssa_sonode = ss;
ssa->ssa_state = 0;
ssa->ssa_error = 0;
+#if 0
ssa->ssa_txqueued = 0;
- ssa->ssa_rxqueued = 0;
+#endif
+ ssa->ssa_snd_qfull = 0;
}
dprint(2, ("sosctp_assoc_create %p %p\n", (void *)ss, (void *)ssa));
return (ssa);
@@ -305,55 +304,6 @@ sosctp_find_cmsg(const uchar_t *control, socklen_t clen, int type)
}
/*
- * Wait until the socket is connected or there is an error.
- * fmode should contain any nonblocking flags.
- */
-int
-sosctp_waitconnected(struct sonode *so, int fmode)
-{
- int error = 0;
-
- ASSERT(MUTEX_HELD(&so->so_lock));
- ASSERT((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) ||
- so->so_error != 0);
-
- while ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) ==
- SS_ISCONNECTING && so->so_error == 0) {
-
- dprint(3, ("waiting for SS_ISCONNECTED on %p\n", (void *)so));
- if (fmode & (FNDELAY|FNONBLOCK))
- return (EINPROGRESS);
-
- if (!cv_wait_sig_swap(&so->so_state_cv, &so->so_lock)) {
- /*
- * Return EINTR and let the application use
- * nonblocking techniques for detecting when
- * the connection has been established.
- */
- return (EINTR);
- }
- dprint(3, ("awoken on %p\n", (void *)so));
- }
-
- if (so->so_error != 0) {
- error = sogeterr(so);
- ASSERT(error != 0);
- dprint(3, ("sosctp_waitconnected: error %d\n", error));
- return (error);
- }
- if (!(so->so_state & SS_ISCONNECTED)) {
- /*
- * Another thread could have consumed so_error
- * e.g. by calling read. - take from sowaitconnected()
- */
- error = ECONNREFUSED;
- dprint(3, ("sosctp_waitconnected: error %d\n", error));
- return (error);
- }
- return (0);
-}
-
-/*
* Wait until the association is connected or there is an error.
* fmode should contain any nonblocking flags.
*/
@@ -373,6 +323,8 @@ sosctp_assoc_waitconnected(struct sctp_soassoc *ssa, int fmode)
if (fmode & (FNDELAY|FNONBLOCK))
return (EINPROGRESS);
+ if (so->so_state & SS_CLOSING)
+ return (EINTR);
if (!cv_wait_sig_swap(&so->so_state_cv, &so->so_lock)) {
/*
* Return EINTR and let the application use
@@ -408,7 +360,7 @@ sosctp_assoc_waitconnected(struct sctp_soassoc *ssa, int fmode)
int
sosctp_assoc_createconn(struct sctp_sonode *ss, const struct sockaddr *name,
socklen_t namelen, const uchar_t *control, socklen_t controllen, int fflag,
- struct sctp_soassoc **ssap)
+ struct cred *cr, struct sctp_soassoc **ssap)
{
struct sonode *so = &ss->ss_so;
struct sctp_soassoc *ssa;
@@ -427,8 +379,8 @@ sosctp_assoc_createconn(struct sctp_sonode *ss, const struct sockaddr *name,
bzero(&laddr, sizeof (laddr));
laddr.ss_family = so->so_family;
- error = sosctp_bind(so, (struct sockaddr *)&laddr,
- sizeof (laddr), _SOBIND_LOCK_HELD);
+ error = SOP_BIND(so, (struct sockaddr *)&laddr,
+ sizeof (laddr), _SOBIND_LOCK_HELD, cr);
if (error) {
*ssap = NULL;
return (error);
@@ -456,8 +408,8 @@ sosctp_assoc_createconn(struct sctp_sonode *ss, const struct sockaddr *name,
ssa = sosctp_assoc_create(ss, KM_SLEEP);
ssa->ssa_wroff = ss->ss_wroff;
ssa->ssa_wrsize = ss->ss_wrsize;
- ssa->ssa_conn = sctp_create(ssa, so->so_priv, so->so_family,
- SCTP_CAN_BLOCK, &sosctp_assoc_upcalls, &sbl, CRED());
+ ssa->ssa_conn = sctp_create(ssa, (struct sctp_s *)so->so_proto_handle,
+ so->so_family, SCTP_CAN_BLOCK, &sosctp_assoc_upcalls, &sbl, cr);
mutex_enter(&so->so_lock);
ss->ss_assocs[id].ssi_assoc = ssa;
@@ -561,7 +513,7 @@ void
sosctp_assoc_move(struct sctp_sonode *ss, struct sctp_sonode *nss,
struct sctp_soassoc *ssa)
{
- mblk_t *mp, **nmp;
+ mblk_t *mp, **nmp, *last_mp;
struct sctp_soassoc *tmp;
sosctp_so_inherit(ss, nss);
@@ -571,26 +523,41 @@ sosctp_assoc_move(struct sctp_sonode *ss, struct sctp_sonode *nss,
(ssa->ssa_state & (SS_ISCONNECTED|SS_ISCONNECTING|
SS_ISDISCONNECTING|SS_CANTSENDMORE|SS_CANTRCVMORE|SS_ISBOUND));
nss->ss_so.so_error = ssa->ssa_error;
- nss->ss_txqueued = ssa->ssa_txqueued;
+#if 0
+ nss->ss_so.so_txqueued = ssa->ssa_txqueued;
+#endif
+ nss->ss_so.so_snd_qfull = ssa->ssa_snd_qfull;
nss->ss_wroff = ssa->ssa_wroff;
nss->ss_wrsize = ssa->ssa_wrsize;
- nss->ss_rxqueued = ssa->ssa_rxqueued;
- nss->ss_so.so_priv = ssa->ssa_conn;
+ nss->ss_so.so_rcv_queued = ssa->ssa_rcv_queued;
+ nss->ss_so.so_proto_handle = (sock_lower_handle_t)ssa->ssa_conn;
- if (nss->ss_rxqueued > 0) {
- nmp = &ss->ss_rxdata;
+ if (nss->ss_so.so_rcv_queued > 0) {
+ nmp = &ss->ss_so.so_rcv_q_head;
+ last_mp = NULL;
while ((mp = *nmp) != NULL) {
tmp = *(struct sctp_soassoc **)DB_BASE(mp);
if (tmp == ssa) {
*nmp = mp->b_next;
- *nss->ss_rxtail = mp;
- nss->ss_rxtail = &mp->b_next;
+ ASSERT(DB_TYPE(mp) != M_DATA);
+ if (nss->ss_so.so_rcv_q_last_head == NULL) {
+ nss->ss_so.so_rcv_q_head = mp;
+ } else {
+ nss->ss_so.so_rcv_q_last_head->b_next =
+ mp;
+ }
+ nss->ss_so.so_rcv_q_last_head = mp;
+ nss->ss_so.so_rcv_q_last_head->b_prev = last_mp;
+ mp->b_next = NULL;
} else {
nmp = &mp->b_next;
+ last_mp = mp;
}
}
- ss->ss_rxtail = nmp;
- *nss->ss_rxtail = NULL;
+ ss->ss_so.so_rcv_q_last_head = last_mp;
+ /* last_mp is NULL when every queued message moved to nss */
+ if (last_mp != NULL)
+ ss->ss_so.so_rcv_q_last_head->b_prev = last_mp;
}
}
@@ -643,97 +608,3 @@ sosctp_assoc_isdisconnected(struct sctp_soassoc *ssa, int error)
ssa->ssa_error = (ushort_t)error;
cv_broadcast(&so->so_state_cv);
}
-
-/*
- * Change the process/process group to which SIGIO is sent.
- */
-int
-sosctp_chgpgrp(struct sctp_sonode *ss, pid_t pid)
-{
- int error;
-
- ASSERT(MUTEX_HELD(&ss->ss_so.so_lock));
- if (pid != 0) {
- /*
- * Permissions check by sending signal 0.
- * Note that when kill fails it does a
- * set_errno causing the system call to fail.
- */
- error = kill(pid, 0);
- if (error != 0) {
- return (error);
- }
- }
- ss->ss_so.so_pgrp = pid;
- return (0);
-}
-
-/*
- * Generate a SIGIO, for 'writable' events include siginfo structure,
- * for read events just send the signal.
- */
-static void
-sosctp_sigproc(proc_t *proc, int event)
-{
- k_siginfo_t info;
-
- if (event & SCTPSIG_WRITE) {
- info.si_signo = SIGPOLL;
- info.si_code = POLL_OUT;
- info.si_errno = 0;
- info.si_fd = 0; /* not set with TCP either */
- info.si_band = 0;
- sigaddq(proc, NULL, &info, KM_NOSLEEP);
- }
- if (event & SCTPSIG_READ) {
- sigtoproc(proc, NULL, SIGPOLL);
- }
-}
-
-void
-sosctp_sendsig(struct sctp_sonode *ss, int event)
-{
- proc_t *proc;
- struct sonode *so = &ss->ss_so;
-
- ASSERT(MUTEX_HELD(&ss->ss_so.so_lock));
-
- if (so->so_pgrp == 0 || !(so->so_state & SS_ASYNC)) {
- return;
- }
- dprint(3, ("sending sig to %d\n", so->so_pgrp));
-
- if (so->so_pgrp > 0) {
- /*
- * XXX This unfortunately still generates
- * a signal when a fd is closed but
- * the proc is active.
- */
- mutex_enter(&pidlock);
- proc = prfind(so->so_pgrp);
- if (proc == NULL) {
- mutex_exit(&pidlock);
- return;
- }
- mutex_enter(&proc->p_lock);
- mutex_exit(&pidlock);
- sosctp_sigproc(proc, event);
- mutex_exit(&proc->p_lock);
- } else {
- /*
- * Send to process group. Hold pidlock across
- * calls to sosctp_sigproc().
- */
- pid_t pgrp = -so->so_pgrp;
-
- mutex_enter(&pidlock);
- proc = pgfind(pgrp);
- while (proc != NULL) {
- mutex_enter(&proc->p_lock);
- sosctp_sigproc(proc, event);
- proc = proc->p_pglink;
- mutex_exit(&proc->p_lock);
- }
- mutex_exit(&pidlock);
- }
-}
diff --git a/usr/src/uts/common/fs/sockfs/socksdp.c b/usr/src/uts/common/inet/sockmods/socksdp.c
index 7376783fc0..fdbdca5cb3 100644
--- a/usr/src/uts/common/fs/sockfs/socksdp.c
+++ b/usr/src/uts/common/inet/sockmods/socksdp.c
@@ -30,7 +30,6 @@
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/vfs.h>
-#include <sys/vfs_opreg.h>
#include <sys/vnode.h>
#include <sys/debug.h>
#include <sys/errno.h>
@@ -38,6 +37,9 @@
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
+#include <sys/filio.h>
+#include <sys/sockio.h>
+
#include <sys/project.h>
#include <sys/tihdr.h>
#include <sys/strsubr.h>
@@ -50,22 +52,37 @@
#include <inet/sdp_itf.h>
#include "socksdp.h"
+#include <fs/sockfs/sockcommon.h>
/*
* SDP sockfs sonode operations
*/
-static int sosdp_accept(struct sonode *, int, struct sonode **);
-static int sosdp_listen(struct sonode *, int);
+static int sosdp_init(struct sonode *, struct sonode *, struct cred *, int);
+static int sosdp_accept(struct sonode *, int, struct cred *, struct sonode **);
+static int sosdp_bind(struct sonode *, struct sockaddr *, socklen_t, int,
+ struct cred *);
+static int sosdp_listen(struct sonode *, int, struct cred *);
static int sosdp_connect(struct sonode *, const struct sockaddr *, socklen_t,
- int, int);
-static int sosdp_sendmsg(struct sonode *, struct nmsghdr *, struct uio *);
-static int sosdp_getpeername(struct sonode *);
-static int sosdp_getsockname(struct sonode *);
-static int sosdp_shutdown(struct sonode *, int);
+ int, int, struct cred *);
+static int sosdp_recvmsg(struct sonode *, struct nmsghdr *, struct uio *,
+ struct cred *);
+static int sosdp_sendmsg(struct sonode *, struct nmsghdr *, struct uio *,
+ struct cred *);
+static int sosdp_getpeername(struct sonode *, struct sockaddr *, socklen_t *,
+ boolean_t, struct cred *);
+static int sosdp_getsockname(struct sonode *, struct sockaddr *, socklen_t *,
+ struct cred *);
+static int sosdp_shutdown(struct sonode *, int, struct cred *);
static int sosdp_getsockopt(struct sonode *, int, int, void *, socklen_t *,
- int);
+ int, struct cred *);
static int sosdp_setsockopt(struct sonode *, int, int, const void *,
- socklen_t);
+ socklen_t, struct cred *);
+static int sosdp_ioctl(struct sonode *, int, intptr_t, int, struct cred *,
+ int32_t *);
+static int sosdp_poll(struct sonode *, short, int, short *,
+ struct pollhead **);
+static int sosdp_close(struct sonode *, int, struct cred *);
+void sosdp_fini(struct sonode *, struct cred *);
/*
@@ -80,20 +97,23 @@ static void sdp_sock_xmitted(void *handle, int txqueued);
static void sdp_sock_urgdata(void *handle);
static void sdp_sock_ordrel(void *handle);
-static kmem_cache_t *sosdp_sockcache;
-
sonodeops_t sosdp_sonodeops = {
- sosdp_accept, /* sop_accept */
- sosdp_bind, /* sop_bind */
- sosdp_listen, /* sop_listen */
- sosdp_connect, /* sop_connect */
- sosdp_recvmsg, /* sop_recvmsg */
- sosdp_sendmsg, /* sop_sendmsg */
- sosdp_getpeername, /* sop_getpeername */
- sosdp_getsockname, /* sop_getsockname */
- sosdp_shutdown, /* sop_shutdown */
- sosdp_getsockopt, /* sop_getsockopt */
- sosdp_setsockopt /* sop_setsockopt */
+ sosdp_init, /* sop_init */
+ sosdp_accept, /* sop_accept */
+ sosdp_bind, /* sop_bind */
+ sosdp_listen, /* sop_listen */
+ sosdp_connect, /* sop_connect */
+ sosdp_recvmsg, /* sop_recvmsg */
+ sosdp_sendmsg, /* sop_sendmsg */
+ so_sendmblk_notsupp, /* sop_sendmblk */
+ sosdp_getpeername, /* sop_getpeername */
+ sosdp_getsockname, /* sop_getsockname */
+ sosdp_shutdown, /* sop_shutdown */
+ sosdp_getsockopt, /* sop_getsockopt */
+ sosdp_setsockopt, /* sop_setsockopt */
+ sosdp_ioctl, /* sop_ioctl */
+ sosdp_poll, /* sop_poll */
+ sosdp_close, /* sop_close */
};
sdp_upcalls_t sosdp_sock_upcalls = {
@@ -107,320 +127,57 @@ sdp_upcalls_t sosdp_sock_upcalls = {
sdp_sock_ordrel,
};
-
-/*ARGSUSED*/
+/* ARGSUSED */
static int
-sosdp_sock_constructor(void *buf, void *cdrarg, int kmflags)
-{
- struct sdp_sonode *ss = buf;
- struct sonode *so = &ss->ss_so;
- struct vnode *vp;
-
- ss->ss_type = SOSDP_SOCKET;
- so->so_oobmsg = NULL;
- so->so_ack_mp = NULL;
- so->so_conn_ind_head = NULL;
- so->so_conn_ind_tail = NULL;
- so->so_discon_ind_mp = NULL;
- so->so_ux_bound_vp = NULL;
- so->so_unbind_mp = NULL;
- so->so_ops = NULL;
- so->so_accessvp = NULL;
- so->so_priv = NULL;
-
- so->so_nl7c_flags = 0;
- so->so_nl7c_uri = NULL;
- so->so_nl7c_rcv_mp = NULL;
-
- so->so_direct = NULL;
-
- vp = vn_alloc(kmflags);
- if (vp == NULL) {
- return (-1);
- }
- so->so_vnode = vp;
-
- vn_setops(vp, socksdp_vnodeops);
- vp->v_data = (caddr_t)so;
-
- mutex_init(&so->so_lock, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&so->so_plumb_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&so->so_state_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&so->so_ack_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&so->so_connind_cv, NULL, CV_DEFAULT, NULL);
- cv_init(&so->so_want_cv, NULL, CV_DEFAULT, NULL);
- return (0);
-}
-
-/*ARGSUSED*/
-static void
-sosdp_sock_destructor(void *buf, void *cdrarg)
-{
- struct sdp_sonode *ss = buf;
- struct sonode *so = &ss->ss_so;
- struct vnode *vp = SOTOV(so);
-
- ASSERT(so->so_direct == NULL);
-
- ASSERT(so->so_nl7c_flags == 0);
- ASSERT(so->so_nl7c_uri == NULL);
- ASSERT(so->so_nl7c_rcv_mp == NULL);
-
- ASSERT(so->so_oobmsg == NULL);
- ASSERT(so->so_ack_mp == NULL);
- ASSERT(so->so_conn_ind_head == NULL);
- ASSERT(so->so_conn_ind_tail == NULL);
- ASSERT(so->so_discon_ind_mp == NULL);
- ASSERT(so->so_ux_bound_vp == NULL);
- ASSERT(so->so_unbind_mp == NULL);
- ASSERT(so->so_ops == NULL || so->so_ops == &sosdp_sonodeops);
-
- ASSERT(vn_matchops(vp, socksdp_vnodeops));
- ASSERT(vp->v_data == (caddr_t)so);
-
- vn_free(vp);
-
- mutex_destroy(&so->so_lock);
- mutex_destroy(&so->so_plumb_lock);
- cv_destroy(&so->so_state_cv);
- cv_destroy(&so->so_ack_cv);
- cv_destroy(&so->so_connind_cv);
- cv_destroy(&so->so_want_cv);
-}
-
-
-int
-sosdp_init(void)
-{
- int error;
-
- error = vn_make_ops("socksdp", socksdp_vnodeops_template,
- &socksdp_vnodeops);
- if (error != 0) {
- cmn_err(CE_WARN, "sosdp_init: bad vnode ops template");
- return (error);
- }
-
- sosdp_sockcache = kmem_cache_create("sdpsock",
- sizeof (struct sdp_sonode), 0, sosdp_sock_constructor,
- sosdp_sock_destructor, NULL, NULL, NULL, 0);
- return (0);
-}
-
-static struct vnode *
-sosdp_makevp(struct vnode *accessvp, int domain, int type, int protocol,
- int kmflags)
+sosdp_init(struct sonode *so, struct sonode *pso, struct cred *cr, int flags)
{
- struct sdp_sonode *ss;
- struct sonode *so;
- struct vnode *vp;
- time_t now;
-
- ss = kmem_cache_alloc(sosdp_sockcache, kmflags);
- if (ss == NULL) {
- return (NULL);
- }
- so = &ss->ss_so;
- so->so_cache = sosdp_sockcache;
- so->so_obj = ss;
- vp = SOTOV(so);
- now = gethrestime_sec();
-
- so->so_flag = 0;
- so->so_accessvp = accessvp;
- so->so_dev = accessvp->v_rdev;
-
- so->so_state = 0;
- so->so_mode = 0;
-
- so->so_fsid = sockdev;
- so->so_atime = now;
- so->so_mtime = now;
- so->so_ctime = now;
- so->so_count = 0;
-
- so->so_family = domain;
- so->so_type = type;
- so->so_protocol = protocol;
- so->so_pushcnt = 0;
-
- so->so_options = 0;
- so->so_linger.l_onoff = 0;
- so->so_linger.l_linger = 0;
- so->so_sndbuf = 0;
- so->so_rcvbuf = 0;
- so->so_error = 0;
- so->so_delayed_error = 0;
-
- ASSERT(so->so_oobmsg == NULL);
- so->so_oobcnt = 0;
- so->so_oobsigcnt = 0;
- so->so_pgrp = 0;
- so->so_provinfo = NULL;
-
- so->so_laddr_sa = (struct sockaddr *)&ss->ss_laddr;
- so->so_faddr_sa = (struct sockaddr *)&ss->ss_faddr;
- so->so_laddr_maxlen = so->so_faddr_maxlen = sizeof (ss->ss_laddr);
- so->so_laddr_len = so->so_faddr_len = 0;
- so->so_eaddr_mp = NULL;
- so->so_delayed_error = 0;
-
- so->so_peercred = NULL;
-
- ASSERT(so->so_ack_mp == NULL);
- ASSERT(so->so_conn_ind_head == NULL);
- ASSERT(so->so_conn_ind_tail == NULL);
- ASSERT(so->so_ux_bound_vp == NULL);
- ASSERT(so->so_unbind_mp == NULL);
-
- vn_reinit(vp);
- vp->v_vfsp = rootvfs;
- vp->v_type = VSOCK;
- vp->v_rdev = so->so_dev;
-
- so->so_ops = &sosdp_sonodeops;
-
- ss->ss_rxqueued = 0;
- bzero(&ss->ss_poll_list, sizeof (ss->ss_poll_list));
-
- vn_exists(vp);
- return (vp);
-}
-
-/*
- * Creates a sdp socket data structure.
- * tso is non-NULL if it's passive open.
- */
-struct sonode *
-sosdp_create(vnode_t *accessvp, int domain, int type, int protocol,
- int version, struct sonode *tso, int *errorp)
-{
- struct sonode *so;
- vnode_t *vp;
- int error;
- int soflags;
- cred_t *cr;
-
- dprint(4, ("Inside sosdp_create: domain:%d proto:%d type:%d",
- domain, protocol, type));
-
- if (is_system_labeled()) {
- *errorp = EOPNOTSUPP;
- return (NULL);
- }
-
- if (version == SOV_STREAM) {
- *errorp = EINVAL;
- return (NULL);
- }
- ASSERT(accessvp != NULL);
+ int error = 0;
+ sdp_sockbuf_limits_t sbl;
+ sdp_upcalls_t *upcalls;
- /*
- * We only support one type of SDP socket. Let sotpi_create()
- * handle all other cases, such as raw socket.
- */
- if (!(domain == AF_INET || domain == AF_INET6) ||
- !(type == SOCK_STREAM)) {
- return (sotpi_create(accessvp, domain, type, protocol, version,
- NULL, errorp));
- }
+ if (pso != NULL) {
+ /* passive open, just inherit settings from parent */
- if (tso == NULL) {
- vp = sosdp_makevp(accessvp, domain, type, protocol, KM_SLEEP);
- ASSERT(vp != NULL);
+ mutex_enter(&so->so_lock);
- soflags = FREAD | FWRITE;
- } else {
- vp = sosdp_makevp(accessvp, domain, type, protocol,
- KM_NOSLEEP);
- if (vp == NULL) {
- /*
- * sosdp_makevp() only fails when there is no memory.
- */
- *errorp = ENOMEM;
- return (NULL);
- }
- soflags = FREAD | FWRITE | SO_ACCEPTOR;
- }
- /*
- * This function may be called in interrupt context, and CRED()
- * will be NULL. In this case, pass in kcred to VOP_OPEN().
- */
- if ((cr = CRED()) == NULL)
- cr = kcred;
- if ((error = VOP_OPEN(&vp, soflags, cr, NULL)) != 0) {
- VN_RELE(vp);
- *errorp = error;
- return (NULL);
- }
- so = VTOSO(vp);
+ so->so_state |= (SS_ISBOUND | SS_ISCONNECTED |
+ (pso->so_state & SS_ASYNC));
+ sosdp_so_inherit(pso, so);
+ so->so_proto_props = pso->so_proto_props;
- dprint(2, ("sosdp_create: %p domain %d type %d\n", (void *)so,
- domain, type));
+ mutex_exit(&so->so_lock);
- if (version == SOV_DEFAULT) {
- version = so_default_version;
+ return (0);
}
- so->so_version = (short)version;
- return (so);
-}
+ upcalls = &sosdp_sock_upcalls;
-/*
- * Free SDP socket data structure.
- * Closes incoming connections which were never accepted, frees
- * resources.
- */
-void
-sosdp_free(struct sonode *so)
-{
- struct sonode *nso;
- mblk_t *mp;
+ so->so_proto_handle = (sock_lower_handle_t)sdp_create(so, NULL,
+ so->so_family, SDP_CAN_BLOCK, upcalls, &sbl, cr, &error);
+ if (so->so_proto_handle == NULL)
+ return (ENOMEM);
- dprint(3, ("sosdp_free: so:%p priv:%p", (void *)so, so->so_priv));
+ so->so_rcvbuf = sbl.sbl_rxbuf;
+ so->so_rcvlowat = sbl.sbl_rxlowat;
+ so->so_sndbuf = sbl.sbl_txbuf;
+ so->so_sndlowat = sbl.sbl_txlowat;
- mutex_enter(&so->so_lock);
-
- /*
- * Need to clear these out so that sockfree() doesn't think that
- * there's memory in need of free'ing.
- */
- so->so_laddr_sa = so->so_faddr_sa = NULL;
- so->so_laddr_len = so->so_laddr_maxlen = 0;
- so->so_faddr_len = so->so_faddr_maxlen = 0;
-
- while ((mp = so->so_conn_ind_head) != NULL) {
- so->so_conn_ind_head = mp->b_next;
- mutex_exit(&so->so_lock);
- mp->b_next = NULL;
- nso = *(struct sonode **)mp->b_rptr;
-
- (void) VOP_CLOSE(SOTOV(nso), 0, 1, 0, CRED(), NULL);
- vn_invalid(SOTOV(nso));
- VN_RELE(SOTOV(nso));
-
- freeb(mp);
- mutex_enter(&so->so_lock);
- }
- so->so_conn_ind_tail = NULL;
- so->so_state &= ~SS_HASCONNIND;
- mutex_exit(&so->so_lock);
-
- sockfree(so);
+ return (error);
}
/*
* Accept incoming connection.
*/
+/* ARGSUSED */
static int
-sosdp_accept(struct sonode *lso, int fflag, struct sonode **nsop)
+sosdp_accept(struct sonode *lso, int fflag, struct cred *cr,
+ struct sonode **nsop)
{
int error = 0;
- mblk_t *mp;
struct sonode *nso;
- dprint(3, ("sosdp_accept: so:%p priv:%p", (void *)lso,
- lso->so_priv));
+ dprint(3, ("sosdp_accept: so:%p so_proto_handle:%p", (void *)lso,
+ (void *)lso->so_proto_handle));
if (!(lso->so_state & SS_ACCEPTCONN)) {
/*
@@ -429,50 +186,36 @@ sosdp_accept(struct sonode *lso, int fflag, struct sonode **nsop)
eprintsoline(lso, EINVAL);
return (EINVAL);
}
-
/*
* Returns right away if socket is nonblocking.
*/
- error = sowaitconnind(lso, fflag, &mp);
+ error = so_acceptq_dequeue(lso, (fflag & (FNONBLOCK|FNDELAY)), &nso);
if (error != 0) {
eprintsoline(lso, error);
- dprint(4, ("sosdp_accept: failed <%d>:lso:%p prv:%p",
- error, (void *)lso, lso->so_priv));
+ dprint(4, ("sosdp_accept: failed %d:lso:%p so_proto_handle:%p",
+ error, (void *)lso, (void *)lso->so_proto_handle));
return (error);
}
- nso = *(struct sonode **)mp->b_rptr;
- freeb(mp);
-
- mutex_enter(&lso->so_lock);
- ASSERT(SOTOSDO(lso)->ss_rxqueued > 0);
- --SOTOSDO(lso)->ss_rxqueued;
- mutex_exit(&lso->so_lock);
-
-
- /*
- * accept() needs remote address right away.
- */
- (void) sosdp_getpeername(nso);
dprint(2, ("sosdp_accept: new %p\n", (void *)nso));
-
*nsop = nso;
+
return (0);
}
/*
* Bind local endpoint.
*/
+/* ARGSUSED */
int
sosdp_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
- int flags)
+ int flags, struct cred *cr)
{
- int error = 0;
+ int error = 0;
if (!(flags & _SOBIND_LOCK_HELD)) {
mutex_enter(&so->so_lock);
so_lock_single(so); /* Set SOLOCKED */
- /* LINTED - statement has no conseq */
} else {
ASSERT(MUTEX_HELD(&so->so_lock));
ASSERT(so->so_flag & SOLOCKED);
@@ -487,6 +230,7 @@ sosdp_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
eprintsoline(so, error);
goto done;
}
+
/*
* X/Open requires this check
*/
@@ -496,16 +240,17 @@ sosdp_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
}
/*
- * Protocol module does address family checks.
+ * Protocol module does address family checks
*/
mutex_exit(&so->so_lock);
- error = sdp_bind(so->so_priv, name, namelen);
+ error = sdp_bind((struct sdp_conn_struct_t *)so->so_proto_handle,
+ name, namelen);
mutex_enter(&so->so_lock);
+
if (error == 0) {
so->so_state |= SS_ISBOUND;
- /* LINTED - statement has no conseq */
} else {
eprintsoline(so, error);
}
@@ -513,7 +258,6 @@ done:
if (!(flags & _SOBIND_LOCK_HELD)) {
so_unlock_single(so, SOLOCKED);
mutex_exit(&so->so_lock);
- /* LINTED - statement has no conseq */
} else {
/* If the caller held the lock don't release it here */
ASSERT(MUTEX_HELD(&so->so_lock));
@@ -525,12 +269,12 @@ done:
/*
* Turn socket into a listen socket.
*/
+/* ARGSUSED */
static int
-sosdp_listen(struct sonode *so, int backlog)
+sosdp_listen(struct sonode *so, int backlog, struct cred *cr)
{
int error = 0;
-
mutex_enter(&so->so_lock);
so_lock_single(so);
@@ -541,30 +285,9 @@ sosdp_listen(struct sonode *so, int backlog)
if (so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED |
SS_ISDISCONNECTING | SS_CANTRCVMORE | SS_CANTSENDMORE)) {
error = EINVAL;
- eprintsoline(so, error);
+ eprintsoline(so, EINVAL);
goto done;
}
-
- if (backlog < 0) {
- backlog = 0;
- }
-
- /*
- * Use the same qlimit as in BSD. BSD checks the qlimit
- * before queuing the next connection implying that a
- * listen(sock, 0) allows one connection to be queued.
- * BSD also uses 1.5 times the requested backlog.
- *
- * XNS Issue 4 required a strict interpretation of the backlog.
- * This has been waived subsequently for Issue 4 and the change
- * incorporated in XNS Issue 5. So we aren't required to do
- * anything special for XPG apps.
- */
- if (backlog >= (INT_MAX - 1) / 3)
- backlog = INT_MAX;
- else
- backlog = backlog * 3 / 2 + 1;
-
/*
* If listen() is only called to change backlog, we don't
* need to notify protocol module.
@@ -576,13 +299,13 @@ sosdp_listen(struct sonode *so, int backlog)
mutex_exit(&so->so_lock);
- error = sdp_listen(so->so_priv, backlog);
+ error = sdp_listen((struct sdp_conn_struct_t *)so->so_proto_handle,
+ backlog);
mutex_enter(&so->so_lock);
if (error == 0) {
- so->so_state |= (SS_ACCEPTCONN|SS_ISBOUND);
+ so->so_state |= (SS_ACCEPTCONN | SS_ISBOUND);
so->so_backlog = backlog;
- /* LINTED - statement has no conseq */
} else {
eprintsoline(so, error);
}
@@ -599,13 +322,9 @@ done:
/*ARGSUSED*/
static int
sosdp_connect(struct sonode *so, const struct sockaddr *name,
- socklen_t namelen, int fflag, int flags)
+ socklen_t namelen, int fflag, int flags, struct cred *cr)
{
- int error;
-
- ASSERT(so->so_type == SOCK_STREAM);
- dprint(3, ("sosdp_connect: so:%p priv:%p", (void *)so,
- so->so_priv));
+ int error = 0;
mutex_enter(&so->so_lock);
so_lock_single(so);
@@ -627,10 +346,10 @@ sosdp_connect(struct sonode *so, const struct sockaddr *name,
}
/*
- * Check for failure of an earlier call
+ * check for failure of an earlier call
*/
if (so->so_error != 0) {
- error = sogeterr(so);
+ error = sogeterr(so, B_TRUE);
eprintsoline(so, error);
goto done;
}
@@ -647,24 +366,28 @@ sosdp_connect(struct sonode *so, const struct sockaddr *name,
goto done;
}
if (name == NULL || namelen == 0) {
error = EINVAL;
eprintsoline(so, error);
goto done;
}
soisconnecting(so);
-
mutex_exit(&so->so_lock);
- error = sdp_connect(so->so_priv, name, namelen);
+ error = sdp_connect((struct sdp_conn_struct_t *)so->so_proto_handle,
+ name, namelen);
+
mutex_enter(&so->so_lock);
if (error == 0) {
/*
* Allow other threads to access the socket
*/
- error = sosdp_waitconnected(so, fflag);
- dprint(4, ("sosdp_connect: wait on so:%p priv:%p failed:%d",
- (void *)so, so->so_priv, error));
+ error = sowaitconnected(so, fflag, 0);
+ dprint(4,
+ ("sosdp_connect: wait on so:%p "
+ "so_proto_handle:%p failed:%d",
+ (void *)so, (void *)so->so_proto_handle, error));
}
+
switch (error) {
case 0:
case EINPROGRESS:
@@ -684,12 +406,13 @@ done:
return (error);
}
-
/*
* Receive data.
*/
+/* ARGSUSED */
int
-sosdp_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
+sosdp_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
+ struct cred *cr)
{
int flags, error = 0;
int size;
@@ -735,7 +458,9 @@ sosdp_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
if (uiop->uio_fmode & (FNDELAY|FNONBLOCK)) {
flags |= MSG_DONTWAIT;
}
- error = sdp_recv(so->so_priv, msg, size, flags, uiop);
+ error = sdp_recv(
+ (struct sdp_conn_struct_t *)so->so_proto_handle, msg,
+ size, flags, uiop);
} else {
msg->msg_controllen = 0;
msg->msg_namelen = 0;
@@ -750,8 +475,10 @@ done:
/*
* Send message.
*/
+/* ARGSUSED */
static int
-sosdp_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
+sosdp_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
+ struct cred *cr)
{
int flags;
ssize_t count;
@@ -759,8 +486,8 @@ sosdp_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
ASSERT(so->so_type == SOCK_STREAM);
- dprint(4, ("sosdp_sendmsg: so:%p priv:%p",
- (void *)so, so->so_priv));
+ dprint(4, ("sosdp_sendmsg: so:%p so_proto_handle:%p",
+ (void *)so, (void *)so->so_proto_handle));
flags = msg->msg_flags;
@@ -771,12 +498,11 @@ sosdp_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
mutex_enter(&so->so_lock);
if (so->so_state & SS_CANTSENDMORE) {
mutex_exit(&so->so_lock);
- tsignal(curthread, SIGPIPE);
return (EPIPE);
}
if (so->so_error != 0) {
- error = sogeterr(so);
+ error = sogeterr(so, B_TRUE);
mutex_exit(&so->so_lock);
return (error);
}
@@ -794,93 +520,83 @@ sosdp_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
}
mutex_exit(&so->so_lock);
- error = sdp_send(so->so_priv, msg, count, flags, uiop);
- if (error == 0)
- return (0);
+ error = sdp_send((struct sdp_conn_struct_t *)so->so_proto_handle,
+ msg, count, flags, uiop);
- mutex_enter(&so->so_lock);
- if ((error == EPIPE) && (so->so_state & SS_CANTSENDMORE)) {
- /*
- * We received shutdown between the time lock was
- * lifted and call to sdp_sendmsg().
- */
- mutex_exit(&so->so_lock);
- tsignal(curthread, SIGPIPE);
- return (EPIPE);
- }
- mutex_exit(&so->so_lock);
return (error);
}
-
/*
* Get address of remote node.
*/
+/* ARGSUSED */
static int
-sosdp_getpeername(struct sonode *so)
+sosdp_getpeername(struct sonode *so, struct sockaddr *addr, socklen_t *addrlen,
+ boolean_t accept, struct cred *cr)
{
- int error;
-
- if (!(so->so_state & SS_ISCONNECTED)) {
- error = ENOTCONN;
+ if (!accept && !(so->so_state & SS_ISCONNECTED)) {
+ return (ENOTCONN);
} else {
- error = sdp_getpeername(so->so_priv, so->so_faddr_sa,
- &so->so_faddr_len);
+ return (sdp_getpeername(
+ (struct sdp_conn_struct_t *)so->so_proto_handle,
+ addr, addrlen));
}
- return (error);
}
/*
* Get local address.
*/
+/* ARGSUSED */
static int
-sosdp_getsockname(struct sonode *so)
+sosdp_getsockname(struct sonode *so, struct sockaddr *addr, socklen_t *addrlen,
+ struct cred *cr)
{
- int error;
-
mutex_enter(&so->so_lock);
+
if (!(so->so_state & SS_ISBOUND)) {
/*
* Zero address, except for address family
*/
- bzero(so->so_laddr_sa, so->so_laddr_maxlen);
-
- so->so_laddr_len = (so->so_family == AF_INET6) ?
- sizeof (struct sockaddr_in6) : sizeof (struct sockaddr_in);
- so->so_laddr_sa->sa_family = so->so_family;
- error = 0;
+ if (so->so_family == AF_INET || so->so_family == AF_INET6) {
+ bzero(addr, *addrlen);
+ *addrlen = (so->so_family == AF_INET6) ?
+ sizeof (struct sockaddr_in6) :
+ sizeof (struct sockaddr_in);
+ addr->sa_family = so->so_family;
+ }
mutex_exit(&so->so_lock);
+ return (0);
} else {
mutex_exit(&so->so_lock);
-
- error = sdp_getsockname(so->so_priv, so->so_laddr_sa,
- &so->so_laddr_len);
+ return (sdp_getsockname(
+ (struct sdp_conn_struct_t *)so->so_proto_handle,
+ addr, addrlen));
}
-
- return (error);
}
/*
* Called from shutdown().
*/
+/* ARGSUSED */
static int
-sosdp_shutdown(struct sonode *so, int how)
+sosdp_shutdown(struct sonode *so, int how, struct cred *cr)
{
- struct sdp_sonode *ss = SOTOSDO(so);
uint_t state_change;
int error = 0;
- short wakesig = 0;
mutex_enter(&so->so_lock);
so_lock_single(so);
-
/*
* Record the current state and then perform any state changes.
* Then use the difference between the old and new states to
* determine which needs to be done.
*/
state_change = so->so_state;
+ if (!(state_change & SS_ISCONNECTED)) {
+ error = ENOTCONN;
+ goto done;
+ }
switch (how) {
case SHUT_RD:
@@ -900,21 +616,16 @@ sosdp_shutdown(struct sonode *so, int how)
state_change = so->so_state & ~state_change;
- if (state_change & SS_CANTRCVMORE) {
- wakesig = POLLIN|POLLRDNORM;
- sosdp_sendsig(ss, SDPSIG_READ);
- }
if (state_change & SS_CANTSENDMORE) {
- wakesig |= POLLOUT;
so->so_state |= SS_ISDISCONNECTING;
}
- mutex_exit(&so->so_lock);
-
- pollwakeup(&ss->ss_poll_list, wakesig);
+ so_notify_shutdown(so);
if (state_change & SS_CANTSENDMORE) {
- error = sdp_shutdown(so->so_priv, how);
+ error = sdp_shutdown(
+ (struct sdp_conn_struct_t *)so->so_proto_handle, how);
}
+
mutex_enter(&so->so_lock);
done:
so_unlock_single(so, SOLOCKED);
@@ -935,7 +646,7 @@ done:
/*ARGSUSED*/
static int
sosdp_getsockopt(struct sonode *so, int level, int option_name,
- void *optval, socklen_t *optlenp, int flags)
+ void *optval, socklen_t *optlenp, int flags, struct cred *cr)
{
int error = 0;
void *option = NULL;
@@ -987,7 +698,7 @@ sosdp_getsockopt(struct sonode *so, int level, int option_name,
goto copyout;
case SO_ERROR:
- value = sogeterr(so);
+ value = sogeterr(so, B_TRUE);
goto copyout;
case SO_ACCEPTCONN:
@@ -1045,7 +756,8 @@ sosdp_getsockopt(struct sonode *so, int level, int option_name,
}
optlen = maxlen;
mutex_exit(&so->so_lock);
- error = sdp_get_opt(so->so_priv, level, option_name, optbuf, &optlen);
+ error = sdp_get_opt((struct sdp_conn_struct_t *)so->so_proto_handle,
+ level, option_name, optbuf, &optlen);
mutex_enter(&so->so_lock);
ASSERT(optlen <= maxlen);
if (error != 0) {
@@ -1078,43 +790,35 @@ done:
/*
* Set socket options
*/
+/* ARGSUSED */
static int
sosdp_setsockopt(struct sonode *so, int level, int option_name,
- const void *optval, t_uscalar_t optlen)
+ const void *optval, t_uscalar_t optlen, struct cred *cr)
{
- int error;
void *conn = NULL;
+ int error = 0;
-
- /* X/Open requires this check */
if (so->so_state & SS_CANTSENDMORE) {
return (EINVAL);
}
- /* Caller allocates aligned optval, or passes null */
- ASSERT(((uintptr_t)optval & (sizeof (t_scalar_t) - 1)) == 0);
-
- /* No SDP options should be zero-length */
- if (optlen == 0) {
- error = EINVAL;
- eprintsoline(so, error);
- return (error);
- }
-
mutex_enter(&so->so_lock);
so_lock_single(so);
if (so->so_type == SOCK_STREAM) {
- conn = so->so_priv;
+ conn = (void *)so->so_proto_handle;
}
dprint(2, ("sosdp_setsockopt (%d) - conn %p %d %d \n",
so->so_type, conn, level, option_name));
+
if (conn != NULL) {
mutex_exit(&so->so_lock);
- error = sdp_set_opt(conn, level, option_name, optval, optlen);
+ error = sdp_set_opt((struct sdp_conn_struct_t *)conn, level,
+ option_name, optval, optlen);
mutex_enter(&so->so_lock);
}
+
/*
* Check for SOL_SOCKET options and record their values.
* If we know about a SOL_SOCKET parameter and the transport
@@ -1244,6 +948,239 @@ done:
return (error);
}
+/*
+ * ioctl entry point for SDP sockets.
+ *
+ * FIONBIO, FIOASYNC, SIOCSPGRP/FIOSETOWN, SIOCGPGRP/FIOGETOWN and FIONREAD
+ * are handled here; SIOCATMARK and SIOCSENABLESDP are handed to the SDP
+ * transport through sdp_ioctl().
+ *
+ * arg is the address of the ioctl argument.  Every transfer through
+ * so_copyin()/so_copyout() is passed (mode & FKIOCTL) along with it.
+ * rvalp is not used.
+ *
+ * Returns 0 on success, EFAULT when the argument cannot be copied in or
+ * out, the error reported by socket_chgpgrp() or sdp_ioctl() when one of
+ * those fails, and EINVAL for any command not listed above.
+ */
+/* ARGSUSED */
+static int
+sosdp_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
+    struct cred *cr, int32_t *rvalp)
+{
+	int32_t value;
+	int error, intval;
+	pid_t pid;
+
+	/* handle socket specific ioctls */
+	switch (cmd) {
+	case FIONBIO:
+		if (so_copyin((void *)arg, &value, sizeof (int32_t),
+		    (mode & (int)FKIOCTL))) {
+			return (EFAULT);
+		}
+		mutex_enter(&so->so_lock);
+		if (value != 0) {
+			so->so_state |= SS_NDELAY;
+		} else {
+			so->so_state &= ~SS_NDELAY;
+		}
+		mutex_exit(&so->so_lock);
+		return (0);
+
+	case FIOASYNC:
+		if (so_copyin((void *)arg, &value, sizeof (int32_t),
+		    (mode & (int)FKIOCTL))) {
+			return (EFAULT);
+		}
+		mutex_enter(&so->so_lock);
+
+		if (value) {
+			/* Turn on SIGIO */
+			so->so_state |= SS_ASYNC;
+		} else {
+			/* Turn off SIGIO */
+			so->so_state &= ~SS_ASYNC;
+		}
+		mutex_exit(&so->so_lock);
+		return (0);
+
+	case SIOCSPGRP:
+	case FIOSETOWN:
+		if (so_copyin((void *)arg, &pid, sizeof (pid_t),
+		    (mode & (int)FKIOCTL))) {
+			return (EFAULT);
+		}
+		mutex_enter(&so->so_lock);
+
+		/* Only call socket_chgpgrp() when the owner really changes. */
+		error = (pid != so->so_pgrp) ? socket_chgpgrp(so, pid) : 0;
+		mutex_exit(&so->so_lock);
+		return (error);
+
+	case SIOCGPGRP:
+	case FIOGETOWN:
+		if (so_copyout(&so->so_pgrp, (void *)arg,
+		    sizeof (pid_t), (mode & (int)FKIOCTL)))
+			return (EFAULT);
+		return (0);
+
+	case SIOCATMARK:
+		intval = 0;
+		error = sdp_ioctl(
+		    (struct sdp_conn_struct_t *)so->so_proto_handle, cmd,
+		    &intval, cr);
+		/*
+		 * Report a transport failure to the caller instead of
+		 * copying out a value the transport never filled in.
+		 */
+		if (error != 0)
+			return (error);
+		if (so_copyout(&intval, (void *)arg, sizeof (int),
+		    (mode & (int)FKIOCTL)))
+			return (EFAULT);
+		return (0);
+
+
+	case SIOCSENABLESDP: {
+		int32_t enable;
+
+		/*
+		 * System wide enable SDP
+		 */
+
+		if (so_copyin((void *)arg, &enable, sizeof (int32_t),
+		    mode & (int)FKIOCTL))
+			return (EFAULT);
+
+		error = sdp_ioctl(
+		    (struct sdp_conn_struct_t *)so->so_proto_handle, cmd,
+		    &enable, cr);
+		/* Do not claim success when the transport rejected it. */
+		if (error != 0)
+			return (error);
+		if (so_copyout(&enable, (void *)arg,
+		    sizeof (int32_t), (mode & (int)FKIOCTL)))
+			return (EFAULT);
+		return (0);
+	}
+	/* from strioctl */
+	case FIONREAD:
+		/*
+		 * Return number of bytes of data in all data messages
+		 * in queue in "arg".
+		 * For stream socket, amount of available data.
+		 */
+		if (so->so_state & SS_ACCEPTCONN) {
+			/* A listening socket never has readable data. */
+			intval = 0;
+		} else {
+			mutex_enter(&so->so_lock);
+			intval = sdp_polldata(
+			    (struct sdp_conn_struct_t *)so->so_proto_handle,
+			    SDP_READ);
+			mutex_exit(&so->so_lock);
+		}
+		if (so_copyout(&intval, (void *)arg, sizeof (intval),
+		    (mode & (int)FKIOCTL)))
+			return (EFAULT);
+		return (0);
+	default:
+		return (EINVAL);
+	}
+}
+
+/*
+ * Poll entry point for SDP sockets.
+ *
+ * so_lock is deliberately not taken here; see socktpi_poll() for the
+ * reasoning.
+ *
+ * The events that are currently true are stored in *reventsp.  When
+ * none are and anyyet is zero, *phpp is set to the socket's poll list.
+ * Always returns 0.
+ */
+static int
+sosdp_poll(struct sonode *so, short events, int anyyet, short *reventsp,
+    struct pollhead **phpp)
+{
+	short wanted = events;
+	int state = so->so_state;
+
+	ASSERT(so->so_version != SOV_STREAM);
+
+	/* A stream socket that is not connected yet cannot be written to. */
+	if ((so->so_type == SOCK_STREAM) && !(state & SS_ISCONNECTED))
+		wanted &= ~(POLLOUT|POLLWRBAND);
+
+	/*
+	 * A pending error is reported against the events the caller
+	 * originally asked for, not the filtered set.
+	 */
+	if (so->so_error != 0 &&
+	    (events & (POLLIN|POLLRDNORM|POLLOUT)) != 0) {
+		*reventsp = events & (POLLIN|POLLRDNORM|POLLOUT);
+		return (0);
+	}
+
+	*reventsp = 0;
+
+	/* Only stream sockets have any state to report. */
+	if (so->so_type == SOCK_STREAM) {
+		/*
+		 * Writable only once the queued TX data has dropped
+		 * below the watermark.
+		 */
+		if (sdp_polldata(
+		    (struct sdp_conn_struct_t *)so->so_proto_handle,
+		    SDP_XMIT)) {
+			*reventsp |= wanted & POLLOUT;
+		}
+
+		if (sdp_polldata(
+		    (struct sdp_conn_struct_t *)so->so_proto_handle,
+		    SDP_READ)) {
+			*reventsp |= wanted & (POLLIN|POLLRDNORM);
+		}
+
+		/* No more data can arrive, or a connection is queued. */
+		if ((state & SS_CANTRCVMORE) ||
+		    (so->so_acceptq_head != NULL)) {
+			*reventsp |= wanted & (POLLIN|POLLRDNORM);
+		}
+	}
+
+	if (!*reventsp && !anyyet)
+		*phpp = &so->so_poll_list;
+
+	return (0);
+}
+
+/*
+ * Close an SDP socket: mark it disconnected, ask the transport to
+ * disconnect, then post the disconnect notification.
+ *
+ * flag is passed through to sdp_disconnect(); cr is not used.
+ * Returns whatever sdp_disconnect() returned.
+ */
+/* ARGSUSED */
+static int
+sosdp_close(struct sonode *so, int flag, struct cred *cr)
+{
+ int error = 0;
+
+ mutex_enter(&so->so_lock);
+ so_lock_single(so);
+ /*
+ * Need to set flags as there might be ops in progress on
+ * this socket.
+ *
+ * If socket already disconnected/disconnecting,
+ * don't send signal (again).
+ */
+ soisdisconnected(so, 0);
+ mutex_exit(&so->so_lock);
+
+ /*
+ * Initiate connection shutdown.
+ * so_lock is not held across this call into the transport.
+ */
+ error = sdp_disconnect((struct sdp_conn_struct_t *)so->so_proto_handle,
+ flag);
+
+ /*
+ * NOTE(review): so_lock is re-taken here and never released
+ * explicitly; so_notify_disconnected() presumably drops it
+ * before returning -- confirm against its definition.
+ */
+ mutex_enter(&so->so_lock);
+ so_unlock_single(so, SOLOCKED);
+ so_notify_disconnected(so, error);
+
+ return (error);
+}
+
+/*
+ * Final teardown of an SDP sonode: close the transport connection if one
+ * is still attached, flush the accept queue, and hand the sonode to
+ * sonode_fini().  cr is not used.
+ */
+/* ARGSUSED */
+void
+sosdp_fini(struct sonode *so, struct cred *cr)
+{
+ dprint(3, ("sosdp_fini: so:%p so_proto_handle:%p", (void *)so,
+ (void *)so->so_proto_handle));
+
+ ASSERT(so->so_ops == &sosdp_sonodeops);
+
+ /* Close the connection and clear the handle so it is not reused. */
+ if (so->so_proto_handle != NULL)
+ sdp_close((struct sdp_conn_struct_t *)so->so_proto_handle);
+ so->so_proto_handle = NULL;
+
+ mutex_enter(&so->so_lock);
+
+ /* The accept queue is flushed with so_lock held. */
+ so_acceptq_flush(so);
+
+ mutex_exit(&so->so_lock);
+
+ sonode_fini(so);
+}
+
/*
* Upcalls from SDP
*/
@@ -1254,83 +1191,37 @@ done:
static void *
sdp_sock_newconn(void *parenthandle, void *connind)
{
- struct sdp_sonode *lss = parenthandle;
- struct sonode *lso = &lss->ss_so;
+ struct sonode *lso = parenthandle;
struct sonode *nso;
- struct sdp_sonode *nss;
- mblk_t *mp;
int error;
ASSERT(lso->so_state & SS_ACCEPTCONN);
- ASSERT(lso->so_priv != NULL); /* closed conn */
+ ASSERT(lso->so_proto_handle != NULL); /* closed conn */
ASSERT(lso->so_type == SOCK_STREAM);
- dprint(3, ("sosdp_newconn A: so:%p priv:%p", (void *)lso,
- lso->so_priv));
+ dprint(3, ("sosdp_newconn A: so:%p so_proto_handle:%p", (void *)lso,
+ (void *)lso->so_proto_handle));
/*
* Check current # of queued conns against backlog
*/
- if (lss->ss_rxqueued >= lso->so_backlog) {
- return (NULL);
- }
-
- /*
- * Need to create a new socket.
- */
- mp = allocb(sizeof (connind), BPRI_MED);
- if (mp == NULL) {
- eprintsoline(lso, ENOMEM);
+ if (lso->so_rcv_queued >= lso->so_backlog) {
return (NULL);
}
- DB_TYPE(mp) = M_PROTO;
- VN_HOLD(lso->so_accessvp);
- nso = sosdp_create(lso->so_accessvp, lso->so_family, lso->so_type,
- lso->so_protocol, lso->so_version, lso, &error);
+ nso = socket_newconn(lso, connind, NULL, SOCKET_NOSLEEP, &error);
if (nso == NULL) {
- VN_RELE(lso->so_accessvp);
- freeb(mp);
eprintsoline(lso, error);
return (NULL);
}
dprint(2, ("sdp_stream_newconn: new %p\n", (void *)nso));
- nss = SOTOSDO(nso);
-
- /*
- * Inherit socket properties
- */
- mutex_enter(&lso->so_lock);
- mutex_enter(&nso->so_lock);
- nso->so_state |= (SS_ISBOUND | SS_ISCONNECTED |
- (lso->so_state & SS_ASYNC));
- sosdp_so_inherit(lss, nss);
- nso->so_priv = connind;
-
- mutex_exit(&nso->so_lock);
-
- ++lss->ss_rxqueued;
- mutex_exit(&lso->so_lock);
-
- /*
- * Copy pointer to new socket to connind queue message
- */
- *(struct sonode **)mp->b_wptr = nso;
- mp->b_wptr += sizeof (nso);
-
- /*
- * Wake people who're waiting incoming conns. Note that
- * soqueueconnind gets so_lock.
- */
- soqueueconnind(lso, mp);
- pollwakeup(&lss->ss_poll_list, POLLIN|POLLRDNORM);
+ (void) so_acceptq_enqueue(lso, nso);
mutex_enter(&lso->so_lock);
- sosdp_sendsig(lss, SDPSIG_READ);
- mutex_exit(&lso->so_lock);
- return (nss);
+ so_notify_newconn(lso);
+ return (nso);
}
/*
@@ -1339,26 +1230,19 @@ sdp_sock_newconn(void *parenthandle, void *connind)
static void
sdp_sock_connected(void *handle)
{
- struct sdp_sonode *ss = handle;
- struct sonode *so = &ss->ss_so;
+ struct sonode *so = handle;
ASSERT(so->so_type == SOCK_STREAM);
- dprint(3, ("sosdp_connected C: so:%p priv:%p", (void *)so,
- so->so_priv));
+ dprint(3, ("sosdp_connected C: so:%p so_proto_handle:%p", (void *)so,
+ (void *)so->so_proto_handle));
mutex_enter(&so->so_lock);
- ASSERT(so->so_priv); /* closed conn */
+ ASSERT(so->so_proto_handle); /* closed conn */
ASSERT(!(so->so_state & SS_ACCEPTCONN));
soisconnected(so);
- sosdp_sendsig(ss, SDPSIG_WRITE);
- mutex_exit(&so->so_lock);
-
- /*
- * Wake ones who're waiting for conn to become established.
- */
- pollwakeup(&ss->ss_poll_list, POLLOUT);
+ so_notify_connected(so);
}
/*
@@ -1368,32 +1252,17 @@ sdp_sock_connected(void *handle)
static void
sdp_sock_disconnected(void *handle, int error)
{
- int event = 0;
- struct sdp_sonode *ss = handle;
- struct sonode *so = &ss->ss_so;
+ struct sonode *so = handle;
ASSERT(so->so_type == SOCK_STREAM);
- dprint(2, ("sosdp_disconnected C: so:%p priv:%p error:%d",
- (void *)so, so->so_priv, error));
+ dprint(2, ("sosdp_disconnected C: so:%p so_proto_handle:%p error:%d",
+ (void *)so, (void *)so->so_proto_handle, error));
mutex_enter(&so->so_lock);
- ASSERT(so->so_priv != NULL); /* closed conn */
-
- /*
- * If socket is already disconnected/disconnecting,
- * don't (re)send signal.
- */
- if (!(so->so_state & SS_CANTRCVMORE))
- event |= SDPSIG_READ;
- if (!(so->so_state & SS_CANTSENDMORE))
- event |= SDPSIG_WRITE;
- if (event != 0)
- sosdp_sendsig(ss, event);
+ ASSERT(so->so_proto_handle != NULL); /* closed conn */
soisdisconnected(so, error);
- mutex_exit(&so->so_lock);
-
- pollwakeup(&ss->ss_poll_list, POLLIN|POLLRDNORM|POLLOUT);
+ so_notify_disconnected(so, error);
}
/*
@@ -1403,15 +1272,12 @@ sdp_sock_disconnected(void *handle, int error)
static int
sdp_sock_recv(void *handle, mblk_t *mp, int flags)
{
- struct sdp_sonode *ss = handle;
- struct sonode *so = &ss->ss_so;
+ struct sonode *so = handle;
ASSERT(so->so_type == SOCK_STREAM);
mutex_enter(&so->so_lock);
- sosdp_sendsig(ss, SDPSIG_READ);
- mutex_exit(&so->so_lock);
- pollwakeup(&ss->ss_poll_list, POLLIN|POLLRDNORM);
+ so_notify_data(so, 0);
return (so->so_rcvbuf);
}
@@ -1422,13 +1288,12 @@ sdp_sock_recv(void *handle, mblk_t *mp, int flags)
static void
sdp_sock_xmitted(void *handle, int writeable)
{
- struct sdp_sonode *ss = handle;
- struct sonode *so = &ss->ss_so;
+ struct sonode *so = handle;
- dprint(4, ("sosdp_sock_xmitted: so:%p priv:%p txq:%d",
- (void *)so, so->so_priv, writeable));
+ dprint(4, ("sosdp_sock_xmitted: so:%p so_proto_handle:%p txq:%d",
+ (void *)so, (void *)so->so_proto_handle, writeable));
mutex_enter(&so->so_lock);
- ASSERT(so->so_priv != NULL); /* closed conn */
+ ASSERT(so->so_proto_handle != NULL); /* closed conn */
/*
@@ -1436,9 +1301,7 @@ sdp_sock_xmitted(void *handle, int writeable)
* watermark.
*/
if (!writeable) {
- sosdp_sendsig(ss, SDPSIG_WRITE);
- mutex_exit(&so->so_lock);
- pollwakeup(&ss->ss_poll_list, POLLOUT);
+ so_notify_writable(so);
} else {
mutex_exit(&so->so_lock);
}
@@ -1451,16 +1314,14 @@ sdp_sock_xmitted(void *handle, int writeable)
static void
sdp_sock_urgdata(void *handle)
{
- struct sdp_sonode *ss = handle;
-
- ASSERT(ss->ss_so.so_type == SOCK_STREAM);
+ struct sonode *so = handle;
- mutex_enter(&ss->ss_so.so_lock);
+ ASSERT(so->so_type == SOCK_STREAM);
- ASSERT(ss->ss_so.so_priv != NULL); /* closed conn */
- sosdp_sendsig(ss, SDPSIG_URG);
+ mutex_enter(&so->so_lock);
- mutex_exit(&ss->ss_so.so_lock);
+ ASSERT(so->so_proto_handle != NULL); /* closed conn */
+ so_notify_oobsig(so);
}
/*
@@ -1469,31 +1330,26 @@ sdp_sock_urgdata(void *handle)
static void
sdp_sock_ordrel(void *handle)
{
- struct sdp_sonode *ss = handle;
- /* LINTED */
- struct sonode *so = &ss->ss_so;
-
- ASSERT(ss->ss_so.so_type == SOCK_STREAM);
-
- dprint(4, ("sdp_sock_ordrel : so:%p, priv:%p",
- (void *)so, so->so_priv));
- mutex_enter(&ss->ss_so.so_lock);
- socantrcvmore(&ss->ss_so);
- mutex_exit(&ss->ss_so.so_lock);
- pollwakeup(&ss->ss_poll_list, POLLIN|POLLRDNORM);
+ struct sonode *so = handle;
+
+ ASSERT(so->so_type == SOCK_STREAM);
+
+ dprint(4, ("sdp_sock_ordrel : so:%p, so_proto_handle:%p",
+ (void *)so, (void *)so->so_proto_handle));
+ mutex_enter(&so->so_lock);
+ socantrcvmore(so);
+ so_notify_eof(so);
}
static void
sdp_sock_connfail(void *handle, int error)
{
+ struct sonode *so = handle;
- struct sdp_sonode *ss = handle;
- struct sonode *so = &ss->ss_so;
-
- dprint(3, ("sosdp_conn Failed: so:%p priv:%p", (void *)so,
- so->so_priv));
+ dprint(3, ("sosdp_conn Failed: so:%p so_proto_handle:%p", (void *)so,
+ (void *)so->so_proto_handle));
mutex_enter(&so->so_lock);
- ASSERT(so->so_priv != NULL); /* closed conn */
+ ASSERT(so->so_proto_handle != NULL); /* closed conn */
so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
so->so_error = (ushort_t)error;
mutex_exit(&so->so_lock);
diff --git a/usr/src/uts/common/inet/sockmods/socksdp.h b/usr/src/uts/common/inet/sockmods/socksdp.h
new file mode 100644
index 0000000000..ba6bd109e8
--- /dev/null
+++ b/usr/src/uts/common/inet/sockmods/socksdp.h
@@ -0,0 +1,44 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SOCKSDP_H_
+#define _SOCKSDP_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern sonodeops_t sosdp_sonodeops;
+extern sdp_upcalls_t sosdp_sock_upcalls;
+
+extern void sosdp_fini(struct sonode *, struct cred *);
+extern void sosdp_so_inherit(struct sonode *, struct sonode *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SOCKSDP_H_ */
diff --git a/usr/src/uts/common/inet/sockmods/socksdpsubr.c b/usr/src/uts/common/inet/sockmods/socksdpsubr.c
new file mode 100644
index 0000000000..8917878ec5
--- /dev/null
+++ b/usr/src/uts/common/inet/sockmods/socksdpsubr.c
@@ -0,0 +1,60 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <sys/t_lock.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/debug.h>
+#include <sys/errno.h>
+#include <sys/strsubr.h>
+#include <sys/cmn_err.h>
+#include <sys/sysmacros.h>
+
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/strsun.h>
+#include <sys/signal.h>
+
+#include <inet/sdp_itf.h>
+#include "socksdp.h"
+
+/*
+ * Copy the inheritable socket settings from lso to nso: a fixed subset
+ * of the option bits, the send/receive buffer sizes and low-water marks,
+ * and the owning process group.  Option bits outside that subset end up
+ * clear in nso.
+ */
+void
+sosdp_so_inherit(struct sonode *lso, struct sonode *nso)
+{
+	/* Option bits that carry over from lso to nso. */
+	const int inherited = (SO_DEBUG|SO_REUSEADDR|
+	    SO_KEEPALIVE|SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
+	    SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
+
+	nso->so_options = lso->so_options & inherited;
+
+	/* Buffer sizes and their low-water marks. */
+	nso->so_sndbuf = lso->so_sndbuf;
+	nso->so_sndlowat = lso->so_sndlowat;
+	nso->so_rcvbuf = lso->so_rcvbuf;
+	nso->so_rcvlowat = lso->so_rcvlowat;
+
+	/* Process group. */
+	nso->so_pgrp = lso->so_pgrp;
+}
diff --git a/usr/src/uts/common/inet/spdsock.h b/usr/src/uts/common/inet/spdsock.h
index a5f18bd1c4..7622e56a45 100644
--- a/usr/src/uts/common/inet/spdsock.h
+++ b/usr/src/uts/common/inet/spdsock.h
@@ -26,8 +26,6 @@
#ifndef _INET_SPDSOCK_H
#define _INET_SPDSOCK_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/netstack.h>
#ifdef __cplusplus
@@ -112,8 +110,7 @@ extern uint_t spdsock_max_optsize;
extern int spdsock_opt_get(queue_t *, int, int, uchar_t *);
extern int spdsock_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *,
- uint_t *, uchar_t *, void *, cred_t *,
- mblk_t *);
+ uint_t *, uchar_t *, void *, cred_t *, mblk_t *);
#ifdef __cplusplus
}
diff --git a/usr/src/uts/common/inet/squeue.c b/usr/src/uts/common/inet/squeue.c
index 559abd9178..396068a2d9 100644
--- a/usr/src/uts/common/inet/squeue.c
+++ b/usr/src/uts/common/inet/squeue.c
@@ -1240,3 +1240,142 @@ squeue_getprivate(squeue_t *sqp, sqprivate_t p)
return (&sqp->sq_private[p]);
}
+
+/*
+ * Squeue callback installed by squeue_synch_enter() on the mblk it
+ * enqueues.  arg is the conn_t, mp is that mblk (still flagged
+ * MSGWAITSYNC), arg2 is not used.
+ *
+ * Sets SQS_PAUSE, releases the thread waiting in squeue_synch_enter()
+ * by clearing MSGWAITSYNC and broadcasting conn_sq_cv, then blocks on
+ * sq_synch_cv until SQS_PAUSE is cleared again (squeue_synch_exit()
+ * does that).
+ */
+/* ARGSUSED */
+void
+squeue_wakeup_conn(void *arg, mblk_t *mp, void *arg2)
+{
+ conn_t *connp = (conn_t *)arg;
+ squeue_t *sqp = connp->conn_sqp;
+
+ /*
+ * Mark the squeue as paused before waking up the thread stuck
+ * in squeue_synch_enter().
+ */
+ mutex_enter(&sqp->sq_lock);
+ sqp->sq_state |= SQS_PAUSE;
+
+ /*
+ * Notify the thread that it's OK to proceed; that is done by
+ * clearing the MSGWAITSYNC flag. The synch thread will free the mblk.
+ */
+ ASSERT(mp->b_flag & MSGWAITSYNC);
+ mp->b_flag &= ~MSGWAITSYNC;
+ cv_broadcast(&connp->conn_sq_cv);
+
+ /*
+ * We are doing something on behalf of another thread, so we have to
+ * pause and wait until it finishes.
+ */
+ while (sqp->sq_state & SQS_PAUSE) {
+ cv_wait(&sqp->sq_synch_cv, &sqp->sq_lock);
+ }
+ mutex_exit(&sqp->sq_lock);
+}
+
+/*
+ * Synchronously enter squeue sqp on behalf of the conn_t passed in arg.
+ * tag is not used.
+ *
+ * If the squeue is empty and nobody is processing it, the caller becomes
+ * its owner (SQS_PROC set, sq_run = curthread).  Otherwise an mblk
+ * flagged MSGWAITSYNC is enqueued with squeue_wakeup_conn() as its
+ * handler, and the caller sleeps on conn_sq_cv until that flag has been
+ * cleared.
+ *
+ * Returns 0 once entered, or ENOMEM when the mblk cannot be allocated.
+ */
+/* ARGSUSED */
+int
+squeue_synch_enter(squeue_t *sqp, void *arg, uint8_t tag)
+{
+ conn_t *connp = (conn_t *)arg;
+
+ mutex_enter(&sqp->sq_lock);
+ if (sqp->sq_first == NULL && !(sqp->sq_state & SQS_PROC)) {
+ /*
+ * We are OK to proceed if the squeue is empty, and
+ * no one owns the squeue.
+ *
+ * The caller won't own the squeue as this is called from the
+ * application.
+ */
+ ASSERT(sqp->sq_run == NULL);
+
+ sqp->sq_state |= SQS_PROC;
+ sqp->sq_run = curthread;
+ mutex_exit(&sqp->sq_lock);
+
+#if SQUEUE_DEBUG
+ sqp->sq_curmp = NULL;
+ sqp->sq_curproc = NULL;
+ sqp->sq_connp = connp;
+#endif
+ connp->conn_on_sqp = B_TRUE;
+ return (0);
+ } else {
+ mblk_t *mp;
+
+ mp = allocb(0, BPRI_MED);
+ if (mp == NULL) {
+ mutex_exit(&sqp->sq_lock);
+ return (ENOMEM);
+ }
+
+ /*
+ * We mark the mblk as awaiting synchronous squeue access
+ * by setting the MSGWAITSYNC flag. Once squeue_wakeup_conn
+ * fires, MSGWAITSYNC is cleared, at which point we know we
+ * have exclusive access.
+ */
+ mp->b_flag |= MSGWAITSYNC;
+
+ /*
+ * NOTE(review): no matching CONN_DEC_REF is visible in this
+ * function; the reference is presumably dropped by the squeue
+ * once the mblk has been processed -- confirm.
+ */
+ CONN_INC_REF(connp);
+ SET_SQUEUE(mp, squeue_wakeup_conn, connp);
+ ENQUEUE_CHAIN(sqp, mp, mp, 1);
+
+ ASSERT(sqp->sq_run != curthread);
+
+ /* Wait until the enqueued mblk gets processed. */
+ while (mp->b_flag & MSGWAITSYNC)
+ cv_wait(&connp->conn_sq_cv, &sqp->sq_lock);
+ mutex_exit(&sqp->sq_lock);
+
+ freeb(mp);
+
+ return (0);
+ }
+}
+
+/*
+ * Leave a squeue that was entered through squeue_synch_enter().
+ * arg is the conn_t that was passed to squeue_synch_enter().
+ *
+ * There are two cases, depending on how the squeue was entered:
+ *  - the caller is not the squeue owner: clear SQS_PAUSE and signal
+ *    sq_synch_cv so that the owner can resume processing;
+ *  - the caller owns the squeue: give up ownership and, if requests
+ *    are pending, wake the worker thread.
+ */
+/* ARGSUSED */
+void
+squeue_synch_exit(squeue_t *sqp, void *arg)
+{
+	conn_t *connp = (conn_t *)arg;
+
+	mutex_enter(&sqp->sq_lock);
+
+	if (sqp->sq_run != curthread) {
+		/*
+		 * Someone else owns the squeue and is paused on our
+		 * behalf; let it continue.
+		 */
+		ASSERT(sqp->sq_state & SQS_PAUSE);
+		sqp->sq_state &= ~SQS_PAUSE;
+
+		/* Only one thread is expected to block on sq_synch_cv. */
+		cv_signal(&sqp->sq_synch_cv);
+		mutex_exit(&sqp->sq_lock);
+		return;
+	}
+
+	/* We own the squeue: release it. */
+	ASSERT(sqp->sq_state & SQS_PROC);
+
+	sqp->sq_state &= ~SQS_PROC;
+	sqp->sq_run = NULL;
+	connp->conn_on_sqp = B_FALSE;
+
+	if (sqp->sq_first != NULL) {
+		/*
+		 * A normal thread would (most likely) go on to process
+		 * the pending requests.  The operation just completed
+		 * ran synchronously and its thread should not be
+		 * delayed, so wake the worker thread right away to deal
+		 * with the outstanding requests instead.
+		 */
+		sqp->sq_awaken = lbolt;
+		cv_signal(&sqp->sq_worker_cv);
+	}
+	mutex_exit(&sqp->sq_lock);
+}
diff --git a/usr/src/uts/common/inet/tcp.h b/usr/src/uts/common/inet/tcp.h
index 3a557048d6..76d1864d62 100644
--- a/usr/src/uts/common/inet/tcp.h
+++ b/usr/src/uts/common/inet/tcp.h
@@ -35,6 +35,7 @@ extern "C" {
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <sys/socket.h>
+#include <sys/socket_proto.h>
#include <sys/sodirect.h>
#include <sys/multidata.h>
#include <sys/md5.h>
@@ -201,7 +202,6 @@ typedef struct tcp_s {
#define TCP_OFO_FIN_VALID 0x8 /* Has TCP received an out of order FIN? */
- int32_t tcp_xmit_hiwater; /* Send buffer high water mark. */
timeout_id_t tcp_timer_tid; /* Control block for timer service */
uchar_t tcp_timer_backoff; /* Backoff shift count. */
@@ -340,7 +340,10 @@ typedef struct tcp_s {
struct tcp_s *tcp_listener; /* Our listener */
- int32_t tcp_xmit_lowater; /* Send buffer low water mark. */
+ size_t tcp_xmit_hiwater; /* Send buffer high water mark. */
+ size_t tcp_xmit_lowater; /* Send buffer low water mark. */
+ size_t tcp_recv_hiwater; /* Recv high water mark */
+ size_t tcp_recv_lowater; /* Recv low water mark */
uint32_t tcp_irs; /* Initial recv seq num */
uint32_t tcp_fss; /* Final/fin send seq num */
@@ -491,6 +494,7 @@ typedef struct tcp_s {
struct tcp_s *tcp_acceptor_hash; /* Acceptor hash chain */
struct tcp_s **tcp_ptpahn; /* Pointer to previous accept hash next. */
struct tcp_s *tcp_bind_hash; /* Bind hash chain */
+ struct tcp_s *tcp_bind_hash_port; /* tcp_t's bound to the same lport */
struct tcp_s **tcp_ptpbhn;
boolean_t tcp_ire_ill_check_done;
@@ -599,6 +603,15 @@ typedef struct tcp_s {
boolean_t tcp_flow_stopped;
/*
+ * The socket generation number is bumped when an outgoing connection
+ * attempt is made, and it is sent up to the socket when the
+ * connection was successfully established, or an error occurred. The
+ * generation is used to ensure that the socket does not miss the
+ * asynchronous notification.
+ */
+ sock_connid_t tcp_connid;
+
+ /*
* tcp_sodirect is used by tcp on the receive side to push mblk_t(s)
* directly to sockfs. Also, to schedule asynchronous copyout directly
* to a pending user-land uio buffer.
diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c
index 4bb50d2344..ce7d9fb395 100644
--- a/usr/src/uts/common/inet/tcp/tcp.c
+++ b/usr/src/uts/common/inet/tcp/tcp.c
@@ -58,6 +58,7 @@
#include <sys/errno.h>
#include <sys/signal.h>
#include <sys/socket.h>
+#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/isa_defs.h>
#include <sys/md5.h>
@@ -78,7 +79,7 @@
#include <inet/ip_impl.h>
#include <inet/ip6.h>
#include <inet/ip_ndp.h>
-#include <inet/mi.h>
+#include <inet/proto_set.h>
#include <inet/mib2.h>
#include <inet/nd.h>
#include <inet/optcom.h>
@@ -386,11 +387,8 @@ kstat_t *tcp_g_kstat;
* tcp write side.
*/
#define CALL_IP_WPUT(connp, q, mp) { \
- tcp_stack_t *tcps; \
- \
- tcps = connp->conn_netstack->netstack_tcp; \
ASSERT(((q)->q_flag & QREADR) == 0); \
- TCP_DBGSTAT(tcps, tcp_ip_output); \
+ TCP_DBGSTAT(connp->conn_netstack->netstack_tcp, tcp_ip_output); \
connp->conn_send(connp, (mp), (q), IP_WPUT); \
}
@@ -650,6 +648,19 @@ typedef struct tcp_opt_s {
} tcp_opt_t;
/*
+ * TCP option struct passing information b/w listener and eager.
+ */
+struct tcp_options {
+ uint_t to_flags;
+ ssize_t to_boundif; /* IPV6_BOUND_IF */
+ sock_upper_handle_t to_handle;
+};
+
+#define TCPOPT_BOUNDIF 0x00000001 /* set IPV6_BOUND_IF */
+#define TCPOPT_RECVPKTINFO 0x00000002 /* set IPV6_RECVPKTINFO */
+#define TCPOPT_UPPERHANDLE 0x00000004 /* set upper handle */
+
+/*
* RFC1323-recommended phrasing of TSTAMP option, for easier parsing
*/
@@ -742,6 +753,7 @@ void tcp_input(void *arg, mblk_t *mp, void *arg2);
void tcp_rput_data(void *arg, mblk_t *mp, void *arg2);
static void tcp_close_output(void *arg, mblk_t *mp, void *arg2);
void tcp_output(void *arg, mblk_t *mp, void *arg2);
+void tcp_output_urgent(void *arg, mblk_t *mp, void *arg2);
static void tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2);
static void tcp_timer_handler(void *arg, mblk_t *mp, void *arg2);
static void tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2);
@@ -750,7 +762,7 @@ static void tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2);
/* Prototype for TCP functions */
static void tcp_random_init(void);
int tcp_random(void);
-static void tcp_accept(tcp_t *tcp, mblk_t *mp);
+static void tcp_tli_accept(tcp_t *tcp, mblk_t *mp);
static void tcp_accept_swap(tcp_t *listener, tcp_t *acceptor,
tcp_t *eager);
static int tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp);
@@ -761,12 +773,12 @@ static void tcp_closei_local(tcp_t *tcp);
static void tcp_close_detached(tcp_t *tcp);
static boolean_t tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph,
mblk_t *idmp, mblk_t **defermp);
-static void tcp_connect(tcp_t *tcp, mblk_t *mp);
-static void tcp_connect_ipv4(tcp_t *tcp, mblk_t *mp, ipaddr_t *dstaddrp,
- in_port_t dstport, uint_t srcid);
-static void tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp,
+static void tcp_tpi_connect(tcp_t *tcp, mblk_t *mp);
+static int tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp,
+ in_port_t dstport, uint_t srcid, cred_t *cr, pid_t pid);
+static int tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp,
in_port_t dstport, uint32_t flowinfo, uint_t srcid,
- uint32_t scope_id);
+ uint32_t scope_id, cred_t *cr, pid_t pid);
static int tcp_clean_death(tcp_t *tcp, int err, uint8_t tag);
static void tcp_def_q_set(tcp_t *tcp, mblk_t *mp);
static void tcp_disconnect(tcp_t *tcp, mblk_t *mp);
@@ -803,11 +815,9 @@ static int tcp_header_init_ipv6(tcp_t *tcp);
int tcp_init(tcp_t *tcp, queue_t *q);
static int tcp_init_values(tcp_t *tcp);
static mblk_t *tcp_ip_advise_mblk(void *addr, int addr_len, ipic_t **ipic);
-static mblk_t *tcp_ip_bind_mp(tcp_t *tcp, t_scalar_t bind_prim,
- t_scalar_t addr_length);
static void tcp_ip_ire_mark_advice(tcp_t *tcp);
static void tcp_ip_notify(tcp_t *tcp);
-static mblk_t *tcp_ire_mp(mblk_t *mp);
+static mblk_t *tcp_ire_mp(mblk_t **mpp);
static void tcp_iss_init(tcp_t *tcp);
static void tcp_keepalive_killer(void *arg);
static int tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt);
@@ -816,8 +826,8 @@ static int tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp,
int *do_disconnectp, int *t_errorp, int *sys_errorp);
static boolean_t tcp_allow_connopt_set(int level, int name);
int tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr);
-int tcp_opt_get(queue_t *q, int level, int name, uchar_t *ptr);
-int tcp_opt_set(queue_t *q, uint_t optset_context, int level,
+int tcp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr);
+int tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level,
int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
uchar_t *outvalp, void *thisdg_attrs, cred_t *cr,
mblk_t *mblk);
@@ -842,7 +852,8 @@ static void tcp_reinit_values(tcp_t *tcp);
static void tcp_report_item(mblk_t *mp, tcp_t *tcp, int hashval,
tcp_t *thisstream, cred_t *cr);
-static uint_t tcp_rcv_drain(queue_t *q, tcp_t *tcp);
+static uint_t tcp_rwnd_reopen(tcp_t *tcp);
+static uint_t tcp_rcv_drain(tcp_t *tcp);
static void tcp_sack_rxmit(tcp_t *tcp, uint_t *flags);
static boolean_t tcp_send_rst_chk(tcp_stack_t *);
static void tcp_ss_rexmit(tcp_t *tcp);
@@ -868,7 +879,8 @@ static in_port_t tcp_update_next_port(in_port_t port, const tcp_t *tcp,
boolean_t random);
static in_port_t tcp_get_next_priv_port(const tcp_t *);
static void tcp_wput_sock(queue_t *q, mblk_t *mp);
-void tcp_wput_accept(queue_t *q, mblk_t *mp);
+static void tcp_wput_fallback(queue_t *q, mblk_t *mp);
+void tcp_tpi_accept(queue_t *q, mblk_t *mp);
static void tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent);
static void tcp_wput_flush(tcp_t *tcp, mblk_t *mp);
static void tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp);
@@ -901,9 +913,7 @@ static boolean_t tcp_check_policy(tcp_t *, mblk_t *, ipha_t *, ip6_t *,
boolean_t, boolean_t);
static void tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp,
boolean_t ipsec_mctl);
-static mblk_t *tcp_setsockopt_mp(int level, int cmd,
- char *opt, int optlen);
-static int tcp_build_hdrs(queue_t *, tcp_t *);
+static int tcp_build_hdrs(tcp_t *);
static void tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp,
uint32_t seg_seq, uint32_t seg_ack, int seg_len,
tcph_t *tcph);
@@ -943,7 +953,7 @@ static int tcp_squeue_switch(int);
static int tcp_open(queue_t *, dev_t *, int, int, cred_t *, boolean_t);
static int tcp_openv4(queue_t *, dev_t *, int, int, cred_t *);
static int tcp_openv6(queue_t *, dev_t *, int, int, cred_t *);
-static int tcp_close(queue_t *, int);
+static int tcp_tpi_close(queue_t *, int);
static int tcpclose_accept(queue_t *);
static void tcp_squeue_add(squeue_t *);
@@ -958,6 +968,19 @@ extern void tcp_kssl_input(tcp_t *, mblk_t *);
void tcp_eager_kill(void *arg, mblk_t *mp, void *arg2);
void tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2);
+static int tcp_accept(sock_lower_handle_t, sock_lower_handle_t,
+ sock_upper_handle_t, cred_t *);
+static int tcp_listen(sock_lower_handle_t, int, cred_t *);
+static int tcp_post_ip_bind(tcp_t *, mblk_t *, int);
+static int tcp_do_listen(conn_t *, int, cred_t *);
+static int tcp_do_connect(conn_t *, const struct sockaddr *, socklen_t,
+ cred_t *, pid_t);
+static int tcp_do_bind(conn_t *, struct sockaddr *, socklen_t, cred_t *,
+ boolean_t);
+static int tcp_do_unbind(conn_t *);
+static int tcp_bind_check(conn_t *, struct sockaddr *, socklen_t, cred_t *,
+ boolean_t);
+
/*
* Routines related to the TCP_IOC_ABORT_CONN ioctl command.
*
@@ -1001,11 +1024,11 @@ static struct module_info tcp_winfo = {
* We have separate open functions for the /dev/tcp and /dev/tcp6 devices.
*/
struct qinit tcp_rinitv4 = {
- NULL, (pfi_t)tcp_rsrv, tcp_openv4, tcp_close, NULL, &tcp_rinfo
+ NULL, (pfi_t)tcp_rsrv, tcp_openv4, tcp_tpi_close, NULL, &tcp_rinfo
};
struct qinit tcp_rinitv6 = {
- NULL, (pfi_t)tcp_rsrv, tcp_openv6, tcp_close, NULL, &tcp_rinfo
+ NULL, (pfi_t)tcp_rsrv, tcp_openv6, tcp_tpi_close, NULL, &tcp_rinfo
};
struct qinit tcp_winit = {
@@ -1017,6 +1040,11 @@ struct qinit tcp_sock_winit = {
(pfi_t)tcp_wput_sock, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo
};
+/* TCP entry point during fallback */
+struct qinit tcp_fallback_sock_winit = {
+ (pfi_t)tcp_wput_fallback, NULL, NULL, NULL, NULL, &tcp_winfo
+};
+
/*
* Entry points for TCP as a acceptor STREAM opened by sockfs when doing
* an accept. Avoid allocating data structures since eager has already
@@ -1027,7 +1055,7 @@ struct qinit tcp_acceptor_rinit = {
};
struct qinit tcp_acceptor_winit = {
- (pfi_t)tcp_wput_accept, NULL, NULL, NULL, NULL, &tcp_winfo
+ (pfi_t)tcp_tpi_accept, NULL, NULL, NULL, NULL, &tcp_winfo
};
/*
@@ -1036,7 +1064,7 @@ struct qinit tcp_acceptor_winit = {
* have a separate one for tcp_openv6.
*/
struct qinit tcp_loopback_rinit = {
- (pfi_t)0, (pfi_t)tcp_rsrv, tcp_openv4, tcp_close, (pfi_t)0,
+ (pfi_t)0, (pfi_t)tcp_rsrv, tcp_openv4, tcp_tpi_close, (pfi_t)0,
&tcp_rinfo, NULL, tcp_fuse_rrw, tcp_fuse_rinfop, STRUIOT_STANDARD
};
@@ -1050,6 +1078,8 @@ struct streamtab tcpinfov6 = {
&tcp_rinitv6, &tcp_winit
};
+sock_downcalls_t sock_tcp_downcalls;
+
/*
* Have to ensure that tcp_g_q_close is not done by an
* interrupt thread.
@@ -1907,6 +1937,7 @@ tcp_time_wait_collector(void *arg)
CALLOUT_FLAG_ROUNDUP);
mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
}
+
/*
* Reply to a clients T_CONN_RES TPI message. This function
* is used only for TLI/XTI listener. Sockfs sends T_CONN_RES
@@ -1914,7 +1945,7 @@ tcp_time_wait_collector(void *arg)
* Read the block comment on top of tcp_conn_request().
*/
static void
-tcp_accept(tcp_t *listener, mblk_t *mp)
+tcp_tli_accept(tcp_t *listener, mblk_t *mp)
{
tcp_t *acceptor;
tcp_t *eager;
@@ -1923,6 +1954,7 @@ tcp_accept(tcp_t *listener, mblk_t *mp)
t_uscalar_t acceptor_id;
t_scalar_t seqnum;
mblk_t *opt_mp = NULL; /* T_OPTMGMT_REQ messages */
+ struct tcp_options *tcpopt;
mblk_t *ok_mp;
mblk_t *mp1;
tcp_stack_t *tcps = listener->tcp_tcps;
@@ -2070,7 +2102,8 @@ tcp_accept(tcp_t *listener, mblk_t *mp)
ASSERT(eager->tcp_connp->conn_ref >= 1);
/* Pre allocate the stroptions mblk also */
- opt_mp = allocb(sizeof (struct stroptions), BPRI_HI);
+ opt_mp = allocb(MAX(sizeof (struct tcp_options),
+ sizeof (struct T_conn_res)), BPRI_HI);
if (opt_mp == NULL) {
CONN_DEC_REF(acceptor->tcp_connp);
CONN_DEC_REF(eager->tcp_connp);
@@ -2078,29 +2111,20 @@ tcp_accept(tcp_t *listener, mblk_t *mp)
return;
}
DB_TYPE(opt_mp) = M_SETOPTS;
- opt_mp->b_wptr += sizeof (struct stroptions);
+ opt_mp->b_wptr += sizeof (struct tcp_options);
+ tcpopt = (struct tcp_options *)opt_mp->b_rptr;
+ tcpopt->to_flags = 0;
/*
* Prepare for inheriting IPV6_BOUND_IF and IPV6_RECVPKTINFO
- * from listener to acceptor. The message is chained on opt_mp
- * which will be sent onto eager's squeue.
+ * from listener to acceptor.
*/
if (listener->tcp_bound_if != 0) {
- /* allocate optmgmt req */
- mp1 = tcp_setsockopt_mp(IPPROTO_IPV6,
- IPV6_BOUND_IF, (char *)&listener->tcp_bound_if,
- sizeof (int));
- if (mp1 != NULL)
- linkb(opt_mp, mp1);
+ tcpopt->to_flags |= TCPOPT_BOUNDIF;
+ tcpopt->to_boundif = listener->tcp_bound_if;
}
if (listener->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) {
- uint_t on = 1;
-
- /* allocate optmgmt req */
- mp1 = tcp_setsockopt_mp(IPPROTO_IPV6,
- IPV6_RECVPKTINFO, (char *)&on, sizeof (on));
- if (mp1 != NULL)
- linkb(opt_mp, mp1);
+ tcpopt->to_flags |= TCPOPT_RECVPKTINFO;
}
/* Re-use mp1 to hold a copy of mp, in case reallocb fails */
@@ -2341,6 +2365,7 @@ tcp_accept(tcp_t *listener, mblk_t *mp)
finish:
ASSERT(acceptor->tcp_detached);
ASSERT(tcps->tcps_g_q != NULL);
+ ASSERT(!IPCL_IS_NONSTR(acceptor->tcp_connp));
acceptor->tcp_rq = tcps->tcps_g_q;
acceptor->tcp_wq = WR(tcps->tcps_g_q);
(void) tcp_clean_death(acceptor, 0, 2);
@@ -2995,39 +3020,24 @@ error:
return (0);
}
-/*
- * tcp_bind is called (holding the writer lock) by tcp_wput_proto to process a
- * O_T_BIND_REQ/T_BIND_REQ message.
- */
static void
-tcp_bind(tcp_t *tcp, mblk_t *mp)
+tcp_tpi_bind(tcp_t *tcp, mblk_t *mp)
{
+ int error;
+ conn_t *connp = tcp->tcp_connp;
+ struct sockaddr *sa;
+ mblk_t *mp1;
+ struct T_bind_req *tbr;
+ int backlog;
+ socklen_t len;
sin_t *sin;
sin6_t *sin6;
- mblk_t *mp1;
- in_port_t requested_port;
- in_port_t allocated_port;
- struct T_bind_req *tbr;
- boolean_t bind_to_req_port_only;
- boolean_t backlog_update = B_FALSE;
- boolean_t user_specified;
- in6_addr_t v6addr;
- ipaddr_t v4addr;
- uint_t origipversion;
- int err;
- queue_t *q = tcp->tcp_wq;
- conn_t *connp = tcp->tcp_connp;
- mlp_type_t addrtype, mlptype;
- zone_t *zone;
- cred_t *cr;
- in_port_t mlp_port;
- tcp_stack_t *tcps = tcp->tcp_tcps;
ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
if (tcp->tcp_debug) {
(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
- "tcp_bind: bad req, len %u",
+ "tcp_tpi_bind: bad req, len %u",
(uint_t)(mp->b_wptr - mp->b_rptr));
}
tcp_err_ack(tcp, mp, TPROTO, 0);
@@ -3041,442 +3051,80 @@ tcp_bind(tcp_t *tcp, mblk_t *mp)
}
mp = mp1;
tbr = (struct T_bind_req *)mp->b_rptr;
- if (tcp->tcp_state >= TCPS_BOUND) {
- if ((tcp->tcp_state == TCPS_BOUND ||
- tcp->tcp_state == TCPS_LISTEN) &&
- tcp->tcp_conn_req_max != tbr->CONIND_number &&
- tbr->CONIND_number > 0) {
- /*
- * Handle listen() increasing CONIND_number.
- * This is more "liberal" then what the TPI spec
- * requires but is needed to avoid a t_unbind
- * when handling listen() since the port number
- * might be "stolen" between the unbind and bind.
- */
- backlog_update = B_TRUE;
- goto do_bind;
- }
- if (tcp->tcp_debug) {
- (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
- "tcp_bind: bad state, %d", tcp->tcp_state);
- }
- tcp_err_ack(tcp, mp, TOUTSTATE, 0);
- return;
- }
- origipversion = tcp->tcp_ipversion;
- switch (tbr->ADDR_length) {
- case 0: /* request for a generic port */
+ backlog = tbr->CONIND_number;
+ len = tbr->ADDR_length;
+
+ switch (len) {
+ case 0: /* request for a generic port */
tbr->ADDR_offset = sizeof (struct T_bind_req);
if (tcp->tcp_family == AF_INET) {
tbr->ADDR_length = sizeof (sin_t);
sin = (sin_t *)&tbr[1];
*sin = sin_null;
sin->sin_family = AF_INET;
+ sa = (struct sockaddr *)sin;
+ len = sizeof (sin_t);
mp->b_wptr = (uchar_t *)&sin[1];
- tcp->tcp_ipversion = IPV4_VERSION;
- IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &v6addr);
} else {
ASSERT(tcp->tcp_family == AF_INET6);
tbr->ADDR_length = sizeof (sin6_t);
sin6 = (sin6_t *)&tbr[1];
*sin6 = sin6_null;
sin6->sin6_family = AF_INET6;
+ sa = (struct sockaddr *)sin6;
+ len = sizeof (sin6_t);
mp->b_wptr = (uchar_t *)&sin6[1];
- tcp->tcp_ipversion = IPV6_VERSION;
- V6_SET_ZERO(v6addr);
}
- requested_port = 0;
break;
- case sizeof (sin_t): /* Complete IPv4 address */
- sin = (sin_t *)mi_offset_param(mp, tbr->ADDR_offset,
+ case sizeof (sin_t): /* Complete IPv4 address */
+ sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
sizeof (sin_t));
- if (sin == NULL || !OK_32PTR((char *)sin)) {
- if (tcp->tcp_debug) {
- (void) strlog(TCP_MOD_ID, 0, 1,
- SL_ERROR|SL_TRACE,
- "tcp_bind: bad address parameter, "
- "offset %d, len %d",
- tbr->ADDR_offset, tbr->ADDR_length);
- }
- tcp_err_ack(tcp, mp, TPROTO, 0);
- return;
- }
- /*
- * With sockets sockfs will accept bogus sin_family in
- * bind() and replace it with the family used in the socket
- * call.
- */
- if (sin->sin_family != AF_INET ||
- tcp->tcp_family != AF_INET) {
- tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT);
- return;
- }
- requested_port = ntohs(sin->sin_port);
- tcp->tcp_ipversion = IPV4_VERSION;
- v4addr = sin->sin_addr.s_addr;
- IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr);
break;
case sizeof (sin6_t): /* Complete IPv6 address */
- sin6 = (sin6_t *)mi_offset_param(mp,
+ sa = (struct sockaddr *)mi_offset_param(mp,
tbr->ADDR_offset, sizeof (sin6_t));
- if (sin6 == NULL || !OK_32PTR((char *)sin6)) {
- if (tcp->tcp_debug) {
- (void) strlog(TCP_MOD_ID, 0, 1,
- SL_ERROR|SL_TRACE,
- "tcp_bind: bad IPv6 address parameter, "
- "offset %d, len %d", tbr->ADDR_offset,
- tbr->ADDR_length);
- }
- tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
- return;
- }
- if (sin6->sin6_family != AF_INET6 ||
- tcp->tcp_family != AF_INET6) {
- tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT);
- return;
- }
- requested_port = ntohs(sin6->sin6_port);
- tcp->tcp_ipversion = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) ?
- IPV4_VERSION : IPV6_VERSION;
- v6addr = sin6->sin6_addr;
break;
default:
if (tcp->tcp_debug) {
(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
- "tcp_bind: bad address length, %d",
+ "tcp_tpi_bind: bad address length, %d",
tbr->ADDR_length);
}
tcp_err_ack(tcp, mp, TBADADDR, 0);
return;
}
- tcp->tcp_bound_source_v6 = v6addr;
-
- /* Check for change in ipversion */
- if (origipversion != tcp->tcp_ipversion) {
- ASSERT(tcp->tcp_family == AF_INET6);
- err = tcp->tcp_ipversion == IPV6_VERSION ?
- tcp_header_init_ipv6(tcp) : tcp_header_init_ipv4(tcp);
- if (err) {
- tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
- return;
- }
- }
-
- /*
- * Initialize family specific fields. Copy of the src addr.
- * in tcp_t is needed for the lookup funcs.
- */
- if (tcp->tcp_ipversion == IPV6_VERSION) {
- tcp->tcp_ip6h->ip6_src = v6addr;
- } else {
- IN6_V4MAPPED_TO_IPADDR(&v6addr, tcp->tcp_ipha->ipha_src);
- }
- tcp->tcp_ip_src_v6 = v6addr;
- /*
- * For O_T_BIND_REQ:
- * Verify that the target port/addr is available, or choose
- * another.
- * For T_BIND_REQ:
- * Verify that the target port/addr is available or fail.
- * In both cases when it succeeds the tcp is inserted in the
- * bind hash table. This ensures that the operation is atomic
- * under the lock on the hash bucket.
- */
- bind_to_req_port_only = requested_port != 0 &&
- tbr->PRIM_type != O_T_BIND_REQ;
- /*
- * Get a valid port (within the anonymous range and should not
- * be a privileged one) to use if the user has not given a port.
- * If multiple threads are here, they may all start with
- * with the same initial port. But, it should be fine as long as
- * tcp_bindi will ensure that no two threads will be assigned
- * the same port.
- *
- * NOTE: XXX If a privileged process asks for an anonymous port, we
- * still check for ports only in the range > tcp_smallest_non_priv_port,
- * unless TCP_ANONPRIVBIND option is set.
- */
- mlptype = mlptSingle;
- mlp_port = requested_port;
- if (requested_port == 0) {
- requested_port = tcp->tcp_anon_priv_bind ?
- tcp_get_next_priv_port(tcp) :
- tcp_update_next_port(tcps->tcps_next_port_to_try,
- tcp, B_TRUE);
- if (requested_port == 0) {
- tcp_err_ack(tcp, mp, TNOADDR, 0);
- return;
- }
- user_specified = B_FALSE;
-
- /*
- * If the user went through one of the RPC interfaces to create
- * this socket and RPC is MLP in this zone, then give him an
- * anonymous MLP.
- */
- cr = DB_CREDDEF(mp, tcp->tcp_cred);
- if (connp->conn_anon_mlp && is_system_labeled()) {
- zone = crgetzone(cr);
- addrtype = tsol_mlp_addr_type(zone->zone_id,
- IPV6_VERSION, &v6addr,
- tcps->tcps_netstack->netstack_ip);
- if (addrtype == mlptSingle) {
- tcp_err_ack(tcp, mp, TNOADDR, 0);
- return;
- }
- mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
- PMAPPORT, addrtype);
- mlp_port = PMAPPORT;
- }
- } else {
- int i;
- boolean_t priv = B_FALSE;
-
- /*
- * If the requested_port is in the well-known privileged range,
- * verify that the stream was opened by a privileged user.
- * Note: No locks are held when inspecting tcp_g_*epriv_ports
- * but instead the code relies on:
- * - the fact that the address of the array and its size never
- * changes
- * - the atomic assignment of the elements of the array
- */
- cr = DB_CREDDEF(mp, tcp->tcp_cred);
- if (requested_port < tcps->tcps_smallest_nonpriv_port) {
- priv = B_TRUE;
- } else {
- for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
- if (requested_port ==
- tcps->tcps_g_epriv_ports[i]) {
- priv = B_TRUE;
- break;
- }
- }
- }
- if (priv) {
- if (secpolicy_net_privaddr(cr, requested_port,
- IPPROTO_TCP) != 0) {
- if (tcp->tcp_debug) {
- (void) strlog(TCP_MOD_ID, 0, 1,
- SL_ERROR|SL_TRACE,
- "tcp_bind: no priv for port %d",
- requested_port);
- }
- tcp_err_ack(tcp, mp, TACCES, 0);
- return;
- }
- }
- user_specified = B_TRUE;
-
- if (is_system_labeled()) {
- zone = crgetzone(cr);
- addrtype = tsol_mlp_addr_type(zone->zone_id,
- IPV6_VERSION, &v6addr,
- tcps->tcps_netstack->netstack_ip);
- if (addrtype == mlptSingle) {
- tcp_err_ack(tcp, mp, TNOADDR, 0);
- return;
- }
- mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
- requested_port, addrtype);
- }
- }
-
- if (mlptype != mlptSingle) {
- if (secpolicy_net_bindmlp(cr) != 0) {
- if (tcp->tcp_debug) {
- (void) strlog(TCP_MOD_ID, 0, 1,
- SL_ERROR|SL_TRACE,
- "tcp_bind: no priv for multilevel port %d",
- requested_port);
- }
- tcp_err_ack(tcp, mp, TACCES, 0);
- return;
- }
-
- /*
- * If we're specifically binding a shared IP address and the
- * port is MLP on shared addresses, then check to see if this
- * zone actually owns the MLP. Reject if not.
- */
- if (mlptype == mlptShared && addrtype == mlptShared) {
- /*
- * No need to handle exclusive-stack zones since
- * ALL_ZONES only applies to the shared stack.
- */
- zoneid_t mlpzone;
-
- mlpzone = tsol_mlp_findzone(IPPROTO_TCP,
- htons(mlp_port));
- if (connp->conn_zoneid != mlpzone) {
- if (tcp->tcp_debug) {
- (void) strlog(TCP_MOD_ID, 0, 1,
- SL_ERROR|SL_TRACE,
- "tcp_bind: attempt to bind port "
- "%d on shared addr in zone %d "
- "(should be %d)",
- mlp_port, connp->conn_zoneid,
- mlpzone);
- }
- tcp_err_ack(tcp, mp, TACCES, 0);
- return;
- }
- }
-
- if (!user_specified) {
- err = tsol_mlp_anon(zone, mlptype, connp->conn_ulp,
- requested_port, B_TRUE);
- if (err != 0) {
- if (tcp->tcp_debug) {
- (void) strlog(TCP_MOD_ID, 0, 1,
- SL_ERROR|SL_TRACE,
- "tcp_bind: cannot establish anon "
- "MLP for port %d",
- requested_port);
- }
- tcp_err_ack(tcp, mp, TSYSERR, err);
- return;
- }
- connp->conn_anon_port = B_TRUE;
- }
- connp->conn_mlp_type = mlptype;
- }
-
- allocated_port = tcp_bindi(tcp, requested_port, &v6addr,
- tcp->tcp_reuseaddr, B_FALSE, bind_to_req_port_only, user_specified);
-
- if (allocated_port == 0) {
- connp->conn_mlp_type = mlptSingle;
- if (connp->conn_anon_port) {
- connp->conn_anon_port = B_FALSE;
- (void) tsol_mlp_anon(zone, mlptype, connp->conn_ulp,
- requested_port, B_FALSE);
- }
- if (bind_to_req_port_only) {
- if (tcp->tcp_debug) {
- (void) strlog(TCP_MOD_ID, 0, 1,
- SL_ERROR|SL_TRACE,
- "tcp_bind: requested addr busy");
- }
- tcp_err_ack(tcp, mp, TADDRBUSY, 0);
- } else {
- /* If we are out of ports, fail the bind. */
- if (tcp->tcp_debug) {
- (void) strlog(TCP_MOD_ID, 0, 1,
- SL_ERROR|SL_TRACE,
- "tcp_bind: out of ports?");
- }
- tcp_err_ack(tcp, mp, TNOADDR, 0);
- }
- return;
- }
- ASSERT(tcp->tcp_state == TCPS_BOUND);
-do_bind:
- if (!backlog_update) {
- if (tcp->tcp_family == AF_INET)
- sin->sin_port = htons(allocated_port);
- else
- sin6->sin6_port = htons(allocated_port);
- }
- if (tcp->tcp_family == AF_INET) {
- if (tbr->CONIND_number != 0) {
- mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type,
- sizeof (sin_t));
- } else {
- /* Just verify the local IP address */
- mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type, IP_ADDR_LEN);
- }
- } else {
- if (tbr->CONIND_number != 0) {
- mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type,
- sizeof (sin6_t));
+ error = tcp_bind_check(connp, sa, len, DB_CRED(mp),
+ tbr->PRIM_type != O_T_BIND_REQ);
+ if (error == 0) {
+ if (tcp->tcp_family == AF_INET) {
+ sin = (sin_t *)sa;
+ sin->sin_port = tcp->tcp_lport;
} else {
- /* Just verify the local IP address */
- mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type,
- IPV6_ADDR_LEN);
- }
- }
- if (mp1 == NULL) {
- if (connp->conn_anon_port) {
- connp->conn_anon_port = B_FALSE;
- (void) tsol_mlp_anon(zone, mlptype, connp->conn_ulp,
- requested_port, B_FALSE);
+ sin6 = (sin6_t *)sa;
+ sin6->sin6_port = tcp->tcp_lport;
}
- connp->conn_mlp_type = mlptSingle;
- tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
- return;
- }
-
- tbr->PRIM_type = T_BIND_ACK;
- mp->b_datap->db_type = M_PCPROTO;
- /* Chain in the reply mp for tcp_rput() */
- mp1->b_cont = mp;
- mp = mp1;
-
- tcp->tcp_conn_req_max = tbr->CONIND_number;
- if (tcp->tcp_conn_req_max) {
- if (tcp->tcp_conn_req_max < tcps->tcps_conn_req_min)
- tcp->tcp_conn_req_max = tcps->tcps_conn_req_min;
- if (tcp->tcp_conn_req_max > tcps->tcps_conn_req_max_q)
- tcp->tcp_conn_req_max = tcps->tcps_conn_req_max_q;
- /*
- * If this is a listener, do not reset the eager list
- * and other stuffs. Note that we don't check if the
- * existing eager list meets the new tcp_conn_req_max
- * requirement.
- */
- if (tcp->tcp_state != TCPS_LISTEN) {
- tcp->tcp_state = TCPS_LISTEN;
- /* Initialize the chain. Don't need the eager_lock */
- tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp;
- tcp->tcp_eager_next_drop_q0 = tcp;
- tcp->tcp_eager_prev_drop_q0 = tcp;
- tcp->tcp_second_ctimer_threshold =
- tcps->tcps_ip_abort_linterval;
+ if (backlog > 0) {
+ error = tcp_do_listen(connp, backlog, DB_CRED(mp));
}
}
-
- /*
- * We can call ip_bind directly which returns a T_BIND_ACK mp. The
- * processing continues in tcp_rput_other().
- *
- * We need to make sure that the conn_recv is set to a non-null
- * value before we insert the conn into the classifier table.
- * This is to avoid a race with an incoming packet which does an
- * ipcl_classify().
- */
- connp->conn_recv = tcp_conn_request;
- if (tcp->tcp_family == AF_INET6) {
- ASSERT(tcp->tcp_connp->conn_af_isv6);
- mp = ip_bind_v6(q, mp, tcp->tcp_connp, &tcp->tcp_sticky_ipp);
- } else {
- ASSERT(!tcp->tcp_connp->conn_af_isv6);
- mp = ip_bind_v4(q, mp, tcp->tcp_connp);
- }
- /*
- * If the bind cannot complete immediately
- * IP will arrange to call tcp_rput_other
- * when the bind completes.
- */
- if (mp != NULL) {
- tcp_rput_other(tcp, mp);
+done:
+ if (error > 0) {
+ tcp_err_ack(tcp, mp, TSYSERR, error);
+ } else if (error < 0) {
+ tcp_err_ack(tcp, mp, -error, 0);
} else {
- /*
- * Bind will be resumed later. Need to ensure
- * that conn doesn't disappear when that happens.
- * This will be decremented in ip_resume_tcp_bind().
- */
- CONN_INC_REF(tcp->tcp_connp);
+ mp->b_datap->db_type = M_PCPROTO;
+ tbr->PRIM_type = T_BIND_ACK;
+ putnext(tcp->tcp_rq, mp);
}
}
-
/*
* If the "bind_to_req_port_only" parameter is set and the requested port
* number is available, return it; if not, return 0.
@@ -3560,12 +3208,14 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
mutex_enter(&tbf->tf_lock);
for (ltcp = tbf->tf_tcp; ltcp != NULL;
ltcp = ltcp->tcp_bind_hash) {
+ if (lport == ltcp->tcp_lport)
+ break;
+ }
+
+ for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) {
boolean_t not_socket;
boolean_t exclbind;
- if (lport != ltcp->tcp_lport)
- continue;
-
lconnp = ltcp->tcp_connp;
/*
@@ -3817,6 +3467,7 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag)
{
mblk_t *mp;
queue_t *q;
+ conn_t *connp = tcp->tcp_connp;
tcp_stack_t *tcps = tcp->tcp_tcps;
sodirect_t *sodp;
@@ -3857,7 +3508,7 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag)
*/
tcp_closei_local(tcp);
if (!tcp->tcp_tconnind_started) {
- CONN_DEC_REF(tcp->tcp_connp);
+ CONN_DEC_REF(connp);
} else {
tcp->tcp_state = TCPS_BOUND;
}
@@ -3879,7 +3530,10 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag)
q = tcp->tcp_rq;
/* Trash all inbound data */
- flushq(q, FLUSHALL);
+ if (!IPCL_IS_NONSTR(connp)) {
+ ASSERT(q != NULL);
+ flushq(q, FLUSHALL);
+ }
/*
* If we are at least part way open and there is error
@@ -3900,16 +3554,22 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag)
(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
"tcp_clean_death: discon err %d", err);
}
- mp = mi_tpi_discon_ind(NULL, err, 0);
- if (mp != NULL) {
- putnext(q, mp);
+ if (IPCL_IS_NONSTR(connp)) {
+ /* Direct socket, use upcall */
+ (*connp->conn_upcalls->su_disconnected)(
+ connp->conn_upper_handle, tcp->tcp_connid, err);
} else {
- if (tcp->tcp_debug) {
- (void) strlog(TCP_MOD_ID, 0, 1,
- SL_ERROR|SL_TRACE,
- "tcp_clean_death, sending M_ERROR");
+ mp = mi_tpi_discon_ind(NULL, err, 0);
+ if (mp != NULL) {
+ putnext(q, mp);
+ } else {
+ if (tcp->tcp_debug) {
+ (void) strlog(TCP_MOD_ID, 0, 1,
+ SL_ERROR|SL_TRACE,
+ "tcp_clean_death, sending M_ERROR");
+ }
+ (void) putnextctl1(q, M_ERROR, EPROTO);
}
- (void) putnextctl1(q, M_ERROR, EPROTO);
}
if (tcp->tcp_state <= TCPS_SYN_RCVD) {
/* SYN_SENT or SYN_RCVD */
@@ -3921,6 +3581,9 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag)
}
tcp_reinit(tcp);
+ if (IPCL_IS_NONSTR(connp))
+ (void) tcp_do_unbind(connp);
+
return (-1);
}
@@ -3954,7 +3617,6 @@ tcp_stop_lingering(tcp_t *tcp)
*/
tcp_timers_stop(tcp);
-
tcp->tcp_detached = B_TRUE;
ASSERT(tcps->tcps_g_q != NULL);
tcp->tcp_rq = tcps->tcps_g_q;
@@ -3984,8 +3646,10 @@ finish:
mutex_enter(&tcp->tcp_closelock);
tcp->tcp_detached = B_TRUE;
ASSERT(tcps->tcps_g_q != NULL);
+
tcp->tcp_rq = tcps->tcps_g_q;
tcp->tcp_wq = WR(tcps->tcps_g_q);
+
tcp->tcp_closed = 1;
cv_signal(&tcp->tcp_closecv);
mutex_exit(&tcp->tcp_closelock);
@@ -4005,21 +3669,17 @@ tcp_close_linger_timeout(void *arg)
tcp_stop_lingering(tcp);
}
-static int
-tcp_close(queue_t *q, int flags)
+static void
+tcp_close_common(conn_t *connp, int flags)
{
- conn_t *connp = Q_TO_CONN(q);
tcp_t *tcp = connp->conn_tcp;
mblk_t *mp = &tcp->tcp_closemp;
boolean_t conn_ioctl_cleanup_reqd = B_FALSE;
mblk_t *bp;
- ASSERT(WR(q)->q_next == NULL);
ASSERT(connp->conn_ref >= 2);
/*
- * We are being closed as /dev/tcp or /dev/tcp6.
- *
* Mark the conn as closing. ill_pending_mp_add will not
* add any mp to the pending mp list, after this conn has
* started closing. Same for sq_pending_mp_add
@@ -4106,11 +3766,35 @@ tcp_close(queue_t *q, int flags)
if (conn_ioctl_cleanup_reqd)
conn_ioctl_cleanup(connp);
+ tcp->tcp_cpid = -1;
+}
+
+/*
+ * STREAMS close entry point for TCP; it is installed as the close routine
+ * in the read-side qinit structures (e.g. tcp_loopback_rinit).
+ *
+ * If SO_FALLBACK is set in flags, the stream is being closed while in
+ * fallback: the minor number is freed and the queue procedures are turned
+ * off, nothing else.  Otherwise the conn is shut down through
+ * tcp_close_common(), the queue procedures are turned off, the minor
+ * number is returned to the conn's minor arena and a reference on the
+ * conn is dropped.  In both cases q_ptr is cleared on both queues.
+ *
+ * Always returns 0.
+ */
+static int
+tcp_tpi_close(queue_t *q, int flags)
+{
+ conn_t *connp;
+
+ ASSERT(WR(q)->q_next == NULL);
+
+ if (flags & SO_FALLBACK) {
+ /*
+ * The stream is being closed while in fallback; simply free
+ * the resources that were allocated.  Judging by the
+ * normal-path inet_minor_free() call below, the write-side
+ * q_ptr is used here as the minor arena and the read-side
+ * q_ptr as the device number -- TODO confirm against the
+ * fallback code that sets them.
+ */
+ inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr));
+ qprocsoff(q);
+ goto done;
+ }
+
+ connp = Q_TO_CONN(q);
+ /*
+ * We are being closed as /dev/tcp or /dev/tcp6.
+ */
+ tcp_close_common(connp, flags);
+
qprocsoff(q);
inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
- tcp->tcp_cpid = -1;
-
/*
* Drop IP's reference on the conn. This is the last reference
* on the connp if the state was less than established. If the
@@ -4124,6 +3808,7 @@ tcp_close(queue_t *q, int flags)
* packets in squeue for the timewait state.
*/
CONN_DEC_REF(connp);
+done:
q->q_ptr = WR(q)->q_ptr = NULL;
return (0);
}
@@ -4615,11 +4300,13 @@ tcp_free(tcp_t *tcp)
}
if (tcp->tcp_fused_sigurg_mp != NULL) {
+ ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp));
freeb(tcp->tcp_fused_sigurg_mp);
tcp->tcp_fused_sigurg_mp = NULL;
}
if (tcp->tcp_ordrel_mp != NULL) {
+ ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp));
freeb(tcp->tcp_ordrel_mp);
tcp->tcp_ordrel_mp = NULL;
}
@@ -4761,10 +4448,19 @@ tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, mblk_t *idmp,
DB_CPID(mp) = DB_CPID(idmp);
}
- if (defermp == NULL)
- putnext(tcp->tcp_rq, mp);
- else
+ if (defermp == NULL) {
+ conn_t *connp = tcp->tcp_connp;
+ if (IPCL_IS_NONSTR(connp)) {
+ (*connp->conn_upcalls->su_connected)
+ (connp->conn_upper_handle, tcp->tcp_connid, cr,
+ DB_CPID(mp));
+ freemsg(mp);
+ } else {
+ putnext(tcp->tcp_rq, mp);
+ }
+ } else {
*defermp = mp;
+ }
if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
@@ -4946,10 +4642,13 @@ tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
/* Inherit information from the "parent" */
tcp->tcp_ipversion = ltcp->tcp_ipversion;
tcp->tcp_family = ltcp->tcp_family;
+
tcp->tcp_wq = ltcp->tcp_wq;
tcp->tcp_rq = ltcp->tcp_rq;
+
tcp->tcp_mss = tcps->tcps_mss_def_ipv6;
tcp->tcp_detached = B_TRUE;
+ SOCK_CONNID_INIT(tcp->tcp_connid);
if ((err = tcp_init_values(tcp)) != 0) {
freemsg(tpi_mp);
return (err);
@@ -5100,6 +4799,12 @@ tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
tcp->tcp_kssl_pending = B_TRUE;
}
+ /* Inherit the listener's non-STREAMS flag and upcalls */
+ if (IPCL_IS_NONSTR(lconnp)) {
+ connp->conn_flags |= IPCL_NONSTR;
+ connp->conn_upcalls = lconnp->conn_upcalls;
+ }
+
return (0);
}
@@ -5159,6 +4864,7 @@ tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha,
tcp->tcp_rq = ltcp->tcp_rq;
tcp->tcp_mss = tcps->tcps_mss_def_ipv4;
tcp->tcp_detached = B_TRUE;
+ SOCK_CONNID_INIT(tcp->tcp_connid);
if ((err = tcp_init_values(tcp)) != 0) {
freemsg(tpi_mp);
return (err);
@@ -5219,6 +4925,12 @@ tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha,
tcp->tcp_kssl_pending = B_TRUE;
}
+ /* Inherit the listener's non-STREAMS flag and upcalls */
+ if (IPCL_IS_NONSTR(lconnp)) {
+ connp->conn_flags |= IPCL_NONSTR;
+ connp->conn_upcalls = lconnp->conn_upcalls;
+ }
+
return (0);
}
@@ -5474,7 +5186,7 @@ tcp_update_label(tcp_t *tcp, const cred_t *cr)
if (tsol_update_sticky(&tcp->tcp_sticky_ipp,
&tcp->tcp_label_len, optbuf) != 0)
return (B_FALSE);
- if (tcp_build_hdrs(tcp->tcp_rq, tcp) != 0)
+ if (tcp_build_hdrs(tcp) != 0)
return (B_FALSE);
}
@@ -5732,12 +5444,13 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2)
eager = econnp->conn_tcp;
/*
- * Pre-allocate the T_ordrel_ind mblk so that at close time, we
- * will always have that to send up. Otherwise, we need to do
+ * Pre-allocate the T_ordrel_ind mblk for TPI socket so that at close
+ * time, we will always have that to send up. Otherwise, we need to do
* special handling in case the allocation fails at that time.
*/
ASSERT(eager->tcp_ordrel_mp == NULL);
- if ((eager->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL)
+ if (!IPCL_IS_NONSTR(econnp) &&
+ (eager->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL)
goto error3;
/* Inherit various TCP parameters from the listener */
@@ -5839,7 +5552,7 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2)
* There should be no ire in the mp as we are being called after
* receiving the SYN.
*/
- ASSERT(tcp_ire_mp(mp) == NULL);
+ ASSERT(tcp_ire_mp(&mp) == NULL);
/*
* Adapt our mss, ttl, ... according to information provided in IRE.
@@ -5871,7 +5584,7 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2)
* we should not inherit receive window size from listener.
*/
eager->tcp_rwnd = MSS_ROUNDUP(
- (eager->tcp_rwnd == 0 ? tcp->tcp_rq->q_hiwat :
+ (eager->tcp_rwnd == 0 ? tcp->tcp_recv_hiwater:
eager->tcp_rwnd), eager->tcp_mss);
if (eager->tcp_snd_ws_ok)
tcp_set_ws_value(eager);
@@ -5899,6 +5612,8 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2)
*
*/
/* Set the TCP options */
+ eager->tcp_recv_hiwater = tcp->tcp_recv_hiwater;
+ eager->tcp_recv_lowater = tcp->tcp_recv_lowater;
eager->tcp_xmit_hiwater = tcp->tcp_xmit_hiwater;
eager->tcp_dgram_errind = tcp->tcp_dgram_errind;
eager->tcp_oobinline = tcp->tcp_oobinline;
@@ -5906,6 +5621,7 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2)
eager->tcp_broadcast = tcp->tcp_broadcast;
eager->tcp_useloopback = tcp->tcp_useloopback;
eager->tcp_dontroute = tcp->tcp_dontroute;
+ eager->tcp_debug = tcp->tcp_debug;
eager->tcp_linger = tcp->tcp_linger;
eager->tcp_lingertime = tcp->tcp_lingertime;
if (tcp->tcp_ka_enabled)
@@ -5979,6 +5695,7 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2)
goto error;
}
DB_CPID(mp1) = tcp->tcp_cpid;
+ mblk_setcred(mp1, tcp->tcp_cred);
eager->tcp_cpid = tcp->tcp_cpid;
eager->tcp_open_time = lbolt64;
@@ -6168,9 +5885,9 @@ done:
* Successful connect request processing begins when our client passes
* a T_CONN_REQ message into tcp_wput() and ends when tcp_rput() passes
* our T_OK_ACK reply message upstream. The control flow looks like this:
- * upstream -> tcp_wput() -> tcp_wput_proto() -> tcp_connect() -> IP
- * upstream <- tcp_rput() <- IP
- * After various error checks are completed, tcp_connect() lays
+ * upstream -> tcp_wput() -> tcp_wput_proto() -> tcp_tpi_connect() -> IP
+ * upstream <- tcp_rput() <- IP
+ * After various error checks are completed, tcp_tpi_connect() lays
* the target address and port into the composite header template,
* preallocates the T_OK_ACK reply message, construct a full 12 byte bind
* request followed by an IRE request, and passes the three mblk message
@@ -6185,15 +5902,14 @@ done:
* above.
*/
static void
-tcp_connect(tcp_t *tcp, mblk_t *mp)
+tcp_tpi_connect(tcp_t *tcp, mblk_t *mp)
{
sin_t *sin;
- sin6_t *sin6;
queue_t *q = tcp->tcp_wq;
struct T_conn_req *tcr;
- ipaddr_t *dstaddrp;
- in_port_t dstport;
- uint_t srcid;
+ struct sockaddr *sa;
+ socklen_t len;
+ int error;
tcr = (struct T_conn_req *)mp->b_rptr;
@@ -6287,46 +6003,24 @@ tcp_connect(tcp_t *tcp, mblk_t *mp)
/* FALLTHRU */
case sizeof (sin_t):
- sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset,
+ sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
sizeof (sin_t));
- if (sin == NULL || !OK_32PTR((char *)sin)) {
- tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
- return;
- }
- if (tcp->tcp_family != AF_INET ||
- sin->sin_family != AF_INET) {
- tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT);
- return;
- }
- if (sin->sin_port == 0) {
- tcp_err_ack(tcp, mp, TBADADDR, 0);
- return;
- }
- if (tcp->tcp_connp && tcp->tcp_connp->conn_ipv6_v6only) {
- tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT);
- return;
- }
-
+ len = sizeof (sin_t);
break;
case sizeof (sin6_t):
- sin6 = (sin6_t *)mi_offset_param(mp, tcr->DEST_offset,
+ sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
sizeof (sin6_t));
- if (sin6 == NULL || !OK_32PTR((char *)sin6)) {
- tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
- return;
- }
- if (tcp->tcp_family != AF_INET6 ||
- sin6->sin6_family != AF_INET6) {
- tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT);
- return;
- }
- if (sin6->sin6_port == 0) {
- tcp_err_ack(tcp, mp, TBADADDR, 0);
- return;
- }
+ len = sizeof (sin6_t);
break;
}
+
+ error = proto_verify_ip_addr(tcp->tcp_family, sa, len);
+ if (error != 0) {
+ tcp_err_ack(tcp, mp, TSYSERR, error);
+ return;
+ }
+
/*
* TODO: If someone in TCPS_TIME_WAIT has this dst/port we
* should key on their sequence number and cut them loose.
@@ -6394,80 +6088,17 @@ tcp_connect(tcp_t *tcp, mblk_t *mp)
}
}
- /*
- * If we're connecting to an IPv4-mapped IPv6 address, we need to
- * make sure that the template IP header in the tcp structure is an
- * IPv4 header, and that the tcp_ipversion is IPV4_VERSION. We
- * need to this before we call tcp_bindi() so that the port lookup
- * code will look for ports in the correct port space (IPv4 and
- * IPv6 have separate port spaces).
- */
- if (tcp->tcp_family == AF_INET6 && tcp->tcp_ipversion == IPV6_VERSION &&
- IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
- int err = 0;
-
- err = tcp_header_init_ipv4(tcp);
- if (err != 0) {
- mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM);
- goto connect_failed;
- }
- if (tcp->tcp_lport != 0)
- *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport;
- }
-
- if (tcp->tcp_issocket) {
- /*
- * TCP is _D_SODIRECT and sockfs is directly above so save
- * the shared sonode sodirect_t pointer (if any) to enable
- * TCP sodirect.
- */
- tcp->tcp_sodirect = SOD_QTOSODP(tcp->tcp_rq);
+ /* call the non-TPI version */
+ error = tcp_do_connect(tcp->tcp_connp, sa, len, DB_CRED(mp),
+ DB_CPID(mp));
+ if (error < 0) {
+ mp = mi_tpi_err_ack_alloc(mp, -error, 0);
+ } else if (error > 0) {
+ mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error);
+ } else {
+ mp = mi_tpi_ok_ack_alloc(mp);
}
- switch (tcp->tcp_state) {
- case TCPS_IDLE:
- /*
- * We support quick connect, refer to comments in
- * tcp_connect_*()
- */
- /* FALLTHRU */
- case TCPS_BOUND:
- case TCPS_LISTEN:
- if (tcp->tcp_family == AF_INET6) {
- if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
- tcp_connect_ipv6(tcp, mp,
- &sin6->sin6_addr,
- sin6->sin6_port, sin6->sin6_flowinfo,
- sin6->__sin6_src_id, sin6->sin6_scope_id);
- return;
- }
- /*
- * Destination adress is mapped IPv6 address.
- * Source bound address should be unspecified or
- * IPv6 mapped address as well.
- */
- if (!IN6_IS_ADDR_UNSPECIFIED(
- &tcp->tcp_bound_source_v6) &&
- !IN6_IS_ADDR_V4MAPPED(&tcp->tcp_bound_source_v6)) {
- mp = mi_tpi_err_ack_alloc(mp, TSYSERR,
- EADDRNOTAVAIL);
- break;
- }
- dstaddrp = &V4_PART_OF_V6((sin6->sin6_addr));
- dstport = sin6->sin6_port;
- srcid = sin6->__sin6_src_id;
- } else {
- dstaddrp = &sin->sin_addr.s_addr;
- dstport = sin->sin_port;
- srcid = 0;
- }
-
- tcp_connect_ipv4(tcp, mp, dstaddrp, dstport, srcid);
- return;
- default:
- mp = mi_tpi_err_ack_alloc(mp, TOUTSTATE, 0);
- break;
- }
/*
* Note: Code below is the "failure" case
*/
@@ -6479,23 +6110,22 @@ connect_failed:
tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
TSYSERR, ENOMEM);
}
- if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
- tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
}
/*
* Handle connect to IPv4 destinations, including connections for AF_INET6
* sockets connecting to IPv4 mapped IPv6 destinations.
*/
-static void
-tcp_connect_ipv4(tcp_t *tcp, mblk_t *mp, ipaddr_t *dstaddrp, in_port_t dstport,
- uint_t srcid)
+static int
+tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, in_port_t dstport,
+ uint_t srcid, cred_t *cr, pid_t pid)
{
tcph_t *tcph;
- mblk_t *mp1;
+ mblk_t *mp;
ipaddr_t dstaddr = *dstaddrp;
int32_t oldstate;
uint16_t lport;
+ int error = 0;
tcp_stack_t *tcps = tcp->tcp_tcps;
ASSERT(tcp->tcp_ipversion == IPV4_VERSION);
@@ -6538,7 +6168,7 @@ tcp_connect_ipv4(tcp_t *tcp, mblk_t *mp, ipaddr_t *dstaddrp, in_port_t dstport,
*/
if (dstaddr == tcp->tcp_ipha->ipha_src &&
dstport == tcp->tcp_lport) {
- mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0);
+ error = -TBADADDR;
goto failed;
}
@@ -6583,91 +6213,77 @@ tcp_connect_ipv4(tcp_t *tcp, mblk_t *mp, ipaddr_t *dstaddrp, in_port_t dstport,
lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE,
B_FALSE, B_FALSE);
if (lport == 0) {
- mp = mi_tpi_err_ack_alloc(mp, TNOADDR, 0);
+ error = -TNOADDR;
goto failed;
}
}
tcp->tcp_state = TCPS_SYN_SENT;
- /*
- * TODO: allow data with connect requests
- * by unlinking M_DATA trailers here and
- * linking them in behind the T_OK_ACK mblk.
- * The tcp_rput() bind ack handler would then
- * feed them to tcp_wput_data() rather than call
- * tcp_timer().
- */
- mp = mi_tpi_ok_ack_alloc(mp);
- if (!mp) {
+ mp = allocb(sizeof (ire_t), BPRI_HI);
+ if (mp == NULL) {
tcp->tcp_state = oldstate;
+ error = ENOMEM;
goto failed;
}
- if (tcp->tcp_family == AF_INET) {
- mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ,
- sizeof (ipa_conn_t));
- } else {
- mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ,
- sizeof (ipa6_conn_t));
+ mp->b_wptr += sizeof (ire_t);
+ mp->b_datap->db_type = IRE_DB_REQ_TYPE;
+ tcp->tcp_hard_binding = 1;
+ if (cr == NULL) {
+ cr = tcp->tcp_cred;
+ pid = tcp->tcp_cpid;
}
- if (mp1) {
- /*
- * We need to make sure that the conn_recv is set to a non-null
- * value before we insert the conn_t into the classifier table.
- * This is to avoid a race with an incoming packet which does
- * an ipcl_classify().
- */
- tcp->tcp_connp->conn_recv = tcp_input;
+ mblk_setcred(mp, cr);
+ DB_CPID(mp) = pid;
- /* Hang onto the T_OK_ACK for later. */
- linkb(mp1, mp);
- mblk_setcred(mp1, tcp->tcp_cred);
- if (tcp->tcp_family == AF_INET)
- mp1 = ip_bind_v4(tcp->tcp_wq, mp1, tcp->tcp_connp);
- else {
- mp1 = ip_bind_v6(tcp->tcp_wq, mp1, tcp->tcp_connp,
- &tcp->tcp_sticky_ipp);
+ /*
+ * We need to make sure that the conn_recv is set to a non-null
+ * value before we insert the conn_t into the classifier table.
+ * This is to avoid a race with an incoming packet which does
+ * an ipcl_classify().
+ */
+ tcp->tcp_connp->conn_recv = tcp_input;
+
+ if (tcp->tcp_family == AF_INET) {
+ error = ip_proto_bind_connected_v4(tcp->tcp_connp, &mp,
+ IPPROTO_TCP, &tcp->tcp_ipha->ipha_src, tcp->tcp_lport,
+ tcp->tcp_remote, tcp->tcp_fport, B_TRUE, B_TRUE);
+ } else {
+ in6_addr_t v6src;
+ if (tcp->tcp_ipversion == IPV4_VERSION) {
+ IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, &v6src);
+ } else {
+ v6src = tcp->tcp_ip6h->ip6_src;
}
- BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens);
- tcp->tcp_active_open = 1;
- /*
- * If the bind cannot complete immediately
- * IP will arrange to call tcp_rput_other
- * when the bind completes.
- */
- if (mp1 != NULL)
- tcp_rput_other(tcp, mp1);
- return;
+ error = ip_proto_bind_connected_v6(tcp->tcp_connp, &mp,
+ IPPROTO_TCP, &v6src, tcp->tcp_lport, &tcp->tcp_remote_v6,
+ &tcp->tcp_sticky_ipp, tcp->tcp_fport, B_TRUE, B_TRUE);
}
- /* Error case */
- tcp->tcp_state = oldstate;
- mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM);
+ BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens);
+ tcp->tcp_active_open = 1;
+ return (tcp_post_ip_bind(tcp, mp, error));
failed:
/* return error ack and blow away saved option results if any */
- if (mp != NULL)
- putnext(tcp->tcp_rq, mp);
- else {
- tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
- TSYSERR, ENOMEM);
- }
if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
-
+ return (error);
}
/*
* Handle connect to IPv6 destinations.
*/
-static void
-tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp,
- in_port_t dstport, uint32_t flowinfo, uint_t srcid, uint32_t scope_id)
+static int
+tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp, in_port_t dstport,
+ uint32_t flowinfo, uint_t srcid, uint32_t scope_id, cred_t *cr, pid_t pid)
{
tcph_t *tcph;
- mblk_t *mp1;
+ mblk_t *mp;
ip6_rthdr_t *rth;
int32_t oldstate;
uint16_t lport;
tcp_stack_t *tcps = tcp->tcp_tcps;
+ int error = 0;
+ conn_t *connp = tcp->tcp_connp;
ASSERT(tcp->tcp_family == AF_INET6);
@@ -6678,8 +6294,7 @@ tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp,
* IPv4-mapped IPv6 address.
*/
if (tcp->tcp_ipversion != IPV6_VERSION) {
- mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0);
- goto failed;
+ return (-TBADADDR);
}
/*
@@ -6694,7 +6309,7 @@ tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp,
/* Handle __sin6_src_id if socket not bound to an IP address */
if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip6h->ip6_src)) {
ip_srcid_find_id(srcid, &tcp->tcp_ip6h->ip6_src,
- tcp->tcp_connp->conn_zoneid, tcps->tcps_netstack);
+ connp->conn_zoneid, tcps->tcps_netstack);
tcp->tcp_ip_src_v6 = tcp->tcp_ip6h->ip6_src;
}
@@ -6724,7 +6339,7 @@ tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp,
ipp->ipp_fields |= IPPF_SCOPE_ID;
if (ipp->ipp_fields & IPPF_HAS_IP6I)
ip2dbg(("tcp_connect_v6: SCOPE_ID set\n"));
- reterr = tcp_build_hdrs(tcp->tcp_rq, tcp);
+ reterr = tcp_build_hdrs(tcp);
if (reterr != 0)
goto failed;
ip1dbg(("tcp_connect_ipv6: tcp_bld_hdrs returned\n"));
@@ -6741,7 +6356,7 @@ tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp,
*/
if (IN6_ARE_ADDR_EQUAL(dstaddrp, &tcp->tcp_ip6h->ip6_src) &&
(dstport == tcp->tcp_lport)) {
- mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0);
+ error = -TBADADDR;
goto failed;
}
@@ -6751,7 +6366,6 @@ tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp,
(IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
(flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
-
/*
* Massage a routing header (if present) putting the first hop
* in ip6_dst. Compute a starting value for the checksum which
@@ -6791,26 +6405,26 @@ tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp,
lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE,
B_FALSE, B_FALSE);
if (lport == 0) {
- mp = mi_tpi_err_ack_alloc(mp, TNOADDR, 0);
+ error = -TNOADDR;
goto failed;
}
}
tcp->tcp_state = TCPS_SYN_SENT;
- /*
- * TODO: allow data with connect requests
- * by unlinking M_DATA trailers here and
- * linking them in behind the T_OK_ACK mblk.
- * The tcp_rput() bind ack handler would then
- * feed them to tcp_wput_data() rather than call
- * tcp_timer().
- */
- mp = mi_tpi_ok_ack_alloc(mp);
- if (!mp) {
- tcp->tcp_state = oldstate;
- goto failed;
- }
- mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ, sizeof (ipa6_conn_t));
- if (mp1) {
+
+ mp = allocb(sizeof (ire_t), BPRI_HI);
+ if (mp != NULL) {
+ in6_addr_t v6src;
+
+ mp->b_wptr += sizeof (ire_t);
+ mp->b_datap->db_type = IRE_DB_REQ_TYPE;
+ if (cr == NULL) {
+ cr = tcp->tcp_cred;
+ pid = tcp->tcp_cpid;
+ }
+ mblk_setcred(mp, cr);
+ DB_CPID(mp) = pid;
+ tcp->tcp_hard_binding = 1;
+
/*
* We need to make sure that the conn_recv is set to a non-null
* value before we insert the conn_t into the classifier table.
@@ -6819,32 +6433,28 @@ tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp,
*/
tcp->tcp_connp->conn_recv = tcp_input;
- /* Hang onto the T_OK_ACK for later. */
- linkb(mp1, mp);
- mblk_setcred(mp1, tcp->tcp_cred);
- mp1 = ip_bind_v6(tcp->tcp_wq, mp1, tcp->tcp_connp,
- &tcp->tcp_sticky_ipp);
+ if (tcp->tcp_ipversion == IPV4_VERSION) {
+ IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, &v6src);
+ } else {
+ v6src = tcp->tcp_ip6h->ip6_src;
+ }
+ error = ip_proto_bind_connected_v6(connp, &mp, IPPROTO_TCP,
+ &v6src, tcp->tcp_lport, &tcp->tcp_remote_v6,
+ &tcp->tcp_sticky_ipp, tcp->tcp_fport, B_TRUE, B_TRUE);
BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens);
tcp->tcp_active_open = 1;
- /* ip_bind_v6() may return ACK or ERROR */
- if (mp1 != NULL)
- tcp_rput_other(tcp, mp1);
- return;
+
+ return (tcp_post_ip_bind(tcp, mp, error));
}
/* Error case */
tcp->tcp_state = oldstate;
- mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM);
+ error = ENOMEM;
failed:
/* return error ack and blow away saved option results if any */
- if (mp != NULL)
- putnext(tcp->tcp_rq, mp);
- else {
- tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
- TSYSERR, ENOMEM);
- }
if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
+ return (error);
}
/*
@@ -6870,72 +6480,61 @@ tcp_def_q_set(tcp_t *tcp, mblk_t *mp)
mutex_exit(&tcps->tcps_g_q_lock);
iocp->ioc_error = EALREADY;
} else {
- mblk_t *mp1;
+ int error = 0;
+ conn_t *connp = tcp->tcp_connp;
+ ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
- mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ, 0);
- if (mp1 == NULL) {
- mutex_exit(&tcps->tcps_g_q_lock);
- iocp->ioc_error = ENOMEM;
- } else {
- tcps->tcps_g_q = tcp->tcp_rq;
- mutex_exit(&tcps->tcps_g_q_lock);
- iocp->ioc_error = 0;
- iocp->ioc_rval = 0;
- /*
- * We are passing tcp_sticky_ipp as NULL
- * as it is not useful for tcp_default queue
- *
- * Set conn_recv just in case.
- */
- tcp->tcp_connp->conn_recv = tcp_conn_request;
+ tcps->tcps_g_q = tcp->tcp_rq;
+ mutex_exit(&tcps->tcps_g_q_lock);
+ iocp->ioc_error = 0;
+ iocp->ioc_rval = 0;
+ /*
+ * We are passing tcp_sticky_ipp as NULL
+ * as it is not useful for tcp_default queue
+ *
+ * Set conn_recv just in case.
+ */
+ tcp->tcp_connp->conn_recv = tcp_conn_request;
- mp1 = ip_bind_v6(q, mp1, tcp->tcp_connp, NULL);
- if (mp1 != NULL)
- tcp_rput_other(tcp, mp1);
+ ASSERT(connp->conn_af_isv6);
+ connp->conn_ulp = IPPROTO_TCP;
+
+ if (ipst->ips_ipcl_proto_fanout_v6[IPPROTO_TCP].connf_head !=
+ NULL || connp->conn_mac_exempt) {
+ error = -TBADADDR;
+ } else {
+ connp->conn_srcv6 = ipv6_all_zeros;
+ ipcl_proto_insert_v6(connp, IPPROTO_TCP);
}
+
+ (void) tcp_post_ip_bind(tcp, NULL, error);
}
qreply(q, mp);
}
-/*
- * Our client hereby directs us to reject the connection request
- * that tcp_conn_request() marked with 'seqnum'. Rejection consists
- * of sending the appropriate RST, not an ICMP error.
- */
-static void
-tcp_disconnect(tcp_t *tcp, mblk_t *mp)
+static int
+tcp_disconnect_common(tcp_t *tcp, t_scalar_t seqnum)
{
tcp_t *ltcp = NULL;
- t_scalar_t seqnum;
conn_t *connp;
tcp_stack_t *tcps = tcp->tcp_tcps;
- ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
- if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_discon_req)) {
- tcp_err_ack(tcp, mp, TPROTO, 0);
- return;
- }
-
/*
* Right now, upper modules pass down a T_DISCON_REQ to TCP,
* when the stream is in BOUND state. Do not send a reset,
* since the destination IP address is not valid, and it can
* be the initialized value of all zeros (broadcast address).
*
- * If TCP has sent down a bind request to IP and has not
- * received the reply, reject the request. Otherwise, TCP
- * will be confused.
+ * XXX There won't be any pending bind request to IP.
*/
- if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_hard_binding) {
+ if (tcp->tcp_state <= TCPS_BOUND) {
if (tcp->tcp_debug) {
(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
"tcp_disconnect: bad state, %d", tcp->tcp_state);
}
- tcp_err_ack(tcp, mp, TOUTSTATE, 0);
- return;
+ return (TOUTSTATE);
}
- seqnum = ((struct T_discon_req *)mp->b_rptr)->SEQ_number;
if (seqnum == -1 || tcp->tcp_conn_req_max == 0) {
@@ -7009,25 +6608,42 @@ tcp_disconnect(tcp_t *tcp, mblk_t *mp)
tcp_reinit(tcp);
- if (old_state >= TCPS_ESTABLISHED) {
+ return (0);
+ } else if (!tcp_eager_blowoff(tcp, seqnum)) {
+ return (TBADSEQ);
+ }
+ return (0);
+}
+
+/*
+ * Our client hereby directs us to reject the connection request
+ * that tcp_conn_request() marked with 'seqnum'. Rejection consists
+ * of sending the appropriate RST, not an ICMP error.
+ */
+static void
+tcp_disconnect(tcp_t *tcp, mblk_t *mp)
+{
+ t_scalar_t seqnum;
+ int error;
+
+ ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
+ if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_discon_req)) {
+ tcp_err_ack(tcp, mp, TPROTO, 0);
+ return;
+ }
+ seqnum = ((struct T_discon_req *)mp->b_rptr)->SEQ_number;
+ error = tcp_disconnect_common(tcp, seqnum);
+ if (error != 0)
+ tcp_err_ack(tcp, mp, error, 0);
+ else {
+ if (tcp->tcp_state >= TCPS_ESTABLISHED) {
/* Send M_FLUSH according to TPI */
(void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW);
}
mp = mi_tpi_ok_ack_alloc(mp);
if (mp)
putnext(tcp->tcp_rq, mp);
- return;
- } else if (!tcp_eager_blowoff(tcp, seqnum)) {
- tcp_err_ack(tcp, mp, TBADSEQ, 0);
- return;
}
- if (tcp->tcp_state >= TCPS_ESTABLISHED) {
- /* Send M_FLUSH according to TPI */
- (void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW);
- }
- mp = mi_tpi_ok_ack_alloc(mp);
- if (mp)
- putnext(tcp->tcp_rq, mp);
}
/*
@@ -7566,6 +7182,24 @@ tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp)
/* TODO: Default ETSDU is 1. Is that correct for tcp? */
}
+/*
+ * Fill in a T_capability_ack for this tcp_t.
+ *
+ * cap_bits1 is the set of capabilities being requested. tcap->CAP_bits1
+ * is cleared first and then has a bit set for each capability that was
+ * both requested and filled in here:
+ *   TC1_INFO        - tcap->INFO_ack is populated by tcp_copy_info()
+ *   TC1_ACCEPTOR_ID - tcap->ACCEPTOR_id is set to tcp->tcp_acceptor_id
+ * Any other bits in cap_bits1 are ignored.
+ */
+static void
+tcp_do_capability_ack(tcp_t *tcp, struct T_capability_ack *tcap,
+ t_uscalar_t cap_bits1)
+{
+ tcap->CAP_bits1 = 0;
+
+ if (cap_bits1 & TC1_INFO) {
+ tcp_copy_info(&tcap->INFO_ack, tcp);
+ tcap->CAP_bits1 |= TC1_INFO;
+ }
+
+ if (cap_bits1 & TC1_ACCEPTOR_ID) {
+ tcap->ACCEPTOR_id = tcp->tcp_acceptor_id;
+ tcap->CAP_bits1 |= TC1_ACCEPTOR_ID;
+ }
+
+}
+
/*
* This routine responds to T_CAPABILITY_REQ messages. It is called by
* tcp_wput. Much of the T_CAPABILITY_ACK information is copied from
@@ -7591,17 +7225,7 @@ tcp_capability_req(tcp_t *tcp, mblk_t *mp)
return;
tcap = (struct T_capability_ack *)mp->b_rptr;
- tcap->CAP_bits1 = 0;
-
- if (cap_bits1 & TC1_INFO) {
- tcp_copy_info(&tcap->INFO_ack, tcp);
- tcap->CAP_bits1 |= TC1_INFO;
- }
-
- if (cap_bits1 & TC1_ACCEPTOR_ID) {
- tcap->ACCEPTOR_id = tcp->tcp_acceptor_id;
- tcap->CAP_bits1 |= TC1_ACCEPTOR_ID;
- }
+ tcp_do_capability_ack(tcp, tcap, cap_bits1);
putnext(tcp->tcp_rq, mp);
}
@@ -7822,10 +7446,12 @@ tcp_reinit(tcp_t *tcp)
tcp->tcp_urp_mark_mp = NULL;
}
if (tcp->tcp_fused_sigurg_mp != NULL) {
+ ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp));
freeb(tcp->tcp_fused_sigurg_mp);
tcp->tcp_fused_sigurg_mp = NULL;
}
if (tcp->tcp_ordrel_mp != NULL) {
+ ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp));
freeb(tcp->tcp_ordrel_mp);
tcp->tcp_ordrel_mp = NULL;
}
@@ -7925,7 +7551,10 @@ tcp_reinit(tcp_t *tcp)
tcp->tcp_ip_src_v6 = tcp->tcp_bound_source_v6;
ASSERT(tcp->tcp_ptpbhn != NULL);
- tcp->tcp_rq->q_hiwat = tcps->tcps_recv_hiwat;
+ if (!IPCL_IS_NONSTR(tcp->tcp_connp))
+ tcp->tcp_rq->q_hiwat = tcps->tcps_recv_hiwat;
+ tcp->tcp_recv_hiwater = tcps->tcps_recv_hiwat;
+ tcp->tcp_recv_lowater = tcp_rinfo.mi_lowat;
tcp->tcp_rwnd = tcps->tcps_recv_hiwat;
tcp->tcp_mss = tcp->tcp_ipversion != IPV4_VERSION ?
tcps->tcps_mss_def_ipv6 : tcps->tcps_mss_def_ipv4;
@@ -7952,6 +7581,7 @@ tcp_reinit_values(tcp)
#define PRESERVE(x) ((x) = (x))
#endif /* lint */
+ PRESERVE(tcp->tcp_bind_hash_port);
PRESERVE(tcp->tcp_bind_hash);
PRESERVE(tcp->tcp_ptpbhn);
PRESERVE(tcp->tcp_acceptor_hash);
@@ -8239,6 +7869,8 @@ tcp_reinit_values(tcp)
DONTCARE(tcp->tcmp_stk[0]);
#endif
+ PRESERVE(tcp->tcp_connid);
+
#undef DONTCARE
#undef PRESERVE
@@ -9072,156 +8704,6 @@ noticmpv6:
}
/*
- * IP recognizes seven kinds of bind requests:
- *
- * - A zero-length address binds only to the protocol number.
- *
- * - A 4-byte address is treated as a request to
- * validate that the address is a valid local IPv4
- * address, appropriate for an application to bind to.
- * IP does the verification, but does not make any note
- * of the address at this time.
- *
- * - A 16-byte address contains is treated as a request
- * to validate a local IPv6 address, as the 4-byte
- * address case above.
- *
- * - A 16-byte sockaddr_in to validate the local IPv4 address and also
- * use it for the inbound fanout of packets.
- *
- * - A 24-byte sockaddr_in6 to validate the local IPv6 address and also
- * use it for the inbound fanout of packets.
- *
- * - A 12-byte address (ipa_conn_t) containing complete IPv4 fanout
- * information consisting of local and remote addresses
- * and ports. In this case, the addresses are both
- * validated as appropriate for this operation, and, if
- * so, the information is retained for use in the
- * inbound fanout.
- *
- * - A 36-byte address address (ipa6_conn_t) containing complete IPv6
- * fanout information, like the 12-byte case above.
- *
- * IP will also fill in the IRE request mblk with information
- * regarding our peer. In all cases, we notify IP of our protocol
- * type by appending a single protocol byte to the bind request.
- */
-static mblk_t *
-tcp_ip_bind_mp(tcp_t *tcp, t_scalar_t bind_prim, t_scalar_t addr_length)
-{
- char *cp;
- mblk_t *mp;
- struct T_bind_req *tbr;
- ipa_conn_t *ac;
- ipa6_conn_t *ac6;
- sin_t *sin;
- sin6_t *sin6;
-
- ASSERT(bind_prim == O_T_BIND_REQ || bind_prim == T_BIND_REQ);
- ASSERT((tcp->tcp_family == AF_INET &&
- tcp->tcp_ipversion == IPV4_VERSION) ||
- (tcp->tcp_family == AF_INET6 &&
- (tcp->tcp_ipversion == IPV4_VERSION ||
- tcp->tcp_ipversion == IPV6_VERSION)));
-
- mp = allocb(sizeof (*tbr) + addr_length + 1, BPRI_HI);
- if (!mp)
- return (mp);
- mp->b_datap->db_type = M_PROTO;
- tbr = (struct T_bind_req *)mp->b_rptr;
- tbr->PRIM_type = bind_prim;
- tbr->ADDR_offset = sizeof (*tbr);
- tbr->CONIND_number = 0;
- tbr->ADDR_length = addr_length;
- cp = (char *)&tbr[1];
- switch (addr_length) {
- case sizeof (ipa_conn_t):
- ASSERT(tcp->tcp_family == AF_INET);
- ASSERT(tcp->tcp_ipversion == IPV4_VERSION);
-
- mp->b_cont = allocb(sizeof (ire_t), BPRI_HI);
- if (mp->b_cont == NULL) {
- freemsg(mp);
- return (NULL);
- }
- mp->b_cont->b_wptr += sizeof (ire_t);
- mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE;
-
- /* cp known to be 32 bit aligned */
- ac = (ipa_conn_t *)cp;
- ac->ac_laddr = tcp->tcp_ipha->ipha_src;
- ac->ac_faddr = tcp->tcp_remote;
- ac->ac_fport = tcp->tcp_fport;
- ac->ac_lport = tcp->tcp_lport;
- tcp->tcp_hard_binding = 1;
- break;
-
- case sizeof (ipa6_conn_t):
- ASSERT(tcp->tcp_family == AF_INET6);
-
- mp->b_cont = allocb(sizeof (ire_t), BPRI_HI);
- if (mp->b_cont == NULL) {
- freemsg(mp);
- return (NULL);
- }
- mp->b_cont->b_wptr += sizeof (ire_t);
- mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE;
-
- /* cp known to be 32 bit aligned */
- ac6 = (ipa6_conn_t *)cp;
- if (tcp->tcp_ipversion == IPV4_VERSION) {
- IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src,
- &ac6->ac6_laddr);
- } else {
- ac6->ac6_laddr = tcp->tcp_ip6h->ip6_src;
- }
- ac6->ac6_faddr = tcp->tcp_remote_v6;
- ac6->ac6_fport = tcp->tcp_fport;
- ac6->ac6_lport = tcp->tcp_lport;
- tcp->tcp_hard_binding = 1;
- break;
-
- case sizeof (sin_t):
- /*
- * NOTE: IPV6_ADDR_LEN also has same size.
- * Use family to discriminate.
- */
- if (tcp->tcp_family == AF_INET) {
- sin = (sin_t *)cp;
-
- *sin = sin_null;
- sin->sin_family = AF_INET;
- sin->sin_addr.s_addr = tcp->tcp_bound_source;
- sin->sin_port = tcp->tcp_lport;
- break;
- } else {
- *(in6_addr_t *)cp = tcp->tcp_bound_source_v6;
- }
- break;
-
- case sizeof (sin6_t):
- ASSERT(tcp->tcp_family == AF_INET6);
- sin6 = (sin6_t *)cp;
-
- *sin6 = sin6_null;
- sin6->sin6_family = AF_INET6;
- sin6->sin6_addr = tcp->tcp_bound_source_v6;
- sin6->sin6_port = tcp->tcp_lport;
- break;
-
- case IP_ADDR_LEN:
- ASSERT(tcp->tcp_ipversion == IPV4_VERSION);
- *(uint32_t *)cp = tcp->tcp_ipha->ipha_src;
- break;
-
- }
- /* Add protocol number to end */
- cp[addr_length] = (char)IPPROTO_TCP;
- mp->b_wptr = (uchar_t *)&cp[addr_length + 1];
- return (mp);
-}
-
-/*
* Notify IP that we are having trouble with this connection. IP should
* blow the IRE away and start over.
*/
@@ -9268,25 +8750,29 @@ tcp_ip_notify(tcp_t *tcp)
/* Unlink and return any mblk that looks like it contains an ire */
static mblk_t *
-tcp_ire_mp(mblk_t *mp)
+tcp_ire_mp(mblk_t **mpp)
{
- mblk_t *prev_mp;
+ mblk_t *mp = *mpp;
+ mblk_t *prev_mp = NULL;
for (;;) {
- prev_mp = mp;
- mp = mp->b_cont;
- if (mp == NULL)
- break;
switch (DB_TYPE(mp)) {
case IRE_DB_TYPE:
case IRE_DB_REQ_TYPE:
- if (prev_mp != NULL)
+ if (mp == *mpp) {
+ *mpp = mp->b_cont;
+ } else {
prev_mp->b_cont = mp->b_cont;
+ }
mp->b_cont = NULL;
return (mp);
default:
break;
}
+ prev_mp = mp;
+ mp = mp->b_cont;
+ if (mp == NULL)
+ break;
}
return (mp);
}
@@ -9408,10 +8894,10 @@ tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk)
queue_t *q = tcp->tcp_rq;
int32_t mss = tcp->tcp_mss;
int maxpsz;
+ conn_t *connp = tcp->tcp_connp;
if (TCP_IS_DETACHED(tcp))
return (mss);
-
if (tcp->tcp_fused) {
maxpsz = tcp_fuse_maxpsz_set(tcp);
mss = INFPSZ;
@@ -9435,6 +8921,7 @@ tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk)
* head to break down larger than SMSS writes into SMSS-
* size mblks, up to tcp_maxpsz_multiplier mblks at a time.
*/
+ /* XXX tune this with ndd tcp_maxpsz_multiplier */
maxpsz = tcp->tcp_maxpsz * mss;
if (maxpsz > tcp->tcp_xmit_hiwater/2) {
maxpsz = tcp->tcp_xmit_hiwater/2;
@@ -9442,12 +8929,15 @@ tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk)
maxpsz = MSS_ROUNDUP(maxpsz, mss);
}
}
- (void) setmaxps(q, maxpsz);
- tcp->tcp_wq->q_maxpsz = maxpsz;
- if (set_maxblk)
- (void) mi_set_sth_maxblk(q, mss);
+ (void) proto_set_maxpsz(q, connp, maxpsz);
+ if (!(IPCL_IS_NONSTR(connp))) {
+ /* XXX do it in set_maxpsz()? */
+ tcp->tcp_wq->q_maxpsz = maxpsz;
+ }
+ if (set_maxblk)
+ (void) proto_set_tx_maxblk(q, connp, mss);
return (mss);
}
@@ -9687,116 +9177,74 @@ tcp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
return (tcp_open(q, devp, flag, sflag, credp, B_TRUE));
}
-static int
-tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
- boolean_t isv6)
+static conn_t *
+tcp_create_common(queue_t *q, cred_t *credp, boolean_t isv6,
+ boolean_t issocket, int *errorp)
{
tcp_t *tcp = NULL;
conn_t *connp;
int err;
- vmem_t *minor_arena = NULL;
- dev_t conn_dev;
zoneid_t zoneid;
- tcp_stack_t *tcps = NULL;
+ tcp_stack_t *tcps;
+ squeue_t *sqp;
- if (q->q_ptr != NULL)
- return (0);
+ ASSERT(errorp != NULL);
+ /*
+ * Find the proper zoneid and netstack.
+ */
+ /*
+ * Special case for install: miniroot needs to be able to
+ * access files via NFS as though it were always in the
+ * global zone.
+ */
+ if (credp == kcred && nfs_global_client_only != 0) {
+ zoneid = GLOBAL_ZONEID;
+ tcps = netstack_find_by_stackid(GLOBAL_NETSTACKID)->
+ netstack_tcp;
+ ASSERT(tcps != NULL);
+ } else {
+ netstack_t *ns;
- if (sflag == MODOPEN)
- return (EINVAL);
+ ns = netstack_find_by_cred(credp);
+ ASSERT(ns != NULL);
+ tcps = ns->netstack_tcp;
+ ASSERT(tcps != NULL);
- if (!(flag & SO_ACCEPTOR)) {
/*
- * Special case for install: miniroot needs to be able to
- * access files via NFS as though it were always in the
- * global zone.
+ * For exclusive stacks we set the zoneid to zero
+ * to make TCP operate as if in the global zone.
*/
- if (credp == kcred && nfs_global_client_only != 0) {
+ if (tcps->tcps_netstack->netstack_stackid !=
+ GLOBAL_NETSTACKID)
zoneid = GLOBAL_ZONEID;
- tcps = netstack_find_by_stackid(GLOBAL_NETSTACKID)->
- netstack_tcp;
- ASSERT(tcps != NULL);
- } else {
- netstack_t *ns;
-
- ns = netstack_find_by_cred(credp);
- ASSERT(ns != NULL);
- tcps = ns->netstack_tcp;
- ASSERT(tcps != NULL);
-
- /*
- * For exclusive stacks we set the zoneid to zero
- * to make TCP operate as if in the global zone.
- */
- if (tcps->tcps_netstack->netstack_stackid !=
- GLOBAL_NETSTACKID)
- zoneid = GLOBAL_ZONEID;
- else
- zoneid = crgetzoneid(credp);
- }
- /*
- * For stackid zero this is done from strplumb.c, but
- * non-zero stackids are handled here.
- */
- if (tcps->tcps_g_q == NULL &&
- tcps->tcps_netstack->netstack_stackid !=
- GLOBAL_NETSTACKID) {
- tcp_g_q_setup(tcps);
- }
- }
-
- if ((ip_minor_arena_la != NULL) && (flag & SO_SOCKSTR) &&
- ((conn_dev = inet_minor_alloc(ip_minor_arena_la)) != 0)) {
- minor_arena = ip_minor_arena_la;
- } else {
- /*
- * Either minor numbers in the large arena were exhausted
- * or a non socket application is doing the open.
- * Try to allocate from the small arena.
- */
- if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) {
- if (tcps != NULL)
- netstack_rele(tcps->tcps_netstack);
- return (EBUSY);
- }
- minor_arena = ip_minor_arena_sa;
+ else
+ zoneid = crgetzoneid(credp);
}
- ASSERT(minor_arena != NULL);
-
- *devp = makedevice(getemajor(*devp), (minor_t)conn_dev);
-
- if (flag & SO_ACCEPTOR) {
- /* No netstack_find_by_cred, hence no netstack_rele needed */
- ASSERT(tcps == NULL);
- q->q_qinfo = &tcp_acceptor_rinit;
- /*
- * the conn_dev and minor_arena will be subsequently used by
- * tcp_wput_accept() and tcpclose_accept() to figure out the
- * minor device number for this connection from the q_ptr.
- */
- RD(q)->q_ptr = (void *)conn_dev;
- WR(q)->q_qinfo = &tcp_acceptor_winit;
- WR(q)->q_ptr = (void *)minor_arena;
- qprocson(q);
- return (0);
+ /*
+ * For stackid zero this is done from strplumb.c, but
+ * non-zero stackids are handled here.
+ */
+ if (tcps->tcps_g_q == NULL &&
+ tcps->tcps_netstack->netstack_stackid !=
+ GLOBAL_NETSTACKID) {
+ tcp_g_q_setup(tcps);
}
- connp = (conn_t *)tcp_get_conn(IP_SQUEUE_GET(lbolt), tcps);
+ sqp = IP_SQUEUE_GET((uint_t)gethrtime());
+ connp = (conn_t *)tcp_get_conn(sqp, tcps);
/*
* Both tcp_get_conn and netstack_find_by_cred incremented refcnt,
* so we drop it by one.
*/
netstack_rele(tcps->tcps_netstack);
if (connp == NULL) {
- inet_minor_free(minor_arena, conn_dev);
- q->q_ptr = NULL;
- return (ENOSR);
+ *errorp = ENOSR;
+ return (NULL);
}
- connp->conn_sqp = IP_SQUEUE_GET(lbolt);
+ connp->conn_sqp = sqp;
connp->conn_initial_sqp = connp->conn_sqp;
tcp = connp->conn_tcp;
- q->q_ptr = WR(q)->q_ptr = connp;
if (isv6) {
connp->conn_flags |= (IPCL_TCP6|IPCL_ISV6);
connp->conn_send = ip_output_v6;
@@ -9838,45 +9286,135 @@ tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
if (getpflags(NET_MAC_AWARE, credp) != 0)
connp->conn_mac_exempt = B_TRUE;
- connp->conn_dev = conn_dev;
- connp->conn_minor_arena = minor_arena;
+ connp->conn_dev = NULL;
+ if (issocket) {
+ connp->conn_flags |= IPCL_SOCKET;
+ tcp->tcp_issocket = 1;
+ }
- ASSERT(q->q_qinfo == &tcp_rinitv4 || q->q_qinfo == &tcp_rinitv6);
- ASSERT(WR(q)->q_qinfo == &tcp_winit);
+ tcp->tcp_recv_hiwater = tcps->tcps_recv_hiwat;
+ tcp->tcp_rwnd = tcps->tcps_recv_hiwat;
+ tcp->tcp_recv_lowater = tcp_rinfo.mi_lowat;
- if (flag & SO_SOCKSTR) {
+ /* Non-zero default values */
+ connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
+
+ if (q == NULL) {
/*
- * No need to insert a socket in tcp acceptor hash.
- * If it was a socket acceptor stream, we dealt with
- * it above. A socket listener can never accept a
- * connection and doesn't need acceptor_id.
+ * Create a helper stream for non-STREAMS socket.
*/
- connp->conn_flags |= IPCL_SOCKET;
- tcp->tcp_issocket = 1;
- WR(q)->q_qinfo = &tcp_sock_winit;
+ err = ip_create_helper_stream(connp, tcps->tcps_ldi_ident);
+ if (err != 0) {
+ ip1dbg(("tcp_create: create of IP helper stream "
+ "failed\n"));
+ CONN_DEC_REF(connp);
+ *errorp = err;
+ return (NULL);
+ }
+ q = connp->conn_rq;
} else {
-#ifdef _ILP32
- tcp->tcp_acceptor_id = (t_uscalar_t)RD(q);
-#else
- tcp->tcp_acceptor_id = conn_dev;
-#endif /* _ILP32 */
- tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp);
+ RD(q)->q_hiwat = tcps->tcps_recv_hiwat;
}
+ SOCK_CONNID_INIT(tcp->tcp_connid);
err = tcp_init(tcp, q);
if (err != 0) {
- inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
- tcp_acceptor_hash_remove(tcp);
CONN_DEC_REF(connp);
+ *errorp = err;
+ return (NULL);
+ }
+
+ return (connp);
+}
+
+static int
+tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
+ boolean_t isv6)
+{
+ tcp_t *tcp = NULL;
+ conn_t *connp = NULL;
+ int err;
+ vmem_t *minor_arena = NULL;
+ dev_t conn_dev;
+ boolean_t issocket;
+
+ if (q->q_ptr != NULL)
+ return (0);
+
+ if (sflag == MODOPEN)
+ return (EINVAL);
+
+ if ((ip_minor_arena_la != NULL) && (flag & SO_SOCKSTR) &&
+ ((conn_dev = inet_minor_alloc(ip_minor_arena_la)) != 0)) {
+ minor_arena = ip_minor_arena_la;
+ } else {
+ /*
+ * Either minor numbers in the large arena were exhausted
+ * or a non socket application is doing the open.
+ * Try to allocate from the small arena.
+ */
+ if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) {
+ return (EBUSY);
+ }
+ minor_arena = ip_minor_arena_sa;
+ }
+
+ ASSERT(minor_arena != NULL);
+
+ *devp = makedevice(getmajor(*devp), (minor_t)conn_dev);
+
+ if (flag & SO_FALLBACK) {
+ /*
+ * Non streams socket needs a stream to fallback to
+ */
+ RD(q)->q_ptr = (void *)conn_dev;
+ WR(q)->q_qinfo = &tcp_fallback_sock_winit;
+ WR(q)->q_ptr = (void *)minor_arena;
+ qprocson(q);
+ return (0);
+ } else if (flag & SO_ACCEPTOR) {
+ q->q_qinfo = &tcp_acceptor_rinit;
+ /*
+ * the conn_dev and minor_arena will be subsequently used by
+ * tcp_wput_accept() and tcpclose_accept() to figure out the
+ * minor device number for this connection from the q_ptr.
+ */
+ RD(q)->q_ptr = (void *)conn_dev;
+ WR(q)->q_qinfo = &tcp_acceptor_winit;
+ WR(q)->q_ptr = (void *)minor_arena;
+ qprocson(q);
+ return (0);
+ }
+
+ issocket = flag & SO_SOCKSTR;
+ connp = tcp_create_common(q, credp, isv6, issocket, &err);
+
+ if (connp == NULL) {
+ inet_minor_free(minor_arena, conn_dev);
q->q_ptr = WR(q)->q_ptr = NULL;
return (err);
}
- RD(q)->q_hiwat = tcps->tcps_recv_hiwat;
- tcp->tcp_rwnd = tcps->tcps_recv_hiwat;
+ q->q_ptr = WR(q)->q_ptr = connp;
+
+ connp->conn_dev = conn_dev;
+ connp->conn_minor_arena = minor_arena;
+
+ ASSERT(q->q_qinfo == &tcp_rinitv4 || q->q_qinfo == &tcp_rinitv6);
+ ASSERT(WR(q)->q_qinfo == &tcp_winit);
+
+ if (issocket) {
+ WR(q)->q_qinfo = &tcp_sock_winit;
+ } else {
+ tcp = connp->conn_tcp;
+#ifdef _ILP32
+ tcp->tcp_acceptor_id = (t_uscalar_t)RD(q);
+#else
+ tcp->tcp_acceptor_id = conn_dev;
+#endif /* _ILP32 */
+ tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp);
+ }
- /* Non-zero default values */
- connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
/*
* Put the ref for TCP. Ref for IP was already put
* by ipcl_conn_create. Also Make the conn_t globally
@@ -9922,7 +9460,7 @@ tcp_allow_connopt_set(int level, int name)
}
/*
- * This routine gets default values of certain options whose default
+ * this routine gets default values of certain options whose default
* values are maintained by protocol specific code
*/
/* ARGSUSED */
@@ -9975,15 +9513,10 @@ tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
return (sizeof (int));
}
-
-/*
- * TCP routine to get the values of options.
- */
-int
-tcp_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
+static int
+tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
{
int *i1 = (int *)ptr;
- conn_t *connp = Q_TO_CONN(q);
tcp_t *tcp = connp->conn_tcp;
ip6_pkt_t *ipp = &tcp->tcp_sticky_ipp;
@@ -10028,7 +9561,7 @@ tcp_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
*i1 = tcp->tcp_xmit_hiwater;
break;
case SO_RCVBUF:
- *i1 = RD(q)->q_hiwat;
+ *i1 = tcp->tcp_recv_hiwater;
break;
case SO_SND_COPYAVOID:
*i1 = tcp->tcp_snd_zcopy_on ?
@@ -10052,6 +9585,8 @@ tcp_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
case SO_DOMAIN:
*i1 = tcp->tcp_family;
break;
+ case SO_ACCEPTCONN:
+ *i1 = (tcp->tcp_state == TCPS_LISTEN);
default:
return (-1);
}
@@ -10293,22 +9828,84 @@ tcp_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
}
/*
+ * TCP routine to get the values of options (queue_t based entry point).
+ * Maps the queue to its conn_t with Q_TO_CONN() and hands the request to
+ * tcp_opt_get(), returning that function's result unchanged.
+ */
+int
+tcp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
+{
+ return (tcp_opt_get(Q_TO_CONN(q), level, name, ptr));
+}
+
+/*
+ * getsockopt entry point taking a sock_lower_handle_t (which is cast to
+ * the conn_t for the socket).
+ *
+ * On entry *optlen is the size of the caller's buffer at optvalp; on
+ * success it is overwritten with the number of bytes copied out (it is a
+ * value-result argument). Returns 0 on success or a UNIX errno.
+ *
+ * The request is validated with proto_opt_check(); a negative result from
+ * that check is converted with proto_tlitosyserr(). The value is then
+ * fetched by tcp_opt_get() between squeue_synch_enter() and
+ * squeue_synch_exit(). If tcp_opt_get() returns a negative length the
+ * request is passed on to ip_get_options().
+ */
+int
+tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
+ void *optvalp, socklen_t *optlen, cred_t *cr)
+{
+ conn_t *connp = (conn_t *)proto_handle;
+ squeue_t *sqp = connp->conn_sqp;
+ int error;
+ t_uscalar_t max_optbuf_len;
+ void *optvalp_buf;
+ int len;
+
+ error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
+ tcp_opt_obj.odb_opt_des_arr,
+ tcp_opt_obj.odb_opt_arr_cnt,
+ tcp_opt_obj.odb_topmost_tpiprovider,
+ B_FALSE, B_TRUE, cr);
+ if (error != 0) {
+ if (error < 0) {
+ error = proto_tlitosyserr(-error);
+ }
+ return (error);
+ }
+
+ /* Scratch buffer sized by proto_opt_check(); freed on every path below */
+ optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
+
+ error = squeue_synch_enter(sqp, connp, 0);
+ if (error == ENOMEM) {
+ /* Do not leak the scratch buffer when the squeue cannot be entered */
+ kmem_free(optvalp_buf, max_optbuf_len);
+ return (ENOMEM);
+ }
+
+ len = tcp_opt_get(connp, level, option_name, optvalp_buf);
+ squeue_synch_exit(sqp, connp);
+
+ if (len < 0) {
+ /*
+ * Pass on to IP
+ */
+ kmem_free(optvalp_buf, max_optbuf_len);
+ return (ip_get_options(connp, level, option_name,
+ optvalp, optlen, cr));
+ } else {
+ /*
+ * update optlen and copy option value
+ */
+ t_uscalar_t size = MIN(len, *optlen);
+ bcopy(optvalp_buf, optvalp, size);
+ bcopy(&size, optlen, sizeof (size));
+
+ kmem_free(optvalp_buf, max_optbuf_len);
+ return (0);
+ }
+}
+
+/*
* We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
* Parameters are assumed to be verified by the caller.
*/
/* ARGSUSED */
int
-tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name,
+tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
- void *thisdg_attrs, cred_t *cr, mblk_t *mblk)
+ void *thisdg_attrs, cred_t *cr)
{
- conn_t *connp = Q_TO_CONN(q);
tcp_t *tcp = connp->conn_tcp;
int *i1 = (int *)invalp;
boolean_t onoff = (*i1 == 0) ? 0 : 1;
boolean_t checkonly;
int reterr;
- tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps;
+ tcp_stack_t *tcps = tcp->tcp_tcps;
switch (optset_context) {
case SETFN_OPTCOM_CHECKONLY:
@@ -10371,7 +9968,6 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name,
* of passed in length is done. It is assumed *_optcom_req()
* routines do the right thing.
*/
-
switch (level) {
case SOL_SOCKET:
switch (name) {
@@ -10408,7 +10004,7 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name,
break;
case SO_KEEPALIVE:
if (checkonly) {
- /* T_CHECK case */
+ /* check only case */
break;
}
@@ -10462,8 +10058,11 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name,
}
break;
case SO_OOBINLINE:
- if (!checkonly)
+ if (!checkonly) {
tcp->tcp_oobinline = onoff;
+ if (IPCL_IS_NONSTR(tcp->tcp_connp))
+ proto_set_rx_oob_opt(connp, onoff);
+ }
break;
case SO_DGRAM_ERRIND:
if (!checkonly)
@@ -10740,7 +10339,6 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name,
/*
* Only sticky options; no ancillary data
*/
- ASSERT(thisdg_attrs == NULL);
ipp = &tcp->tcp_sticky_ipp;
switch (name) {
@@ -10764,22 +10362,15 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name,
(uint8_t)*i1;
ipp->ipp_fields |= IPPF_UNICAST_HOPS;
}
- reterr = tcp_build_hdrs(q, tcp);
+ reterr = tcp_build_hdrs(tcp);
if (reterr != 0)
return (reterr);
}
break;
case IPV6_BOUND_IF:
if (!checkonly) {
- int error = 0;
-
tcp->tcp_bound_if = *i1;
- error = ip_opt_set_ill(tcp->tcp_connp, *i1,
- B_TRUE, checkonly, level, name, mblk);
- if (error != 0) {
- *outlenp = 0;
- return (error);
- }
+ PASS_OPT_TO_IP(connp);
}
break;
/*
@@ -10795,6 +10386,7 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name,
~TCP_IPV6_RECVPKTINFO;
/* Force it to be sent up with the next msg */
tcp->tcp_recvifindex = 0;
+ PASS_OPT_TO_IP(connp);
}
break;
case IPV6_RECVTCLASS:
@@ -10805,6 +10397,7 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name,
else
tcp->tcp_ipv6_recvancillary &=
~TCP_IPV6_RECVTCLASS;
+ PASS_OPT_TO_IP(connp);
}
break;
case IPV6_RECVHOPLIMIT:
@@ -10817,6 +10410,7 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name,
~TCP_IPV6_RECVHOPLIMIT;
/* Force it to be sent up with the next msg */
tcp->tcp_recvhops = 0xffffffffU;
+ PASS_OPT_TO_IP(connp);
}
break;
case IPV6_RECVHOPOPTS:
@@ -10827,6 +10421,7 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name,
else
tcp->tcp_ipv6_recvancillary &=
~TCP_IPV6_RECVHOPOPTS;
+ PASS_OPT_TO_IP(connp);
}
break;
case IPV6_RECVDSTOPTS:
@@ -10837,6 +10432,7 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name,
else
tcp->tcp_ipv6_recvancillary &=
~TCP_IPV6_RECVDSTOPTS;
+ PASS_OPT_TO_IP(connp);
}
break;
case _OLD_IPV6_RECVDSTOPTS:
@@ -10857,6 +10453,7 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name,
else
tcp->tcp_ipv6_recvancillary &=
~TCP_IPV6_RECVRTHDR;
+ PASS_OPT_TO_IP(connp);
}
break;
case IPV6_RECVRTHDRDSTOPTS:
@@ -10867,6 +10464,7 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name,
else
tcp->tcp_ipv6_recvancillary &=
~TCP_IPV6_RECVRTDSTOPTS;
+ PASS_OPT_TO_IP(connp);
}
break;
case IPV6_PKTINFO:
@@ -10890,11 +10488,11 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name,
if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr))
return (EINVAL);
/*
- * ip6_set_pktinfo() validates the source
- * address and interface index.
+ * IP will validate the source address and
+ * interface index.
*/
- reterr = ip6_set_pktinfo(cr, tcp->tcp_connp,
- pkti, mblk);
+ reterr = ip_set_options(tcp->tcp_connp, level,
+ name, invalp, inlen, cr);
if (reterr != 0)
return (reterr);
ipp->ipp_ifindex = pkti->ipi6_ifindex;
@@ -10908,9 +10506,10 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name,
else
ipp->ipp_fields &= ~IPPF_ADDR;
}
- reterr = tcp_build_hdrs(q, tcp);
+ reterr = tcp_build_hdrs(tcp);
if (reterr != 0)
return (reterr);
+ PASS_OPT_TO_IP(connp);
break;
case IPV6_TCLASS:
if (inlen != 0 && inlen != sizeof (int))
@@ -10931,7 +10530,7 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name,
}
ipp->ipp_fields |= IPPF_TCLASS;
}
- reterr = tcp_build_hdrs(q, tcp);
+ reterr = tcp_build_hdrs(tcp);
if (reterr != 0)
return (reterr);
break;
@@ -10962,9 +10561,10 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name,
else
ipp->ipp_fields &= ~IPPF_NEXTHOP;
}
- reterr = tcp_build_hdrs(q, tcp);
+ reterr = tcp_build_hdrs(tcp);
if (reterr != 0)
return (reterr);
+ PASS_OPT_TO_IP(connp);
break;
case IPV6_HOPOPTS: {
ip6_hbh_t *hopts = (ip6_hbh_t *)invalp;
@@ -10989,7 +10589,7 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name,
ipp->ipp_fields &= ~IPPF_HOPOPTS;
else
ipp->ipp_fields |= IPPF_HOPOPTS;
- reterr = tcp_build_hdrs(q, tcp);
+ reterr = tcp_build_hdrs(tcp);
if (reterr != 0)
return (reterr);
break;
@@ -11017,7 +10617,7 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name,
ipp->ipp_fields &= ~IPPF_RTDSTOPTS;
else
ipp->ipp_fields |= IPPF_RTDSTOPTS;
- reterr = tcp_build_hdrs(q, tcp);
+ reterr = tcp_build_hdrs(tcp);
if (reterr != 0)
return (reterr);
break;
@@ -11045,7 +10645,7 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name,
ipp->ipp_fields &= ~IPPF_DSTOPTS;
else
ipp->ipp_fields |= IPPF_DSTOPTS;
- reterr = tcp_build_hdrs(q, tcp);
+ reterr = tcp_build_hdrs(tcp);
if (reterr != 0)
return (reterr);
break;
@@ -11073,14 +10673,15 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name,
ipp->ipp_fields &= ~IPPF_RTHDR;
else
ipp->ipp_fields |= IPPF_RTHDR;
- reterr = tcp_build_hdrs(q, tcp);
+ reterr = tcp_build_hdrs(tcp);
if (reterr != 0)
return (reterr);
break;
}
case IPV6_V6ONLY:
- if (!checkonly)
+ if (!checkonly) {
tcp->tcp_connp->conn_ipv6_v6only = onoff;
+ }
break;
case IPV6_USE_MIN_MTU:
if (inlen != sizeof (int))
@@ -11140,6 +10741,80 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name,
return (0);
}
+/*
+ * Queue-based (TPI) entry point for setting a TCP option.  The conn_t for
+ * queue q is obtained with Q_TO_CONN() and all remaining arguments except
+ * mblk are forwarded to tcp_opt_set(), whose return value is handed back
+ * unchanged.  The mblk argument is not used here.
+ */
+/* ARGSUSED */
+int
+tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
+    uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
+    void *thisdg_attrs, cred_t *cr, mblk_t *mblk)
+{
+	return (tcp_opt_set(Q_TO_CONN(q), optset_context, level, name, inlen,
+	    invalp, outlenp, outvalp, thisdg_attrs, cr));
+}
+
+
+/*
+ * Socket downcall that sets a socket option on a TCP connection.
+ *
+ * TCP_NODELAY is applied directly under tcp_non_sq_lock without entering
+ * the squeue.  Every other option is validated with proto_opt_check() and
+ * applied by tcp_opt_set() while the squeue is held; a negative return from
+ * tcp_opt_set() causes the option to be handed to IP via ip_set_options().
+ *
+ * Returns 0 on success or a UNIX error.
+ */
+int
+tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
+    const void *optvalp, socklen_t optlen, cred_t *cr)
+{
+	conn_t		*connp = (conn_t *)proto_handle;
+	squeue_t	*sqp = connp->conn_sqp;
+	int		error;
+
+	/*
+	 * Entering the squeue synchronously can result in a context switch,
+	 * which can cause a rather severe performance degradation. So we try
+	 * to handle whatever options we can without entering the squeue.
+	 */
+	if (level == IPPROTO_TCP && option_name == TCP_NODELAY) {
+		tcp_t	*tcp;
+
+		if (optlen != sizeof (int32_t))
+			return (EINVAL);
+
+		tcp = connp->conn_tcp;
+		mutex_enter(&tcp->tcp_non_sq_lock);
+		/* Non-zero disables Nagle (limit of 1), zero restores MSS. */
+		if (*(int *)optvalp != 0)
+			tcp->tcp_naglim = 1;
+		else
+			tcp->tcp_naglim = tcp->tcp_mss;
+		mutex_exit(&tcp->tcp_non_sq_lock);
+		return (0);
+	}
+
+	if (squeue_synch_enter(sqp, connp, 0) == ENOMEM)
+		return (ENOMEM);
+
+	error = proto_opt_check(level, option_name, optlen, NULL,
+	    tcp_opt_obj.odb_opt_des_arr,
+	    tcp_opt_obj.odb_opt_arr_cnt,
+	    tcp_opt_obj.odb_topmost_tpiprovider,
+	    B_TRUE, B_FALSE, cr);
+	if (error != 0) {
+		/* Negative values are TLI errors; map them to errnos. */
+		int syserr = (error < 0) ? proto_tlitosyserr(-error) : error;
+
+		squeue_synch_exit(sqp, connp);
+		return (syserr);
+	}
+
+	error = tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name,
+	    optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp,
+	    NULL, cr);
+	squeue_synch_exit(sqp, connp);
+
+	if (error >= 0)
+		return (error);
+
+	/*
+	 * Pass on to ip
+	 */
+	return (ip_set_options(connp, level, option_name, optvalp,
+	    optlen, cr));
+}
+
+
/*
* Update tcp_sticky_hdrs based on tcp_sticky_ipp.
* The headers include ip6i_t (if needed), ip6_t, any sticky extension
@@ -11148,7 +10823,7 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name,
* Returns failure if can't allocate memory.
*/
static int
-tcp_build_hdrs(queue_t *q, tcp_t *tcp)
+tcp_build_hdrs(tcp_t *tcp)
{
char *hdrs;
uint_t hdrs_len;
@@ -11157,6 +10832,7 @@ tcp_build_hdrs(queue_t *q, tcp_t *tcp)
ip6_pkt_t *ipp = &tcp->tcp_sticky_ipp;
in6_addr_t src, dst;
tcp_stack_t *tcps = tcp->tcp_tcps;
+ conn_t *connp = tcp->tcp_connp;
/*
* save the existing tcp header and source/dest IP addresses
@@ -11241,7 +10917,8 @@ tcp_build_hdrs(queue_t *q, tcp_t *tcp)
}
/* Try to get everything in a single mblk */
- (void) mi_set_sth_wroff(RD(q), hdrs_len + tcps->tcps_wroff_xtra);
+ (void) proto_set_tx_wroff(tcp->tcp_rq, connp,
+ hdrs_len + tcps->tcps_wroff_xtra);
return (0);
}
@@ -11368,6 +11045,7 @@ tcp_opt_set_header(tcp_t *tcp, boolean_t checkonly, uchar_t *ptr, uint_t len)
uint8_t *ip_optp;
tcph_t *new_tcph;
tcp_stack_t *tcps = tcp->tcp_tcps;
+ conn_t *connp = tcp->tcp_connp;
if ((len > TCP_MAX_IP_OPTIONS_LENGTH) || (len & 0x3))
return (EINVAL);
@@ -11408,7 +11086,7 @@ tcp_opt_set_header(tcp_t *tcp, boolean_t checkonly, uchar_t *ptr, uint_t len)
tcp->tcp_hdr_len = len + tcph_len;
if (!TCP_IS_DETACHED(tcp)) {
/* Always allocate room for all options. */
- (void) mi_set_sth_wroff(tcp->tcp_rq,
+ (void) proto_set_tx_wroff(tcp->tcp_rq, connp,
TCP_MAX_COMBINED_HEADER_LENGTH + tcps->tcps_wroff_xtra);
}
return (0);
@@ -11721,26 +11399,55 @@ tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp)
tcp->tcp_reass_tail = mp;
}
+/*
+ * Try to re-open the receive window up to tcp_recv_hiwater.
+ *
+ * Returns TH_ACK_NEEDED when a window update should be sent to the peer
+ * right away, otherwise 0.  tcp_rwnd is only raised when the gain over the
+ * window the peer currently believes in is at least one MSS.
+ */
+static uint_t
+tcp_rwnd_reopen(tcp_t *tcp)
+{
+	uint_t	flags = 0;
+	uint_t	peer_win;
+
+	/*
+	 * Start from the window we last advertised (th_win scaled by our
+	 * receive window shift) and subtract what has arrived since the
+	 * last ACK; the result is the send window as the peer computes it.
+	 */
+	peer_win = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win))
+	    << tcp->tcp_rcv_ws;
+	peer_win -= tcp->tcp_rnxt - tcp->tcp_rack;
+
+	/*
+	 * Receiver SWS avoidance: leave the window alone unless opening it
+	 * to the maximum increases it by at least 1 MSS.
+	 */
+	if (tcp->tcp_recv_hiwater - peer_win < tcp->tcp_mss)
+		return (flags);
+
+	/*
+	 * When the peer's view of the window is below the maximum number
+	 * of deferred-ack segments, ask for an immediate update.
+	 */
+	if (peer_win < tcp->tcp_rack_cur_max * tcp->tcp_mss) {
+		BUMP_MIB(&tcp->tcp_tcps->tcps_mib, tcpOutWinUpdate);
+		flags = TH_ACK_NEEDED;
+	}
+	tcp->tcp_rwnd = tcp->tcp_recv_hiwater;
+
+	return (flags);
+}
+
+
/*
* Send up all messages queued on tcp_rcv_list.
*/
static uint_t
-tcp_rcv_drain(queue_t *q, tcp_t *tcp)
+tcp_rcv_drain(tcp_t *tcp)
{
mblk_t *mp;
uint_t ret = 0;
- uint_t thwin;
#ifdef DEBUG
uint_t cnt = 0;
#endif
- tcp_stack_t *tcps = tcp->tcp_tcps;
+ queue_t *q = tcp->tcp_rq;
/* Can't drain on an eager connection */
if (tcp->tcp_listener != NULL)
return (ret);
- /* Can't be sodirect enabled */
- ASSERT(SOD_NOT_ENABLED(tcp));
+ /* Can't be a non-STREAMS connection or sodirect enabled */
+ ASSERT((!IPCL_IS_NONSTR(tcp->tcp_connp)) && SOD_NOT_ENABLED(tcp));
/* No need for the push timer now. */
if (tcp->tcp_push_tid != 0) {
@@ -11758,7 +11465,8 @@ tcp_rcv_drain(queue_t *q, tcp_t *tcp)
* some work.
*/
if ((tcp->tcp_fused || tcp->tcp_fused_sigurg)) {
- ASSERT(tcp->tcp_fused_sigurg_mp != NULL);
+ ASSERT(IPCL_IS_NONSTR(tcp->tcp_connp) ||
+ tcp->tcp_fused_sigurg_mp != NULL);
if (tcp_fuse_rcv_drain(q, tcp, tcp->tcp_fused ? NULL :
&tcp->tcp_fused_sigurg_mp))
return (ret);
@@ -11779,32 +11487,16 @@ tcp_rcv_drain(queue_t *q, tcp_t *tcp)
}
putnext(q, mp);
}
+#ifdef DEBUG
ASSERT(cnt == tcp->tcp_rcv_cnt);
+#endif
tcp->tcp_rcv_last_head = NULL;
tcp->tcp_rcv_last_tail = NULL;
tcp->tcp_rcv_cnt = 0;
- /* Learn the latest rwnd information that we sent to the other side. */
- thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win))
- << tcp->tcp_rcv_ws;
- /* This is peer's calculated send window (our receive window). */
- thwin -= tcp->tcp_rnxt - tcp->tcp_rack;
- /*
- * Increase the receive window to max. But we need to do receiver
- * SWS avoidance. This means that we need to check the increase of
- * of receive window is at least 1 MSS.
- */
- if (canputnext(q) && (q->q_hiwat - thwin >= tcp->tcp_mss)) {
- /*
- * If the window that the other side knows is less than max
- * deferred acks segments, send an update immediately.
- */
- if (thwin < tcp->tcp_rack_cur_max * tcp->tcp_mss) {
- BUMP_MIB(&tcps->tcps_mib, tcpOutWinUpdate);
- ret = TH_ACK_NEEDED;
- }
- tcp->tcp_rwnd = q->q_hiwat;
- }
+ if (canputnext(q))
+ return (tcp_rwnd_reopen(tcp));
+
return (ret);
}
@@ -12993,8 +12685,27 @@ tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2)
tcp->tcp_remote)] = tcp->tcp_remote;
}
mutex_exit(&listener->tcp_eager_lock);
- if (need_send_conn_ind)
- putnext(listener->tcp_rq, mp);
+ if (need_send_conn_ind) {
+ if (IPCL_IS_NONSTR(lconnp)) {
+ ASSERT(tcp->tcp_listener == listener);
+ ASSERT(tcp->tcp_saved_listener == listener);
+ if ((*lconnp->conn_upcalls->su_newconn)
+ (lconnp->conn_upper_handle,
+ (sock_lower_handle_t)tcp->tcp_connp,
+ &sock_tcp_downcalls, DB_CRED(mp), DB_CPID(mp),
+ &tcp->tcp_connp->conn_upcalls) != NULL) {
+ /*
+ * Keep the message around
+ * in case of fallback
+ */
+ tcp->tcp_conn.tcp_eager_conn_ind = mp;
+ } else {
+ freemsg(mp);
+ }
+ } else {
+ putnext(listener->tcp_rq, mp);
+ }
+ }
}
mblk_t *
@@ -13223,6 +12934,7 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2)
rptr = mp->b_rptr;
}
ASSERT(DB_TYPE(mp) == M_DATA);
+ ASSERT(mp->b_next == NULL);
tcph = (tcph_t *)&rptr[ip_hdr_len];
seg_seq = ABE32_TO_U32(tcph->th_seq);
@@ -13339,8 +13051,8 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2)
* The following changes our rwnd to be a multiple of the
* MIN(peer MSS, our MSS) for performance reason.
*/
- (void) tcp_rwnd_set(tcp, MSS_ROUNDUP(tcp->tcp_rq->q_hiwat,
- tcp->tcp_mss));
+ (void) tcp_rwnd_set(tcp,
+ MSS_ROUNDUP(tcp->tcp_recv_hiwater, tcp->tcp_mss));
/* Is the other end ECN capable? */
if (tcp->tcp_ecn_ok) {
@@ -13361,12 +13073,13 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2)
if (!TCP_IS_DETACHED(tcp)) {
/* Allocate room for SACK options if needed. */
if (tcp->tcp_snd_sack_ok) {
- (void) mi_set_sth_wroff(tcp->tcp_rq,
- tcp->tcp_hdr_len + TCPOPT_MAX_SACK_LEN +
+ (void) proto_set_tx_wroff(tcp->tcp_rq, connp,
+ tcp->tcp_hdr_len +
+ TCPOPT_MAX_SACK_LEN +
(tcp->tcp_loopback ? 0 :
tcps->tcps_wroff_xtra));
} else {
- (void) mi_set_sth_wroff(tcp->tcp_rq,
+ (void) proto_set_tx_wroff(tcp->tcp_rq, connp,
tcp->tcp_hdr_len +
(tcp->tcp_loopback ? 0 :
tcps->tcps_wroff_xtra));
@@ -13466,8 +13179,18 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2)
BUMP_LOCAL(tcp->tcp_obsegs);
BUMP_MIB(&tcps->tcps_mib, tcpOutAck);
- /* Send up T_CONN_CON */
- putnext(tcp->tcp_rq, mp1);
+ if (!IPCL_IS_NONSTR(connp)) {
+ /* Send up T_CONN_CON */
+ putnext(tcp->tcp_rq, mp1);
+ } else {
+ (*connp->conn_upcalls->
+ su_connected)
+ (connp->conn_upper_handle,
+ tcp->tcp_connid,
+ DB_CRED(mp1),
+ DB_CPID(mp1));
+ freemsg(mp1);
+ }
freemsg(mp);
return;
@@ -13481,7 +13204,15 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2)
*/
TCP_STAT(tcps, tcp_fusion_unfusable);
tcp->tcp_unfusable = B_TRUE;
- putnext(tcp->tcp_rq, mp1);
+ if (!IPCL_IS_NONSTR(connp)) {
+ putnext(tcp->tcp_rq, mp1);
+ } else {
+ (*connp->conn_upcalls->su_connected)
+ (connp->conn_upper_handle,
+ tcp->tcp_connid, DB_CRED(mp1),
+ DB_CPID(mp1));
+ freemsg(mp1);
+ }
}
/*
@@ -13835,31 +13566,40 @@ try_again:;
if ((flags & TH_URG) &&
(!tcp->tcp_urp_last_valid || SEQ_GT(urp + seg_seq,
tcp->tcp_urp_last))) {
- mp1 = allocb(0, BPRI_MED);
- if (mp1 == NULL) {
- freemsg(mp);
- return;
- }
- if (!TCP_IS_DETACHED(tcp) &&
- !putnextctl1(tcp->tcp_rq, M_PCSIG,
- SIGURG)) {
- /* Try again on the rexmit. */
- freemsg(mp1);
- freemsg(mp);
- return;
+ if (IPCL_IS_NONSTR(connp)) {
+ if (!TCP_IS_DETACHED(tcp)) {
+ (*connp->conn_upcalls->
+ su_signal_oob)
+ (connp->conn_upper_handle,
+ urp);
+ }
+ } else {
+ mp1 = allocb(0, BPRI_MED);
+ if (mp1 == NULL) {
+ freemsg(mp);
+ return;
+ }
+ if (!TCP_IS_DETACHED(tcp) &&
+ !putnextctl1(tcp->tcp_rq,
+ M_PCSIG, SIGURG)) {
+ /* Try again on the rexmit. */
+ freemsg(mp1);
+ freemsg(mp);
+ return;
+ }
+ /*
+ * If the next byte would be the mark
+ * then mark with MARKNEXT else mark
+ * with NOTMARKNEXT.
+ */
+ if (gap == 0 && urp == 0)
+ mp1->b_flag |= MSGMARKNEXT;
+ else
+ mp1->b_flag |= MSGNOTMARKNEXT;
+ freemsg(tcp->tcp_urp_mark_mp);
+ tcp->tcp_urp_mark_mp = mp1;
+ flags |= TH_SEND_URP_MARK;
}
- /*
- * If the next byte would be the mark
- * then mark with MARKNEXT else mark
- * with NOTMARKNEXT.
- */
- if (gap == 0 && urp == 0)
- mp1->b_flag |= MSGMARKNEXT;
- else
- mp1->b_flag |= MSGNOTMARKNEXT;
- freemsg(tcp->tcp_urp_mark_mp);
- tcp->tcp_urp_mark_mp = mp1;
- flags |= TH_SEND_URP_MARK;
tcp->tcp_urp_last_valid = B_TRUE;
tcp->tcp_urp_last = urp + seg_seq;
}
@@ -14070,50 +13810,60 @@ ok:;
if (flags & TH_URG && urp >= 0) {
if (!tcp->tcp_urp_last_valid ||
SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) {
- /*
- * If we haven't generated the signal yet for this
- * urgent pointer value, do it now. Also, send up a
- * zero-length M_DATA indicating whether or not this is
- * the mark. The latter is not needed when a
- * T_EXDATA_IND is sent up. However, if there are
- * allocation failures this code relies on the sender
- * retransmitting and the socket code for determining
- * the mark should not block waiting for the peer to
- * transmit. Thus, for simplicity we always send up the
- * mark indication.
- */
- mp1 = allocb(0, BPRI_MED);
- if (mp1 == NULL) {
- freemsg(mp);
- return;
- }
- if (!TCP_IS_DETACHED(tcp) &&
- !putnextctl1(tcp->tcp_rq, M_PCSIG, SIGURG)) {
- /* Try again on the rexmit. */
- freemsg(mp1);
- freemsg(mp);
- return;
- }
- /*
- * Mark with NOTMARKNEXT for now.
- * The code below will change this to MARKNEXT
- * if we are at the mark.
- *
- * If there are allocation failures (e.g. in dupmsg
- * below) the next time tcp_rput_data sees the urgent
- * segment it will send up the MSG*MARKNEXT message.
- */
- mp1->b_flag |= MSGNOTMARKNEXT;
- freemsg(tcp->tcp_urp_mark_mp);
- tcp->tcp_urp_mark_mp = mp1;
- flags |= TH_SEND_URP_MARK;
+ if (IPCL_IS_NONSTR(connp)) {
+ if (!TCP_IS_DETACHED(tcp)) {
+ (*connp->conn_upcalls->su_signal_oob)
+ (connp->conn_upper_handle, urp);
+ }
+ } else {
+ /*
+ * If we haven't generated the signal yet for
+ * this urgent pointer value, do it now. Also,
+ * send up a zero-length M_DATA indicating
+ * whether or not this is the mark. The latter
+ * is not needed when a T_EXDATA_IND is sent up.
+ * However, if there are allocation failures
+ * this code relies on the sender retransmitting
+ * and the socket code for determining the mark
+ * should not block waiting for the peer to
+ * transmit. Thus, for simplicity we always
+ * send up the mark indication.
+ */
+ mp1 = allocb(0, BPRI_MED);
+ if (mp1 == NULL) {
+ freemsg(mp);
+ return;
+ }
+ if (!TCP_IS_DETACHED(tcp) &&
+ !putnextctl1(tcp->tcp_rq, M_PCSIG,
+ SIGURG)) {
+ /* Try again on the rexmit. */
+ freemsg(mp1);
+ freemsg(mp);
+ return;
+ }
+ /*
+ * Mark with NOTMARKNEXT for now.
+ * The code below will change this to MARKNEXT
+ * if we are at the mark.
+ *
+ * If there are allocation failures (e.g. in
+ * dupmsg below) the next time tcp_rput_data
+ * sees the urgent segment it will send up the
+ * MSGMARKNEXT message.
+ */
+ mp1->b_flag |= MSGNOTMARKNEXT;
+ freemsg(tcp->tcp_urp_mark_mp);
+ tcp->tcp_urp_mark_mp = mp1;
+ flags |= TH_SEND_URP_MARK;
#ifdef DEBUG
- (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
- "tcp_rput: sent M_PCSIG 2 seq %x urp %x "
- "last %x, %s",
- seg_seq, urp, tcp->tcp_urp_last,
- tcp_display(tcp, NULL, DISP_PORT_ONLY));
+ (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
+ "tcp_rput: sent M_PCSIG 2 seq %x urp %x "
+ "last %x, %s",
+ seg_seq, urp, tcp->tcp_urp_last,
+ tcp_display(tcp, NULL, DISP_PORT_ONLY));
#endif /* DEBUG */
+ }
tcp->tcp_urp_last_valid = B_TRUE;
tcp->tcp_urp_last = urp + seg_seq;
} else if (tcp->tcp_urp_mark_mp != NULL) {
@@ -14218,7 +13968,15 @@ ok:;
* This segment contains only the urgent byte. We
* have to allocate the T_exdata_ind, if we can.
*/
- if (!tcp->tcp_urp_mp) {
+ if (IPCL_IS_NONSTR(connp)) {
+ int error;
+
+ (*connp->conn_upcalls->su_recv)
+ (connp->conn_upper_handle, mp, seg_len,
+ MSG_OOB, &error, NULL);
+ mp = NULL;
+ goto update_ack;
+ } else if (!tcp->tcp_urp_mp) {
struct T_exdata_ind *tei;
mp1 = allocb(sizeof (struct T_exdata_ind),
BPRI_MED);
@@ -14299,15 +14057,16 @@ ok:;
seg_len, flags,
tcp_display(tcp, NULL, DISP_PORT_ONLY));
#endif /* DEBUG */
- } else {
- /* Data left until we hit mark */
+ }
#ifdef DEBUG
+ else {
+ /* Data left until we hit mark */
(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
"tcp_rput: URP %d bytes left, %s",
urp - seg_len, tcp_display(tcp, NULL,
DISP_PORT_ONLY));
-#endif /* DEBUG */
}
+#endif /* DEBUG */
}
process_ack:
@@ -15194,6 +14953,7 @@ est:
mp = mp->b_cont;
freeb(mp1);
}
+update_ack:
tcph = tcp->tcp_tcph;
tcp->tcp_rack_cnt++;
{
@@ -15239,6 +14999,9 @@ est:
tcp->tcp_rnxt += seg_len;
U32_TO_ABE32(tcp->tcp_rnxt, tcph->th_ack);
+ if (mp == NULL)
+ goto xmit_check;
+
/* Update SACK list */
if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
tcp_sack_remove(tcp->tcp_sack_list, tcp->tcp_rnxt,
@@ -15297,17 +15060,28 @@ est:
if (!(sodp->sod_state & SOD_ENABLED) ||
(tcp->tcp_kssl_ctx != NULL &&
DB_TYPE(mp) == M_DATA)) {
- mutex_exit(sodp->sod_lockp);
sodp = NULL;
}
+ mutex_exit(sodp->sod_lockp);
}
if (mp->b_datap->db_type != M_DATA ||
(flags & TH_MARKNEXT_NEEDED)) {
- if (sodp != NULL) {
- if (sodp->sod_uioa.uioa_state & UIOA_ENABLED) {
- sodp->sod_uioa.uioa_state &= UIOA_CLR;
- sodp->sod_uioa.uioa_state |= UIOA_FINI;
+ if (IPCL_IS_NONSTR(connp)) {
+ int error;
+
+ if ((*connp->conn_upcalls->su_recv)
+ (connp->conn_upper_handle, mp,
+ seg_len, 0, &error, NULL) <= 0) {
+ if (error == ENOSPC) {
+ tcp->tcp_rwnd -= seg_len;
+ } else if (error == EOPNOTSUPP) {
+ tcp_rcv_enqueue(tcp, mp,
+ seg_len);
+ }
}
+ } else if (sodp != NULL) {
+ mutex_enter(sodp->sod_lockp);
+ SOD_UIOAFINI(sodp);
if (!SOD_QEMPTY(sodp) &&
(sodp->sod_state & SOD_WAKE_NOT)) {
flags |= tcp_rcv_sod_wakeup(tcp, sodp);
@@ -15316,7 +15090,7 @@ est:
mutex_exit(sodp->sod_lockp);
}
} else if (tcp->tcp_rcv_list != NULL) {
- flags |= tcp_rcv_drain(tcp->tcp_rq, tcp);
+ flags |= tcp_rcv_drain(tcp);
}
ASSERT(tcp->tcp_rcv_list == NULL ||
tcp->tcp_fused_sigurg);
@@ -15338,23 +15112,44 @@ est:
DTRACE_PROBE1(kssl_mblk__ksslinput_data1,
mblk_t *, mp);
tcp_kssl_input(tcp, mp);
- } else {
+ } else if (!IPCL_IS_NONSTR(connp)) {
+ /* Already handled non-STREAMS case. */
putnext(tcp->tcp_rq, mp);
if (!canputnext(tcp->tcp_rq))
tcp->tcp_rwnd -= seg_len;
}
} else if ((tcp->tcp_kssl_ctx != NULL) &&
(DB_TYPE(mp) == M_DATA)) {
- /* Do SSL processing first */
- DTRACE_PROBE1(kssl_mblk__ksslinput_data2,
- mblk_t *, mp);
+ /* Does this need SSL processing first? */
+ DTRACE_PROBE1(kssl_mblk__ksslinput_data2, mblk_t *, mp);
tcp_kssl_input(tcp, mp);
+ } else if (IPCL_IS_NONSTR(connp)) {
+ /* Non-STREAMS socket */
+ boolean_t push = flags & (TH_PUSH|TH_FIN);
+ int error;
+
+ if ((*connp->conn_upcalls->su_recv)(
+ connp->conn_upper_handle,
+ mp, seg_len, 0, &error, &push) <= 0) {
+ if (error == ENOSPC) {
+ tcp->tcp_rwnd -= seg_len;
+ } else if (error == EOPNOTSUPP) {
+ tcp_rcv_enqueue(tcp, mp, seg_len);
+ }
+ } else if (push) {
+ /*
+ * PUSH bit set and sockfs is not
+ * flow controlled
+ */
+ flags |= tcp_rwnd_reopen(tcp);
+ }
} else if (sodp != NULL) {
/*
* Sodirect so all mblk_t's are queued on the
* socket directly, check for wakeup of blocked
* reader (if any), and last if flow-controled.
*/
+ mutex_enter(sodp->sod_lockp);
flags |= tcp_rcv_sod_enqueue(tcp, sodp, mp, seg_len);
if ((sodp->sod_state & SOD_WAKE_NEED) ||
(flags & (TH_PUSH|TH_FIN))) {
@@ -15368,7 +15163,7 @@ est:
mutex_exit(sodp->sod_lockp);
}
} else if ((flags & (TH_PUSH|TH_FIN)) ||
- tcp->tcp_rcv_cnt + seg_len >= tcp->tcp_rq->q_hiwat >> 3) {
+ tcp->tcp_rcv_cnt + seg_len >= tcp->tcp_recv_hiwater >> 3) {
if (tcp->tcp_rcv_list != NULL) {
/*
* Enqueue the new segment first and then
@@ -15379,12 +15174,12 @@ est:
* This way can remove the else part later
* on.
*
- * We don't this to avoid one more call to
+ * We don't do this to avoid one more call to
* canputnext() as tcp_rcv_drain() needs to
* call canputnext().
*/
tcp_rcv_enqueue(tcp, mp, seg_len);
- flags |= tcp_rcv_drain(tcp->tcp_rq, tcp);
+ flags |= tcp_rcv_drain(tcp);
} else {
putnext(tcp->tcp_rq, mp);
if (!canputnext(tcp->tcp_rq))
@@ -15394,6 +15189,8 @@ est:
/*
* Enqueue all packets when processing an mblk
* from the co queue and also enqueue normal packets.
+ * For packets which belong to SSL stream do SSL
+ * processing first.
*/
tcp_rcv_enqueue(tcp, mp, seg_len);
}
@@ -15409,7 +15206,8 @@ est:
* such that the Q is empty now even though data was added
* above.
*/
- if (((sodp != NULL && !SOD_QEMPTY(sodp) &&
+ if (!IPCL_IS_NONSTR(connp) &&
+ ((sodp != NULL && !SOD_QEMPTY(sodp) &&
(sodp->sod_state & SOD_WAKE_NOT)) ||
(sodp == NULL && tcp->tcp_rcv_list != NULL)) &&
tcp->tcp_push_tid == 0) {
@@ -15495,6 +15293,7 @@ xmit_check:
ack_check:
if (flags & TH_SEND_URP_MARK) {
ASSERT(tcp->tcp_urp_mark_mp);
+ ASSERT(!IPCL_IS_NONSTR(connp));
/*
* Send up any queued data and then send the mark message
*/
@@ -15514,7 +15313,7 @@ ack_check:
flags |= tcp_rcv_sod_wakeup(tcp, sodp);
/* sod_wakeup() does the mutex_exit() */
} else if (tcp->tcp_rcv_list != NULL) {
- flags |= tcp_rcv_drain(tcp->tcp_rq, tcp);
+ flags |= tcp_rcv_drain(tcp);
ASSERT(tcp->tcp_rcv_list == NULL ||
tcp->tcp_fused_sigurg);
@@ -15568,6 +15367,14 @@ ack_check:
ASSERT(tcp->tcp_listener == NULL);
+ if (IPCL_IS_NONSTR(connp)) {
+ ASSERT(tcp->tcp_ordrel_mp == NULL);
+ tcp->tcp_ordrel_done = B_TRUE;
+ (*connp->conn_upcalls->su_opctl)
+ (connp->conn_upper_handle, SOCK_OPCTL_SHUT_RECV, 0);
+ goto done;
+ }
+
SOD_PTR_ENTER(tcp, sodp);
if (sodp != NULL) {
if (sodp->sod_uioa.uioa_state & UIOA_ENABLED) {
@@ -15588,7 +15395,7 @@ ack_check:
/*
* Push any mblk(s) enqueued from co processing.
*/
- flags |= tcp_rcv_drain(tcp->tcp_rq, tcp);
+ flags |= tcp_rcv_drain(tcp);
ASSERT(tcp->tcp_rcv_list == NULL ||
tcp->tcp_fused_sigurg);
@@ -15934,7 +15741,7 @@ tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp)
* thus we clear out all addresses and ports.
*/
static void
-tcp_bind_failed(tcp_t *tcp, mblk_t *mp, int error)
+tcp_tpi_bind_failed(tcp_t *tcp, mblk_t *mp, int error)
{
queue_t *q = tcp->tcp_rq;
tcph_t *tcph;
@@ -15980,7 +15787,7 @@ tcp_bind_failed(tcp_t *tcp, mblk_t *mp, int error)
tea->ERROR_prim = T_CONN_REQ;
break;
default:
- panic("tcp_bind_failed: unexpected TPI type");
+ panic("tcp_tpi_bind_failed: unexpected TPI type");
/*NOTREACHED*/
}
@@ -16015,17 +15822,9 @@ tcp_bind_failed(tcp_t *tcp, mblk_t *mp, int error)
void
tcp_rput_other(tcp_t *tcp, mblk_t *mp)
{
- mblk_t *mp1;
uchar_t *rptr = mp->b_rptr;
queue_t *q = tcp->tcp_rq;
struct T_error_ack *tea;
- uint32_t mss;
- mblk_t *syn_mp;
- mblk_t *mdti;
- mblk_t *lsoi;
- int retval;
- mblk_t *ire_mp;
- tcp_stack_t *tcps = tcp->tcp_tcps;
switch (mp->b_datap->db_type) {
case M_PROTO:
@@ -16037,190 +15836,11 @@ tcp_rput_other(tcp_t *tcp, mblk_t *mp)
switch (tea->PRIM_type) {
case T_BIND_ACK:
/*
- * Adapt Multidata information, if any. The
- * following tcp_mdt_update routine will free
- * the message.
- */
- if ((mdti = tcp_mdt_info_mp(mp)) != NULL) {
- tcp_mdt_update(tcp, &((ip_mdt_info_t *)mdti->
- b_rptr)->mdt_capab, B_TRUE);
- freemsg(mdti);
- }
-
- /*
- * Check to update LSO information with tcp, and
- * tcp_lso_update routine will free the message.
- */
- if ((lsoi = tcp_lso_info_mp(mp)) != NULL) {
- tcp_lso_update(tcp, &((ip_lso_info_t *)lsoi->
- b_rptr)->lso_capab);
- freemsg(lsoi);
- }
-
- /* Get the IRE, if we had requested for it */
- ire_mp = tcp_ire_mp(mp);
-
- if (tcp->tcp_hard_binding) {
- tcp->tcp_hard_binding = B_FALSE;
- tcp->tcp_hard_bound = B_TRUE;
- CL_INET_CONNECT(tcp);
- } else {
- if (ire_mp != NULL)
- freeb(ire_mp);
- goto after_syn_sent;
- }
-
- retval = tcp_adapt_ire(tcp, ire_mp);
- if (ire_mp != NULL)
- freeb(ire_mp);
- if (retval == 0) {
- tcp_bind_failed(tcp, mp,
- (int)((tcp->tcp_state >= TCPS_SYN_SENT) ?
- ENETUNREACH : EADDRNOTAVAIL));
- return;
- }
- /*
- * Don't let an endpoint connect to itself.
- * Also checked in tcp_connect() but that
- * check can't handle the case when the
- * local IP address is INADDR_ANY.
- */
- if (tcp->tcp_ipversion == IPV4_VERSION) {
- if ((tcp->tcp_ipha->ipha_dst ==
- tcp->tcp_ipha->ipha_src) &&
- (BE16_EQL(tcp->tcp_tcph->th_lport,
- tcp->tcp_tcph->th_fport))) {
- tcp_bind_failed(tcp, mp, EADDRNOTAVAIL);
- return;
- }
- } else {
- if (IN6_ARE_ADDR_EQUAL(
- &tcp->tcp_ip6h->ip6_dst,
- &tcp->tcp_ip6h->ip6_src) &&
- (BE16_EQL(tcp->tcp_tcph->th_lport,
- tcp->tcp_tcph->th_fport))) {
- tcp_bind_failed(tcp, mp, EADDRNOTAVAIL);
- return;
- }
- }
- ASSERT(tcp->tcp_state == TCPS_SYN_SENT);
- /*
- * This should not be possible! Just for
- * defensive coding...
- */
- if (tcp->tcp_state != TCPS_SYN_SENT)
- goto after_syn_sent;
-
- if (is_system_labeled() &&
- !tcp_update_label(tcp, CONN_CRED(tcp->tcp_connp))) {
- tcp_bind_failed(tcp, mp, EHOSTUNREACH);
- return;
- }
-
- ASSERT(q == tcp->tcp_rq);
- /*
- * tcp_adapt_ire() does not adjust
- * for TCP/IP header length.
- */
- mss = tcp->tcp_mss - tcp->tcp_hdr_len;
-
- /*
- * Just make sure our rwnd is at
- * least tcp_recv_hiwat_mss * MSS
- * large, and round up to the nearest
- * MSS.
- *
- * We do the round up here because
- * we need to get the interface
- * MTU first before we can do the
- * round up.
- */
- tcp->tcp_rwnd = MAX(MSS_ROUNDUP(tcp->tcp_rwnd, mss),
- tcps->tcps_recv_hiwat_minmss * mss);
- q->q_hiwat = tcp->tcp_rwnd;
- tcp_set_ws_value(tcp);
- U32_TO_ABE16((tcp->tcp_rwnd >> tcp->tcp_rcv_ws),
- tcp->tcp_tcph->th_win);
- if (tcp->tcp_rcv_ws > 0 || tcps->tcps_wscale_always)
- tcp->tcp_snd_ws_ok = B_TRUE;
-
- /*
- * Set tcp_snd_ts_ok to true
- * so that tcp_xmit_mp will
- * include the timestamp
- * option in the SYN segment.
- */
- if (tcps->tcps_tstamp_always ||
- (tcp->tcp_rcv_ws && tcps->tcps_tstamp_if_wscale)) {
- tcp->tcp_snd_ts_ok = B_TRUE;
- }
-
- /*
- * tcp_snd_sack_ok can be set in
- * tcp_adapt_ire() if the sack metric
- * is set. So check it here also.
- */
- if (tcps->tcps_sack_permitted == 2 ||
- tcp->tcp_snd_sack_ok) {
- if (tcp->tcp_sack_info == NULL) {
- tcp->tcp_sack_info =
- kmem_cache_alloc(
- tcp_sack_info_cache,
- KM_SLEEP);
- }
- tcp->tcp_snd_sack_ok = B_TRUE;
- }
-
- /*
- * Should we use ECN? Note that the current
- * default value (SunOS 5.9) of tcp_ecn_permitted
- * is 1. The reason for doing this is that there
- * are equipments out there that will drop ECN
- * enabled IP packets. Setting it to 1 avoids
- * compatibility problems.
- */
- if (tcps->tcps_ecn_permitted == 2)
- tcp->tcp_ecn_ok = B_TRUE;
-
- TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
- syn_mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL,
- tcp->tcp_iss, B_FALSE, NULL, B_FALSE);
- if (syn_mp) {
- cred_t *cr;
- pid_t pid;
-
- /*
- * Obtain the credential from the
- * thread calling connect(); the credential
- * lives on in the second mblk which
- * originated from T_CONN_REQ and is echoed
- * with the T_BIND_ACK from ip. If none
- * can be found, default to the creator
- * of the socket.
- */
- if (mp->b_cont == NULL ||
- (cr = DB_CRED(mp->b_cont)) == NULL) {
- cr = tcp->tcp_cred;
- pid = tcp->tcp_cpid;
- } else {
- pid = DB_CPID(mp->b_cont);
- }
- mblk_setcred(syn_mp, cr);
- DB_CPID(syn_mp) = pid;
- tcp_send_data(tcp, tcp->tcp_wq, syn_mp);
- }
- after_syn_sent:
- /*
- * A trailer mblk indicates a waiting client upstream.
- * We complete here the processing begun in
- * either tcp_bind() or tcp_connect() by passing
- * upstream the reply message they supplied.
+ * AF_INET socket should not be here.
*/
- mp1 = mp;
- mp = mp->b_cont;
- freeb(mp1);
- if (mp)
- break;
+ ASSERT(tcp->tcp_family != AF_INET &&
+ tcp->tcp_family != AF_INET6);
+ (void) tcp_post_ip_bind(tcp, mp->b_cont, 0);
return;
case T_ERROR_ACK:
if (tcp->tcp_debug) {
@@ -16233,25 +15853,11 @@ tcp_rput_other(tcp_t *tcp, mblk_t *mp)
switch (tea->ERROR_prim) {
case O_T_BIND_REQ:
case T_BIND_REQ:
- tcp_bind_failed(tcp, mp,
+ ASSERT(tcp->tcp_family != AF_INET);
+ tcp_tpi_bind_failed(tcp, mp,
(int)((tcp->tcp_state >= TCPS_SYN_SENT) ?
ENETUNREACH : EADDRNOTAVAIL));
return;
- case T_UNBIND_REQ:
- tcp->tcp_hard_binding = B_FALSE;
- tcp->tcp_hard_bound = B_FALSE;
- if (mp->b_cont) {
- freemsg(mp->b_cont);
- mp->b_cont = NULL;
- }
- if (tcp->tcp_unbind_pending)
- tcp->tcp_unbind_pending = 0;
- else {
- /* From tcp_ip_unbind() - free */
- freemsg(mp);
- return;
- }
- break;
case T_SVR4_OPTMGMT_REQ:
if (tcp->tcp_drop_opt_ack_cnt > 0) {
/* T_OPTMGMT_REQ generated by TCP */
@@ -16285,6 +15891,7 @@ tcp_rput_other(tcp_t *tcp, mblk_t *mp)
}
break;
default:
+ ASSERT(tea->ERROR_prim != T_UNBIND_REQ);
break;
}
break;
@@ -16302,6 +15909,7 @@ tcp_rput_other(tcp_t *tcp, mblk_t *mp)
* bind. Otherwise accept could possibly run and free
* this tcp struct.
*/
+ ASSERT(q != NULL);
putnext(q, mp);
}
@@ -16345,7 +15953,7 @@ tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2)
*/
TCP_FUSE_SYNCSTR_PLUG_DRAIN(tcp);
if (tcp->tcp_rcv_list != NULL)
- (void) tcp_rcv_drain(tcp->tcp_rq, tcp);
+ (void) tcp_rcv_drain(tcp);
if (peer_tcp > tcp) {
mutex_enter(&peer_tcp->tcp_non_sq_lock);
@@ -16487,8 +16095,20 @@ tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd)
* purposes in tcp_fuse_output().
*/
sth_hiwat = tcp_fuse_set_rcv_hiwat(tcp, rwnd);
- if (!tcp_detached)
- (void) mi_set_sth_hiwat(tcp->tcp_rq, sth_hiwat);
+ if (!tcp_detached) {
+ (void) proto_set_rx_hiwat(tcp->tcp_rq, tcp->tcp_connp,
+ sth_hiwat);
+ if (IPCL_IS_NONSTR(tcp->tcp_connp)) {
+ conn_t *connp = tcp->tcp_connp;
+ struct sock_proto_props sopp;
+
+ sopp.sopp_flags = SOCKOPT_RCVTHRESH;
+ sopp.sopp_rcvthresh = sth_hiwat >> 3;
+
+ (*connp->conn_upcalls->su_set_proto_props)
+ (connp->conn_upper_handle, &sopp);
+ }
+ }
/*
* In the fusion case, the maxpsz stream head value of
@@ -16500,10 +16120,11 @@ tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd)
return (rwnd);
}
- if (tcp_detached)
+ if (tcp_detached) {
old_max_rwnd = tcp->tcp_rwnd;
- else
- old_max_rwnd = tcp->tcp_rq->q_hiwat;
+ } else {
+ old_max_rwnd = tcp->tcp_recv_hiwater;
+ }
/*
* Insist on a receive window that is at least
@@ -16570,17 +16191,20 @@ tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd)
if (tcp_detached)
return (rwnd);
/*
- * We set the maximum receive window into rq->q_hiwat.
+ * We set the maximum receive window into rq->q_hiwat if it is
+ * a STREAMS socket.
* This is not actually used for flow control.
*/
- tcp->tcp_rq->q_hiwat = rwnd;
+ if (!IPCL_IS_NONSTR(tcp->tcp_connp))
+ tcp->tcp_rq->q_hiwat = rwnd;
+ tcp->tcp_recv_hiwater = rwnd;
/*
- * Set the Stream head high water mark. This doesn't have to be
+ * Set the STREAM head high water mark. This doesn't have to be
* here, since we are simply using default values, but we would
* prefer to choose these values algorithmically, with a likely
* relationship to rwnd.
*/
- (void) mi_set_sth_hiwat(tcp->tcp_rq,
+ (void) proto_set_rx_hiwat(tcp->tcp_rq, tcp->tcp_connp,
MAX(rwnd, tcps->tcps_sth_rcv_hiwat));
return (rwnd);
}
@@ -16939,7 +16563,7 @@ tcp_snmp_state(tcp_t *tcp)
static char tcp_report_header[] =
"TCP " MI_COL_HDRPAD_STR
- "zone dest snxt suna "
+ "zone dest snxt suna "
"swnd rnxt rack rwnd rto mss w sw rw t "
"recent [lport,fport] state";
@@ -17127,7 +16751,7 @@ static int
tcp_bind_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
{
tf_t *tbf;
- tcp_t *tcp;
+ tcp_t *tcp, *ltcp;
int i;
zoneid_t zoneid;
tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps;
@@ -17153,15 +16777,18 @@ tcp_bind_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
for (i = 0; i < TCP_BIND_FANOUT_SIZE; i++) {
tbf = &tcps->tcps_bind_fanout[i];
mutex_enter(&tbf->tf_lock);
- for (tcp = tbf->tf_tcp; tcp != NULL;
- tcp = tcp->tcp_bind_hash) {
- if (zoneid != GLOBAL_ZONEID &&
- zoneid != tcp->tcp_connp->conn_zoneid)
- continue;
- CONN_INC_REF(tcp->tcp_connp);
- tcp_report_item(mp->b_cont, tcp, i,
- Q_TO_TCP(q), cr);
- CONN_DEC_REF(tcp->tcp_connp);
+ for (ltcp = tbf->tf_tcp; ltcp != NULL;
+ ltcp = ltcp->tcp_bind_hash) {
+ for (tcp = ltcp; tcp != NULL;
+ tcp = tcp->tcp_bind_hash_port) {
+ if (zoneid != GLOBAL_ZONEID &&
+ zoneid != tcp->tcp_connp->conn_zoneid)
+ continue;
+ CONN_INC_REF(tcp->tcp_connp);
+ tcp_report_item(mp->b_cont, tcp, i,
+ Q_TO_TCP(q), cr);
+ CONN_DEC_REF(tcp->tcp_connp);
+ }
}
mutex_exit(&tbf->tf_lock);
}
@@ -17201,7 +16828,7 @@ tcp_listen_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
(void) mi_mpprintf(mp,
" TCP " MI_COL_HDRPAD_STR
- "zone IP addr port seqnum backlog (q0/q/max)");
+ "zone IP addr port seqnum backlog (q0/q/max)");
ipst = tcps->tcps_netstack->netstack_ip;
@@ -17717,19 +17344,18 @@ tcp_timer(void *arg)
}
-/* tcp_unbind is called by tcp_wput_proto to handle T_UNBIND_REQ messages. */
-static void
-tcp_unbind(tcp_t *tcp, mblk_t *mp)
+static int
+tcp_do_unbind(conn_t *connp)
{
- conn_t *connp;
+ tcp_t *tcp = connp->conn_tcp;
+ int error = 0;
switch (tcp->tcp_state) {
case TCPS_BOUND:
case TCPS_LISTEN:
break;
default:
- tcp_err_ack(tcp, mp, TOUTSTATE, 0);
- return;
+ return (-TOUTSTATE);
}
/*
@@ -17752,14 +17378,32 @@ tcp_unbind(tcp_t *tcp, mblk_t *mp)
tcp_bind_hash_remove(tcp);
tcp->tcp_state = TCPS_IDLE;
tcp->tcp_mdt = B_FALSE;
- /* Send M_FLUSH according to TPI */
- (void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW);
+
connp = tcp->tcp_connp;
connp->conn_mdt_ok = B_FALSE;
ipcl_hash_remove(connp);
bzero(&connp->conn_ports, sizeof (connp->conn_ports));
- mp = mi_tpi_ok_ack_alloc(mp);
- putnext(tcp->tcp_rq, mp);
+
+ return (error);
+}
+
+/*
+ * tcp_tpi_unbind is called by tcp_wput_proto to handle T_UNBIND_REQ messages.
+ *
+ * The unbind itself is performed by tcp_do_unbind(); this wrapper only turns
+ * its return value into the TPI reply sent upstream on tcp->tcp_rq:
+ *	> 0	treated as an errno, acked as a TSYSERR error
+ *	< 0	negated TPI error code (e.g. -TOUTSTATE), acked as that error
+ *	0	success: M_FLUSH is sent up, followed by the ok ack built
+ *		from mp
+ */
+static void
+tcp_tpi_unbind(tcp_t *tcp, mblk_t *mp)
+{
+	int	error = tcp_do_unbind(tcp->tcp_connp);
+
+	if (error > 0) {
+		tcp_err_ack(tcp, mp, TSYSERR, error);
+	} else if (error < 0) {
+		tcp_err_ack(tcp, mp, -error, 0);
+	} else {
+		/* Send M_FLUSH according to TPI */
+		(void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW);
+
+		/*
+		 * The mi_tpi_*_ack_alloc() routines may return NULL when
+		 * the ack cannot be allocated; never pass a NULL message
+		 * to putnext().
+		 */
+		mp = mi_tpi_ok_ack_alloc(mp);
+		if (mp != NULL)
+			putnext(tcp->tcp_rq, mp);
+	}
}
/*
@@ -18025,9 +17669,9 @@ tcp_output(void *arg, mblk_t *mp, void *arg2)
/* find out how much we can send */
/* BEGIN CSTYLED */
/*
- * un-acked usable
+ * un-acked usable
* |--------------|-----------------|
- * tcp_suna tcp_snxt tcp_suna+tcp_swnd
+ * tcp_suna tcp_snxt tcp_suna+tcp_swnd
*/
/* END CSTYLED */
@@ -18229,10 +17873,6 @@ slow:
tcp_wput_data(tcp, NULL, B_FALSE);
}
-/*
- * The function called through squeue to get behind eager's perimeter to
- * finish the accept processing.
- */
/* ARGSUSED */
void
tcp_accept_finish(void *arg, mblk_t *mp, void *arg2)
@@ -18240,17 +17880,33 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2)
conn_t *connp = (conn_t *)arg;
tcp_t *tcp = connp->conn_tcp;
queue_t *q = tcp->tcp_rq;
- mblk_t *mp1;
- mblk_t *stropt_mp = mp;
- struct stroptions *stropt;
- uint_t thwin;
- tcp_stack_t *tcps = tcp->tcp_tcps;
+ struct tcp_options *tcpopt;
+ tcp_stack_t *tcps = tcp->tcp_tcps;
+
+ /* socket options */
+ uint_t sopp_flags;
+ ssize_t sopp_rxhiwat;
+ ssize_t sopp_maxblk;
+ ushort_t sopp_wroff;
+ ushort_t sopp_tail;
+ ushort_t sopp_copyopt;
+
+ tcpopt = (struct tcp_options *)mp->b_rptr;
/*
* Drop the eager's ref on the listener, that was placed when
* this eager began life in tcp_conn_request.
*/
CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp);
+ if (IPCL_IS_NONSTR(connp)) {
+ /* Safe to free conn_ind message */
+ freemsg(tcp->tcp_conn.tcp_eager_conn_ind);
+ tcp->tcp_conn.tcp_eager_conn_ind = NULL;
+
+ /* The listener tells us which upper handle to use */
+ ASSERT(tcpopt->to_flags & TCPOPT_UPPERHANDLE);
+ connp->conn_upper_handle = tcpopt->to_handle;
+ }
tcp->tcp_detached = B_FALSE;
@@ -18267,37 +17923,47 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2)
*/
ASSERT(tcp->tcp_listener == NULL);
if (tcp->tcp_issocket || tcp->tcp_send_discon_ind) {
- struct T_discon_ind *tdi;
-
- (void) putnextctl1(q, M_FLUSH, FLUSHRW);
- /*
- * Let us reuse the incoming mblk to avoid memory
- * allocation failure problems. We know that the
- * size of the incoming mblk i.e. stroptions is greater
- * than sizeof T_discon_ind. So the reallocb below
- * can't fail.
- */
- freemsg(mp->b_cont);
- mp->b_cont = NULL;
- ASSERT(DB_REF(mp) == 1);
- mp = reallocb(mp, sizeof (struct T_discon_ind),
- B_FALSE);
- ASSERT(mp != NULL);
- DB_TYPE(mp) = M_PROTO;
- ((union T_primitives *)mp->b_rptr)->type = T_DISCON_IND;
- tdi = (struct T_discon_ind *)mp->b_rptr;
- if (tcp->tcp_issocket) {
- tdi->DISCON_reason = ECONNREFUSED;
- tdi->SEQ_number = 0;
+ if (IPCL_IS_NONSTR(connp)) {
+ ASSERT(tcp->tcp_issocket);
+ (*connp->conn_upcalls->su_disconnected)(
+ connp->conn_upper_handle, tcp->tcp_connid,
+ ECONNREFUSED);
+ freemsg(mp);
} else {
- tdi->DISCON_reason = ENOPROTOOPT;
- tdi->SEQ_number =
- tcp->tcp_conn_req_seqnum;
+ struct T_discon_ind *tdi;
+
+ (void) putnextctl1(q, M_FLUSH, FLUSHRW);
+ /*
+ * Let us reuse the incoming mblk to avoid
+ * memory allocation failure problems. We know
+ * that the size of the incoming mblk i.e.
+ * stroptions is greater than sizeof
+ * T_discon_ind. So the reallocb below can't
+ * fail.
+ */
+ freemsg(mp->b_cont);
+ mp->b_cont = NULL;
+ ASSERT(DB_REF(mp) == 1);
+ mp = reallocb(mp, sizeof (struct T_discon_ind),
+ B_FALSE);
+ ASSERT(mp != NULL);
+ DB_TYPE(mp) = M_PROTO;
+ ((union T_primitives *)mp->b_rptr)->type =
+ T_DISCON_IND;
+ tdi = (struct T_discon_ind *)mp->b_rptr;
+ if (tcp->tcp_issocket) {
+ tdi->DISCON_reason = ECONNREFUSED;
+ tdi->SEQ_number = 0;
+ } else {
+ tdi->DISCON_reason = ENOPROTOOPT;
+ tdi->SEQ_number =
+ tcp->tcp_conn_req_seqnum;
+ }
+ mp->b_wptr = mp->b_rptr +
+ sizeof (struct T_discon_ind);
+ putnext(q, mp);
+ return;
}
- mp->b_wptr = mp->b_rptr + sizeof (struct T_discon_ind);
- putnext(q, mp);
- } else {
- freemsg(mp);
}
if (tcp->tcp_hard_binding) {
tcp->tcp_hard_binding = B_FALSE;
@@ -18306,19 +17972,21 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2)
return;
}
- mp1 = stropt_mp->b_cont;
- stropt_mp->b_cont = NULL;
- ASSERT(DB_TYPE(stropt_mp) == M_SETOPTS);
- stropt = (struct stroptions *)stropt_mp->b_rptr;
+ if (tcpopt->to_flags & TCPOPT_BOUNDIF) {
+ int boundif = tcpopt->to_boundif;
+ uint_t len = sizeof (int);
- while (mp1 != NULL) {
- mp = mp1;
- mp1 = mp1->b_cont;
- mp->b_cont = NULL;
- tcp->tcp_drop_opt_ack_cnt++;
- CALL_IP_WPUT(connp, tcp->tcp_wq, mp);
+ (void) tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, IPPROTO_IPV6,
+ IPV6_BOUND_IF, len, (uchar_t *)&boundif, &len,
+ (uchar_t *)&boundif, NULL, tcp->tcp_cred);
+ }
+ if (tcpopt->to_flags & TCPOPT_RECVPKTINFO) {
+ uint_t on = 1;
+ uint_t len = sizeof (uint_t);
+ (void) tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, IPPROTO_IPV6,
+ IPV6_RECVPKTINFO, len, (uchar_t *)&on, &len,
+ (uchar_t *)&on, NULL, tcp->tcp_cred);
}
- mp = NULL;
/*
* For a loopback connection with tcp_direct_sockfs on, note that
@@ -18331,42 +17999,50 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2)
* Set the max window size (tcp_rq->q_hiwat) of the acceptor
* properly. This is the first time we know of the acceptor'
* queue. So we do it here.
+ *
+ * XXX
*/
if (tcp->tcp_rcv_list == NULL) {
/*
* Recv queue is empty, tcp_rwnd should not have changed.
* That means it should be equal to the listener's tcp_rwnd.
*/
- tcp->tcp_rq->q_hiwat = tcp->tcp_rwnd;
+ if (!IPCL_IS_NONSTR(connp))
+ tcp->tcp_rq->q_hiwat = tcp->tcp_rwnd;
+ tcp->tcp_recv_hiwater = tcp->tcp_rwnd;
} else {
#ifdef DEBUG
- uint_t cnt = 0;
+ mblk_t *tmp;
+ mblk_t *mp1;
+ uint_t cnt = 0;
mp1 = tcp->tcp_rcv_list;
- while ((mp = mp1) != NULL) {
- mp1 = mp->b_next;
- cnt += msgdsize(mp);
+ while ((tmp = mp1) != NULL) {
+ mp1 = tmp->b_next;
+ cnt += msgdsize(tmp);
}
ASSERT(cnt != 0 && tcp->tcp_rcv_cnt == cnt);
#endif
/* There is some data, add them back to get the max. */
- tcp->tcp_rq->q_hiwat = tcp->tcp_rwnd + tcp->tcp_rcv_cnt;
+ if (!IPCL_IS_NONSTR(connp))
+ tcp->tcp_rq->q_hiwat = tcp->tcp_rwnd + tcp->tcp_rcv_cnt;
+ tcp->tcp_recv_hiwater = tcp->tcp_rwnd + tcp->tcp_rcv_cnt;
}
/*
* This is the first time we run on the correct
* queue after tcp_accept. So fix all the q parameters
* here.
*/
- stropt->so_flags = SO_HIWAT | SO_MAXBLK | SO_WROFF;
- stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE);
+ sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_MAXBLK | SOCKOPT_WROFF;
+ sopp_maxblk = tcp_maxpsz_set(tcp, B_FALSE);
/*
* Record the stream head's high water mark for this endpoint;
* this is used for flow-control purposes.
*/
- stropt->so_hiwat = tcp->tcp_fused ?
- tcp_fuse_set_rcv_hiwat(tcp, q->q_hiwat) :
- MAX(q->q_hiwat, tcps->tcps_sth_rcv_hiwat);
+ sopp_rxhiwat = tcp->tcp_fused ?
+ tcp_fuse_set_rcv_hiwat(tcp, tcp->tcp_recv_hiwater) :
+ MAX(tcp->tcp_recv_hiwater, tcps->tcps_sth_rcv_hiwat);
/*
* Determine what write offset value to use depending on SACK and
@@ -18382,17 +18058,17 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2)
* since it would reduce the amount of work done by kmem.
* Non-fused tcp loopback case is handled separately below.
*/
- stropt->so_wroff = 0;
+ sopp_wroff = 0;
/*
* Update the peer's transmit parameters according to
* our recently calculated high water mark value.
*/
(void) tcp_maxpsz_set(tcp->tcp_loopback_peer, B_TRUE);
} else if (tcp->tcp_snd_sack_ok) {
- stropt->so_wroff = tcp->tcp_hdr_len + TCPOPT_MAX_SACK_LEN +
+ sopp_wroff = tcp->tcp_hdr_len + TCPOPT_MAX_SACK_LEN +
(tcp->tcp_loopback ? 0 : tcps->tcps_wroff_xtra);
} else {
- stropt->so_wroff = tcp->tcp_hdr_len + (tcp->tcp_loopback ? 0 :
+ sopp_wroff = tcp->tcp_hdr_len + (tcp->tcp_loopback ? 0 :
tcps->tcps_wroff_xtra);
}
@@ -18408,20 +18084,62 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2)
* costs.
*/
if (tcp->tcp_kssl_ctx != NULL) {
- stropt->so_wroff += SSL3_WROFFSET;
+ sopp_wroff += SSL3_WROFFSET;
- stropt->so_flags |= SO_TAIL;
- stropt->so_tail = SSL3_MAX_TAIL_LEN;
+ sopp_flags |= SOCKOPT_TAIL;
+ sopp_tail = SSL3_MAX_TAIL_LEN;
- stropt->so_flags |= SO_COPYOPT;
- stropt->so_copyopt = ZCVMUNSAFE;
+ sopp_flags |= SOCKOPT_ZCOPY;
+ sopp_copyopt = ZCVMUNSAFE;
- stropt->so_maxblk = SSL3_MAX_RECORD_LEN;
+ sopp_maxblk = SSL3_MAX_RECORD_LEN;
}
/* Send the options up */
- putnext(q, stropt_mp);
+ if (IPCL_IS_NONSTR(connp)) {
+ struct sock_proto_props sopp;
+
+ sopp.sopp_flags = sopp_flags;
+ sopp.sopp_wroff = sopp_wroff;
+ sopp.sopp_maxblk = sopp_maxblk;
+ sopp.sopp_rxhiwat = sopp_rxhiwat;
+ if (sopp_flags & SOCKOPT_TAIL) {
+ ASSERT(tcp->tcp_kssl_ctx != NULL);
+ ASSERT(sopp_flags & SOCKOPT_ZCOPY);
+ sopp.sopp_tail = sopp_tail;
+ sopp.sopp_zcopyflag = sopp_copyopt;
+ }
+ (*connp->conn_upcalls->su_set_proto_props)
+ (connp->conn_upper_handle, &sopp);
+ } else {
+		struct stroptions *stropt;
+		mblk_t *stropt_mp;
+
+		/*
+		 * STREAMS endpoint: hand the values collected in the sopp_*
+		 * locals to the stream head in an M_SETOPTS message.
+		 */
+		stropt_mp = allocb(sizeof (struct stroptions), BPRI_HI);
+		if (stropt_mp == NULL) {
+			/*
+			 * NOTE(review): mp here is the tcp_options message
+			 * this function was entered with, not a TPI request;
+			 * confirm tcp_err_ack() is the intended failure path.
+			 */
+			tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
+			return;
+		}
+		DB_TYPE(stropt_mp) = M_SETOPTS;
+		stropt = (struct stroptions *)stropt_mp->b_rptr;
+		stropt_mp->b_wptr += sizeof (struct stroptions);
+
+		/*
+		 * allocb() does not zero the data buffer, so so_flags must
+		 * be assigned rather than OR-ed into; stale bits would
+		 * otherwise select options whose fields were never set.
+		 */
+		stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
+		stropt->so_hiwat = sopp_rxhiwat;
+		stropt->so_wroff = sopp_wroff;
+		stropt->so_maxblk = sopp_maxblk;
+
+		/* Tail and copy options are only set when SOCKOPT_TAIL is. */
+		if (sopp_flags & SOCKOPT_TAIL) {
+			ASSERT(tcp->tcp_kssl_ctx != NULL);
+
+			stropt->so_flags |= SO_TAIL | SO_COPYOPT;
+			stropt->so_tail = sopp_tail;
+			stropt->so_copyopt = sopp_copyopt;
+		}
+
+		/* Send the options up */
+		putnext(q, stropt_mp);
+ }
+ freemsg(mp);
/*
* Pass up any data and/or a fin that has been received.
*
@@ -18432,43 +18150,77 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2)
* code, the rwnd may never open up again!
*/
if (tcp->tcp_rcv_list != NULL) {
- /* We drain directly in case of fused tcp loopback */
- sodirect_t *sodp;
-
- if (!tcp->tcp_fused && canputnext(q)) {
- tcp->tcp_rwnd = q->q_hiwat;
- thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win))
- << tcp->tcp_rcv_ws;
- thwin -= tcp->tcp_rnxt - tcp->tcp_rack;
- if (tcp->tcp_state >= TCPS_ESTABLISHED &&
- (q->q_hiwat - thwin >= tcp->tcp_mss)) {
- tcp_xmit_ctl(NULL,
- tcp, (tcp->tcp_swnd == 0) ?
- tcp->tcp_suna : tcp->tcp_snxt,
- tcp->tcp_rnxt, TH_ACK);
- BUMP_MIB(&tcps->tcps_mib, tcpOutWinUpdate);
+ if (IPCL_IS_NONSTR(connp)) {
+ mblk_t *mp;
+ int space_left;
+ int error;
+ boolean_t push = B_TRUE;
+
+ if (!tcp->tcp_fused && (*connp->conn_upcalls->su_recv)
+ (connp->conn_upper_handle, NULL, 0, 0, &error,
+ &push) >= 0) {
+ tcp->tcp_rwnd = tcp->tcp_recv_hiwater;
+ if (tcp->tcp_state >= TCPS_ESTABLISHED &&
+ tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
+ tcp_xmit_ctl(NULL,
+ tcp, (tcp->tcp_swnd == 0) ?
+ tcp->tcp_suna : tcp->tcp_snxt,
+ tcp->tcp_rnxt, TH_ACK);
+ }
}
-
- }
-
- SOD_PTR_ENTER(tcp, sodp);
- if (sodp != NULL) {
- /* Sodirect, move from rcv_list */
- ASSERT(!tcp->tcp_fused);
while ((mp = tcp->tcp_rcv_list) != NULL) {
+ push = B_TRUE;
tcp->tcp_rcv_list = mp->b_next;
mp->b_next = NULL;
- (void) tcp_rcv_sod_enqueue(tcp, sodp, mp,
- msgdsize(mp));
+ space_left = (*connp->conn_upcalls->su_recv)
+ (connp->conn_upper_handle, mp, msgdsize(mp),
+ 0, &error, &push);
+ if (space_left < 0) {
+ /*
+ * At this point the eager is not
+ * visible to anyone, so fallback
+ * can not happen.
+ */
+ ASSERT(error != EOPNOTSUPP);
+ }
}
tcp->tcp_rcv_last_head = NULL;
tcp->tcp_rcv_last_tail = NULL;
tcp->tcp_rcv_cnt = 0;
- (void) tcp_rcv_sod_wakeup(tcp, sodp);
- /* sod_wakeup() did the mutex_exit() */
} else {
- /* Not sodirect, drain */
- (void) tcp_rcv_drain(q, tcp);
+ /* We drain directly in case of fused tcp loopback */
+ sodirect_t *sodp;
+
+ if (!tcp->tcp_fused && canputnext(q)) {
+ tcp->tcp_rwnd = q->q_hiwat;
+ if (tcp->tcp_state >= TCPS_ESTABLISHED &&
+ tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
+ tcp_xmit_ctl(NULL,
+ tcp, (tcp->tcp_swnd == 0) ?
+ tcp->tcp_suna : tcp->tcp_snxt,
+ tcp->tcp_rnxt, TH_ACK);
+ }
+ }
+
+ SOD_PTR_ENTER(tcp, sodp);
+ if (sodp != NULL) {
+ /* Sodirect, move from rcv_list */
+ ASSERT(!tcp->tcp_fused);
+ while ((mp = tcp->tcp_rcv_list) != NULL) {
+ tcp->tcp_rcv_list = mp->b_next;
+ mp->b_next = NULL;
+ (void) tcp_rcv_sod_enqueue(tcp, sodp,
+ mp, msgdsize(mp));
+ }
+ tcp->tcp_rcv_last_head = NULL;
+ tcp->tcp_rcv_last_tail = NULL;
+ tcp->tcp_rcv_cnt = 0;
+ (void) tcp_rcv_sod_wakeup(tcp, sodp);
+ /* sod_wakeup() did the mutex_exit() */
+ } else {
+ /* Not sodirect, drain */
+ (void) tcp_rcv_drain(tcp);
+ }
}
/*
@@ -18502,18 +18254,27 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2)
}
ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) {
- mp = tcp->tcp_ordrel_mp;
- tcp->tcp_ordrel_mp = NULL;
tcp->tcp_ordrel_done = B_TRUE;
- putnext(q, mp);
+ if (IPCL_IS_NONSTR(connp)) {
+ ASSERT(tcp->tcp_ordrel_mp == NULL);
+ (*connp->conn_upcalls->su_opctl)(
+ connp->conn_upper_handle,
+ SOCK_OPCTL_SHUT_RECV, 0);
+ } else {
+ mp = tcp->tcp_ordrel_mp;
+ tcp->tcp_ordrel_mp = NULL;
+ putnext(q, mp);
+ }
}
if (tcp->tcp_hard_binding) {
tcp->tcp_hard_binding = B_FALSE;
tcp->tcp_hard_bound = B_TRUE;
}
- /* We can enable synchronous streams now */
- if (tcp->tcp_fused) {
+ /* We can enable synchronous streams for STREAMS tcp endpoint now */
+ if (tcp->tcp_fused && !IPCL_IS_NONSTR(connp) &&
+ tcp->tcp_loopback_peer != NULL &&
+ !IPCL_IS_NONSTR(tcp->tcp_loopback_peer->tcp_connp)) {
tcp_fuse_syncstr_enable_pair(tcp);
}
@@ -18547,6 +18308,8 @@ tcp_send_pending(void *arg, mblk_t *mp, void *arg2)
{
conn_t *connp = (conn_t *)arg;
tcp_t *listener = connp->conn_tcp;
+ struct T_conn_ind *conn_ind;
+ tcp_t *tcp;
if (listener->tcp_state == TCPS_CLOSED ||
TCP_IS_DETACHED(listener)) {
@@ -18554,8 +18317,6 @@ tcp_send_pending(void *arg, mblk_t *mp, void *arg2)
* If listener has closed, it would have caused a
* a cleanup/blowoff to happen for the eager.
*/
- tcp_t *tcp;
- struct T_conn_ind *conn_ind;
conn_ind = (struct T_conn_ind *)mp->b_rptr;
bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp,
@@ -18571,7 +18332,218 @@ tcp_send_pending(void *arg, mblk_t *mp, void *arg2)
freemsg(mp);
return;
}
- putnext(listener->tcp_rq, mp);
+ if (IPCL_IS_NONSTR(connp)) {
+ conn_ind = (struct T_conn_ind *)mp->b_rptr;
+ bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp,
+ conn_ind->OPT_length);
+
+ if ((*connp->conn_upcalls->su_newconn)
+ (connp->conn_upper_handle,
+ (sock_lower_handle_t)tcp->tcp_connp,
+ &sock_tcp_downcalls, DB_CRED(mp), DB_CPID(mp),
+ &tcp->tcp_connp->conn_upcalls) != NULL) {
+ /* Keep the message around in case of fallback */
+ tcp->tcp_conn.tcp_eager_conn_ind = mp;
+ } else {
+ freemsg(mp);
+ }
+ } else {
+ putnext(listener->tcp_rq, mp);
+ }
+}
+
+/*
+ * Accept processing shared by tcp_accept() and tcp_tpi_accept().
+ *
+ * lconnp is the listener's conn and econnp the conn of the eager (the new
+ * connection being accepted). sock_handle is the upper handle for the
+ * eager; it must be non-NULL when econnp is a non-STREAMS conn, and the
+ * TPI path passes NULL. cr is not used here (hence ARGSUSED).
+ *
+ * The function builds an M_SETOPTS mblk holding a struct tcp_options that
+ * records what the eager inherits from the listener, moves at most one
+ * deferred eager from the listener's q0 to q and schedules its saved
+ * connection indication via tcp_send_pending(), unlinks the accepted eager
+ * from the listener, and finally queues tcp_accept_finish() on the eager's
+ * squeue with the options mblk.
+ *
+ * Returns 0 on success, or -TPROTO if the options mblk cannot be allocated.
+ */
+/* ARGSUSED */
+static int
+tcp_accept_common(conn_t *lconnp, conn_t *econnp,
+ sock_upper_handle_t sock_handle, cred_t *cr)
+{
+ tcp_t *listener, *eager;
+ mblk_t *opt_mp;
+ struct tcp_options *tcpopt;
+
+ listener = lconnp->conn_tcp;
+ ASSERT(listener->tcp_state == TCPS_LISTEN);
+ eager = econnp->conn_tcp;
+ ASSERT(eager->tcp_listener != NULL);
+
+ ASSERT(eager->tcp_rq != NULL);
+
+ /* If tcp_fused and sodirect enabled disable it */
+ if (eager->tcp_fused && eager->tcp_sodirect != NULL) {
+ /* Fused, disable sodirect */
+ mutex_enter(eager->tcp_sodirect->sod_lockp);
+ SOD_DISABLE(eager->tcp_sodirect);
+ mutex_exit(eager->tcp_sodirect->sod_lockp);
+ eager->tcp_sodirect = NULL;
+ }
+
+ /* Message that carries the tcp_options to tcp_accept_finish(). */
+ opt_mp = allocb(sizeof (struct tcp_options), BPRI_HI);
+ if (opt_mp == NULL) {
+ return (-TPROTO);
+ }
+ bzero((char *)opt_mp->b_rptr, sizeof (struct tcp_options));
+ eager->tcp_issocket = B_TRUE;
+
+ /* The eager takes over the listener's upcalls and zone settings. */
+ econnp->conn_upcalls = lconnp->conn_upcalls;
+ econnp->conn_zoneid = listener->tcp_connp->conn_zoneid;
+ econnp->conn_allzones = listener->tcp_connp->conn_allzones;
+ ASSERT(econnp->conn_netstack ==
+ listener->tcp_connp->conn_netstack);
+ ASSERT(eager->tcp_tcps == listener->tcp_tcps);
+
+ /* Put the ref for IP */
+ CONN_INC_REF(econnp);
+
+ /*
+ * We should have minimum of 3 references on the conn
+ * at this point. One each for TCP and IP and one for
+ * the T_conn_ind that was sent up when the 3-way handshake
+ * completed. In the normal case we would also have another
+ * reference (making a total of 4) for the conn being in the
+ * classifier hash list. However the eager could have received
+ * an RST subsequently and tcp_closei_local could have removed
+ * the eager from the classifier hash list, hence we can't
+ * assert that reference.
+ */
+ ASSERT(econnp->conn_ref >= 3);
+
+ opt_mp->b_datap->db_type = M_SETOPTS;
+ opt_mp->b_wptr += sizeof (struct tcp_options);
+
+ /*
+ * Prepare for inheriting IPV6_BOUND_IF and IPV6_RECVPKTINFO
+ * from listener to acceptor. In case of non-STREAMS sockets,
+ * we also need to pass the upper handle along.
+ */
+ tcpopt = (struct tcp_options *)opt_mp->b_rptr;
+ tcpopt->to_flags = 0;
+
+ if (IPCL_IS_NONSTR(econnp)) {
+ ASSERT(sock_handle != NULL);
+ tcpopt->to_flags |= TCPOPT_UPPERHANDLE;
+ tcpopt->to_handle = sock_handle;
+ }
+ if (listener->tcp_bound_if != 0) {
+ tcpopt->to_flags |= TCPOPT_BOUNDIF;
+ tcpopt->to_boundif = listener->tcp_bound_if;
+ }
+ if (listener->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) {
+ tcpopt->to_flags |= TCPOPT_RECVPKTINFO;
+ }
+
+ mutex_enter(&listener->tcp_eager_lock);
+ if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {
+
+ tcp_t *tail;
+ tcp_t *tcp;
+ mblk_t *mp1;
+
+ tcp = listener->tcp_eager_prev_q0;
+ /*
+ * listener->tcp_eager_prev_q0 points to the TAIL of the
+ * deferred T_conn_ind queue. We need to get to the head
+ * of the queue in order to send up T_conn_ind in the same
+ * order in which the 3WHS completed.
+ */
+ while (tcp != listener) {
+ if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0 &&
+ !tcp->tcp_kssl_pending)
+ break;
+ else
+ tcp = tcp->tcp_eager_prev_q0;
+ }
+ /* None of the pending eagers can be sent up now */
+ if (tcp == listener)
+ goto no_more_eagers;
+
+ /* Take over the saved connection indication of that eager. */
+ mp1 = tcp->tcp_conn.tcp_eager_conn_ind;
+ tcp->tcp_conn.tcp_eager_conn_ind = NULL;
+ /* Move from q0 to q */
+ ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
+ listener->tcp_conn_req_cnt_q0--;
+ listener->tcp_conn_req_cnt_q++;
+ tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
+ tcp->tcp_eager_prev_q0;
+ tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
+ tcp->tcp_eager_next_q0;
+ tcp->tcp_eager_prev_q0 = NULL;
+ tcp->tcp_eager_next_q0 = NULL;
+ tcp->tcp_conn_def_q0 = B_FALSE;
+
+ /* Make sure the tcp isn't in the list of droppables */
+ ASSERT(tcp->tcp_eager_next_drop_q0 == NULL &&
+ tcp->tcp_eager_prev_drop_q0 == NULL);
+
+ /*
+ * Insert at end of the queue because sockfs sends
+ * down T_CONN_RES in chronological order. Leaving
+ * the older conn indications at front of the queue
+ * helps reducing search time.
+ */
+ tail = listener->tcp_eager_last_q;
+ if (tail != NULL) {
+ tail->tcp_eager_next_q = tcp;
+ } else {
+ listener->tcp_eager_next_q = tcp;
+ }
+ listener->tcp_eager_last_q = tcp;
+ tcp->tcp_eager_next_q = NULL;
+
+ /* Need to get inside the listener perimeter */
+ CONN_INC_REF(listener->tcp_connp);
+ SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, mp1,
+ tcp_send_pending, listener->tcp_connp, SQ_FILL,
+ SQTAG_TCP_SEND_PENDING);
+ }
+no_more_eagers:
+ tcp_eager_unlink(eager);
+ mutex_exit(&listener->tcp_eager_lock);
+
+ /*
+ * At this point, the eager is detached from the listener
+ * but we still have an extra ref on the eager (apart from the
+ * usual tcp references). The ref was placed in tcp_rput_data
+ * before sending the conn_ind in tcp_send_conn_ind.
+ * The ref will be dropped in tcp_accept_finish().
+ */
+ SQUEUE_ENTER_ONE(econnp->conn_sqp, opt_mp, tcp_accept_finish,
+ econnp, SQ_NODRAIN, SQTAG_TCP_ACCEPT_FINISH_Q0);
+ return (0);
+}
+
+/*
+ * Accept entry point for non-STREAMS connections.
+ *
+ * lproto_handle and eproto_handle are the conns of the listener and of the
+ * eager being accepted; sock_handle is the upper handle that will own the
+ * eager. An IP helper stream is created for the eager, its read/write
+ * queues and sodirect pointer are recorded on the tcp, and the remaining
+ * work is delegated to tcp_accept_common().
+ *
+ * Returns EPROTO if the helper stream cannot be created, otherwise the
+ * result of tcp_accept_common().
+ */
+int
+tcp_accept(sock_lower_handle_t lproto_handle,
+    sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
+    cred_t *cr)
+{
+	conn_t	*listen_connp = (conn_t *)lproto_handle;
+	conn_t	*new_connp = (conn_t *)eproto_handle;
+	tcp_t	*new_tcp = new_connp->conn_tcp;
+	int	rc;
+
+	ASSERT(listen_connp->conn_tcp->tcp_state == TCPS_LISTEN);
+	ASSERT(new_tcp->tcp_listener != NULL);
+	ASSERT(IPCL_IS_NONSTR(new_connp));
+
+	/* A non-TPI TCP connection needs its own helper stream. */
+	rc = ip_create_helper_stream(new_connp,
+	    new_tcp->tcp_tcps->tcps_ldi_ident);
+	if (rc != 0) {
+		ip1dbg(("tcp_accept: create of IP helper stream"
+		    " failed\n"));
+		return (EPROTO);
+	}
+
+	/* The tcp uses the queues the conn now carries. */
+	new_tcp->tcp_rq = new_connp->conn_rq;
+	new_tcp->tcp_wq = new_connp->conn_wq;
+	ASSERT(new_tcp->tcp_rq != NULL);
+
+	new_tcp->tcp_sodirect = SOD_SOTOSODP(sock_handle);
+
+	return (tcp_accept_common(listen_connp, new_connp, sock_handle, cr));
+}
@@ -18581,7 +18553,7 @@ tcp_send_pending(void *arg, mblk_t *mp, void *arg2)
* Read the block comment on top of tcp_conn_request().
*/
void
-tcp_wput_accept(queue_t *q, mblk_t *mp)
+tcp_tpi_accept(queue_t *q, mblk_t *mp)
{
queue_t *rq = RD(q);
struct T_conn_res *conn_res;
@@ -18589,7 +18561,6 @@ tcp_wput_accept(queue_t *q, mblk_t *mp)
tcp_t *listener;
struct T_ok_ack *ok;
t_scalar_t PRIM_type;
- mblk_t *opt_mp;
conn_t *econnp;
ASSERT(DB_TYPE(mp) == M_PROTO);
@@ -18615,14 +18586,6 @@ tcp_wput_accept(queue_t *q, mblk_t *mp)
* correct function (tcpclose_accept) in case allocb
* fails.
*/
- opt_mp = allocb(sizeof (struct stroptions), BPRI_HI);
- if (opt_mp == NULL) {
- mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0);
- if (mp != NULL)
- putnext(rq, mp);
- return;
- }
-
bcopy(mp->b_rptr + conn_res->OPT_offset,
&eager, conn_res->OPT_length);
PRIM_type = conn_res->PRIM_type;
@@ -18641,45 +18604,20 @@ tcp_wput_accept(queue_t *q, mblk_t *mp)
q->q_ptr = econnp;
q->q_qinfo = &tcp_winit;
listener = eager->tcp_listener;
- eager->tcp_issocket = B_TRUE;
/*
* TCP is _D_SODIRECT and sockfs is directly above so
* save shared sodirect_t pointer (if any).
- *
- * If tcp_fused and sodirect enabled disable it.
*/
eager->tcp_sodirect = SOD_QTOSODP(eager->tcp_rq);
- if (eager->tcp_fused && eager->tcp_sodirect != NULL) {
- /* Fused, disable sodirect */
- mutex_enter(eager->tcp_sodirect->sod_lockp);
- SOD_DISABLE(eager->tcp_sodirect);
- mutex_exit(eager->tcp_sodirect->sod_lockp);
- eager->tcp_sodirect = NULL;
+ if (tcp_accept_common(listener->tcp_connp,
+ econnp, NULL, CRED()) < 0) {
+ mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0);
+ if (mp != NULL)
+ putnext(rq, mp);
+ return;
}
- econnp->conn_zoneid = listener->tcp_connp->conn_zoneid;
- econnp->conn_allzones = listener->tcp_connp->conn_allzones;
- ASSERT(econnp->conn_netstack ==
- listener->tcp_connp->conn_netstack);
- ASSERT(eager->tcp_tcps == listener->tcp_tcps);
-
- /* Put the ref for IP */
- CONN_INC_REF(econnp);
-
- /*
- * We should have minimum of 3 references on the conn
- * at this point. One each for TCP and IP and one for
- * the T_conn_ind that was sent up when the 3-way handshake
- * completed. In the normal case we would also have another
- * reference (making a total of 4) for the conn being in the
- * classifier hash list. However the eager could have received
- * an RST subsequently and tcp_closei_local could have removed
- * the eager from the classifier hash list, hence we can't
- * assert that reference.
- */
- ASSERT(econnp->conn_ref >= 3);
-
/*
* Send the new local address also up to sockfs. There
* should already be enough space in the mp that came
@@ -18721,115 +18659,6 @@ tcp_wput_accept(queue_t *q, mblk_t *mp)
}
putnext(rq, mp);
-
- opt_mp->b_datap->db_type = M_SETOPTS;
- opt_mp->b_wptr += sizeof (struct stroptions);
-
- /*
- * Prepare for inheriting IPV6_BOUND_IF and IPV6_RECVPKTINFO
- * from listener to acceptor. The message is chained on the
- * bind_mp which tcp_rput_other will send down to IP.
- */
- if (listener->tcp_bound_if != 0) {
- /* allocate optmgmt req */
- mp = tcp_setsockopt_mp(IPPROTO_IPV6,
- IPV6_BOUND_IF, (char *)&listener->tcp_bound_if,
- sizeof (int));
- if (mp != NULL)
- linkb(opt_mp, mp);
- }
- if (listener->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) {
- uint_t on = 1;
-
- /* allocate optmgmt req */
- mp = tcp_setsockopt_mp(IPPROTO_IPV6,
- IPV6_RECVPKTINFO, (char *)&on, sizeof (on));
- if (mp != NULL)
- linkb(opt_mp, mp);
- }
-
-
- mutex_enter(&listener->tcp_eager_lock);
-
- if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {
-
- tcp_t *tail;
- tcp_t *tcp;
- mblk_t *mp1;
-
- tcp = listener->tcp_eager_prev_q0;
- /*
- * listener->tcp_eager_prev_q0 points to the TAIL of the
- * deferred T_conn_ind queue. We need to get to the head
- * of the queue in order to send up T_conn_ind the same
- * order as how the 3WHS is completed.
- */
- while (tcp != listener) {
- if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0 &&
- !tcp->tcp_kssl_pending)
- break;
- else
- tcp = tcp->tcp_eager_prev_q0;
- }
- /* None of the pending eagers can be sent up now */
- if (tcp == listener)
- goto no_more_eagers;
-
- mp1 = tcp->tcp_conn.tcp_eager_conn_ind;
- tcp->tcp_conn.tcp_eager_conn_ind = NULL;
- /* Move from q0 to q */
- ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
- listener->tcp_conn_req_cnt_q0--;
- listener->tcp_conn_req_cnt_q++;
- tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
- tcp->tcp_eager_prev_q0;
- tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
- tcp->tcp_eager_next_q0;
- tcp->tcp_eager_prev_q0 = NULL;
- tcp->tcp_eager_next_q0 = NULL;
- tcp->tcp_conn_def_q0 = B_FALSE;
-
- /* Make sure the tcp isn't in the list of droppables */
- ASSERT(tcp->tcp_eager_next_drop_q0 == NULL &&
- tcp->tcp_eager_prev_drop_q0 == NULL);
-
- /*
- * Insert at end of the queue because sockfs sends
- * down T_CONN_RES in chronological order. Leaving
- * the older conn indications at front of the queue
- * helps reducing search time.
- */
- tail = listener->tcp_eager_last_q;
- if (tail != NULL) {
- tail->tcp_eager_next_q = tcp;
- } else {
- listener->tcp_eager_next_q = tcp;
- }
- listener->tcp_eager_last_q = tcp;
- tcp->tcp_eager_next_q = NULL;
-
- /* Need to get inside the listener perimeter */
- CONN_INC_REF(listener->tcp_connp);
- SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, mp1,
- tcp_send_pending, listener->tcp_connp,
- SQ_FILL, SQTAG_TCP_SEND_PENDING);
- }
-no_more_eagers:
- tcp_eager_unlink(eager);
- mutex_exit(&listener->tcp_eager_lock);
-
- /*
- * At this point, the eager is detached from the listener
- * but we still have an extra refs on eager (apart from the
- * usual tcp references). The ref was placed in tcp_rput_data
- * before sending the conn_ind in tcp_send_conn_ind.
- * The ref will be dropped in tcp_accept_finish(). As sockfs
- * has already established this tcp with it's own stream,
- * it's OK to set tcp_detached to B_FALSE.
- */
- econnp->conn_tcp->tcp_detached = B_FALSE;
- SQUEUE_ENTER_ONE(econnp->conn_sqp, opt_mp, tcp_accept_finish,
- econnp, SQ_NODRAIN, SQTAG_TCP_ACCEPT_FINISH_Q0);
return;
default:
mp = mi_tpi_err_ack_alloc(mp, TNOTSUPPORT, 0);
@@ -18878,7 +18707,7 @@ tcp_getmyname(tcp_t *tcp, struct sockaddr *sa, uint_t *salenp)
}
static int
-tcp_getpeername(tcp_t *tcp, struct sockaddr *sa, uint_t *salenp)
+i_tcp_getpeername(tcp_t *tcp, struct sockaddr *sa, uint_t *salenp)
{
sin_t *sin = (sin_t *)sa;
sin6_t *sin6 = (sin6_t *)sa;
@@ -18898,6 +18727,7 @@ tcp_getpeername(tcp_t *tcp, struct sockaddr *sa, uint_t *salenp)
sin->sin_port = tcp->tcp_fport;
IN6_V4MAPPED_TO_IPADDR(&tcp->tcp_remote_v6,
sin->sin_addr.s_addr);
+ *salenp = sizeof (sin_t);
break;
case AF_INET6:
@@ -18912,6 +18742,7 @@ tcp_getpeername(tcp_t *tcp, struct sockaddr *sa, uint_t *salenp)
sin6->sin6_flowinfo = tcp->tcp_ip6h->ip6_vcf &
~IPV6_VERS_AND_FLOW_MASK;
}
+ *salenp = sizeof (sin6_t);
break;
}
@@ -18939,7 +18770,7 @@ tcp_wput_cmdblk(queue_t *q, mblk_t *mp)
switch (cmdp->cb_cmd) {
case TI_GETPEERNAME:
- cmdp->cb_error = tcp_getpeername(tcp, data, &cmdp->cb_len);
+ cmdp->cb_error = i_tcp_getpeername(tcp, data, &cmdp->cb_len);
break;
case TI_GETMYNAME:
cmdp->cb_error = tcp_getmyname(tcp, data, &cmdp->cb_len);
@@ -18961,6 +18792,7 @@ tcp_wput(queue_t *q, mblk_t *mp)
t_scalar_t type;
uchar_t *rptr;
struct iocblk *iocp;
+ size_t size;
tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps;
ASSERT(connp->conn_ref >= 2);
@@ -18970,13 +18802,18 @@ tcp_wput(queue_t *q, mblk_t *mp)
tcp = connp->conn_tcp;
ASSERT(tcp != NULL);
+ size = msgdsize(mp);
+
mutex_enter(&tcp->tcp_non_sq_lock);
- tcp->tcp_squeue_bytes += msgdsize(mp);
+ tcp->tcp_squeue_bytes += size;
if (TCP_UNSENT_BYTES(tcp) > tcp->tcp_xmit_hiwater) {
tcp_setqfull(tcp);
}
mutex_exit(&tcp->tcp_non_sq_lock);
+ if (DB_CRED(mp) == NULL && is_system_labeled())
+ msg_setcredpid(mp, CONN_CRED(connp), curproc->p_pid);
+
CONN_INC_REF(connp);
SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, connp,
tcp_squeue_flag, SQTAG_TCP_OUTPUT);
@@ -19108,6 +18945,16 @@ tcp_wput_sock(queue_t *wq, mblk_t *mp)
tcp_wput(wq, mp);
}
+/*
+ * Write-side put routine that discards whatever it is handed: the message
+ * is freed unconditionally, and DEBUG kernels log a one-line notice first.
+ * The queue argument is not used.
+ * NOTE(review): presumably installed as the write put procedure while a
+ * socket is falling back to STREAMS -- confirm against the code that
+ * references this routine.
+ */
+/* ARGSUSED */
+static void
+tcp_wput_fallback(queue_t *wq, mblk_t *mp)
+{
+#ifdef DEBUG
+ cmn_err(CE_CONT, "tcp_wput_fallback: Message during fallback \n");
+#endif
+ freemsg(mp);
+}
+
static boolean_t
tcp_zcopy_check(tcp_t *tcp)
{
@@ -19150,10 +18997,12 @@ tcp_zcopy_check(tcp_t *tcp)
tcp->tcp_snd_zcopy_on = zc_enabled;
if (!TCP_IS_DETACHED(tcp)) {
if (zc_enabled) {
- (void) mi_set_sth_copyopt(tcp->tcp_rq, ZCVMSAFE);
+ (void) proto_set_tx_copyopt(tcp->tcp_rq, connp,
+ ZCVMSAFE);
TCP_STAT(tcps, tcp_zcopy_on);
} else {
- (void) mi_set_sth_copyopt(tcp->tcp_rq, ZCVMUNSAFE);
+ (void) proto_set_tx_copyopt(tcp->tcp_rq, connp,
+ ZCVMUNSAFE);
TCP_STAT(tcps, tcp_zcopy_off);
}
}
@@ -19170,7 +19019,8 @@ tcp_zcopy_disable(tcp_t *tcp, mblk_t *bp)
else if (tcp->tcp_snd_zcopy_on) {
tcp->tcp_snd_zcopy_on = B_FALSE;
if (!TCP_IS_DETACHED(tcp)) {
- (void) mi_set_sth_copyopt(tcp->tcp_rq, ZCVMUNSAFE);
+ (void) proto_set_tx_copyopt(tcp->tcp_rq, tcp->tcp_connp,
+ ZCVMUNSAFE);
TCP_STAT(tcps, tcp_zcopy_disable);
}
}
@@ -19259,9 +19109,16 @@ static void
tcp_zcopy_notify(tcp_t *tcp)
{
struct stdata *stp;
+ conn_t *connp;
if (tcp->tcp_detached)
return;
+ connp = tcp->tcp_connp;
+ if (IPCL_IS_NONSTR(connp)) {
+ (*connp->conn_upcalls->su_zcopy_notify)
+ (connp->conn_upper_handle);
+ return;
+ }
stp = STREAM(tcp->tcp_rq);
mutex_enter(&stp->sd_lock);
stp->sd_flag |= STZCNOTIFY;
@@ -19423,13 +19280,14 @@ tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp)
ASSERT(DB_TYPE(mp) == M_DATA);
- if (DB_CRED(mp) == NULL)
- mblk_setcred(mp, CONN_CRED(connp));
+ if (is_system_labeled() && DB_CRED(mp) == NULL)
+ mblk_setcred(mp, CONN_CRED(tcp->tcp_connp));
ipha = (ipha_t *)mp->b_rptr;
src = ipha->ipha_src;
dst = ipha->ipha_dst;
+ ASSERT(q != NULL);
DTRACE_PROBE2(tcp__trace__send, mblk_t *, mp, tcp_t *, tcp);
/*
@@ -22430,7 +22288,7 @@ tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp)
error = tcp_getmyname(tcp, (void *)mp1->b_rptr, &addrlen);
break;
case TI_GETPEERNAME:
- error = tcp_getpeername(tcp, (void *)mp1->b_rptr, &addrlen);
+ error = i_tcp_getpeername(tcp, (void *)mp1->b_rptr, &addrlen);
break;
}
@@ -22445,6 +22303,35 @@ tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp)
}
}
+/*
+ * Take a tcp_t out of direct-sockfs mode.
+ *
+ * The endpoint is given an acceptor id (the read queue pointer on ILP32
+ * kernels, conn_dev otherwise) and entered in the acceptor hash, a fused
+ * loopback pair is unfused, and finally tcp_issocket and tcp_sodirect are
+ * cleared and the tcp_sock_fallback statistic is bumped.
+ */
+static void
+tcp_disable_direct_sockfs(tcp_t *tcp)
+{
+#ifdef _ILP32
+ tcp->tcp_acceptor_id = (t_uscalar_t)tcp->tcp_rq;
+#else
+ tcp->tcp_acceptor_id = tcp->tcp_connp->conn_dev;
+#endif
+ /*
+ * Insert this socket into the acceptor hash.
+ * We might need it for T_CONN_RES message
+ */
+ tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp);
+
+ if (tcp->tcp_fused) {
+ /*
+ * This is a fused loopback tcp; disable
+ * read-side synchronous streams interface
+ * and drain any queued data. It is okay
+ * to do this for non-synchronous streams
+ * fused tcp as well.
+ */
+ tcp_fuse_disable_pair(tcp, B_FALSE);
+ }
+ /* From here on the endpoint is no longer treated as a direct socket. */
+ tcp->tcp_issocket = B_FALSE;
+ tcp->tcp_sodirect = NULL;
+ TCP_STAT(tcp->tcp_tcps, tcp_sock_fallback);
+}
+
/*
* tcp_wput_ioctl is called by tcp_wput_nondata() to handle all M_IOCTL
* messages.
@@ -22457,7 +22344,6 @@ tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2)
tcp_t *tcp = connp->conn_tcp;
queue_t *q = tcp->tcp_wq;
struct iocblk *iocp;
- tcp_stack_t *tcps = tcp->tcp_tcps;
ASSERT(DB_TYPE(mp) == M_IOCTL);
/*
@@ -22498,31 +22384,7 @@ tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2)
DB_TYPE(mp) = M_IOCNAK;
iocp->ioc_error = EINVAL;
} else {
-#ifdef _ILP32
- tcp->tcp_acceptor_id = (t_uscalar_t)RD(q);
-#else
- tcp->tcp_acceptor_id = tcp->tcp_connp->conn_dev;
-#endif
- /*
- * Insert this socket into the acceptor hash.
- * We might need it for T_CONN_RES message
- */
- tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp);
-
- if (tcp->tcp_fused) {
- /*
- * This is a fused loopback tcp; disable
- * read-side synchronous streams interface
- * and drain any queued data. It is okay
- * to do this for non-synchronous streams
- * fused tcp as well.
- */
- tcp_fuse_disable_pair(tcp, B_FALSE);
- }
- tcp->tcp_issocket = B_FALSE;
- tcp->tcp_sodirect = NULL;
- TCP_STAT(tcps, tcp_sock_fallback);
-
+ tcp_disable_direct_sockfs(tcp);
DB_TYPE(mp) = M_IOCACK;
iocp->ioc_error = 0;
}
@@ -22546,7 +22408,6 @@ tcp_wput_proto(void *arg, mblk_t *mp, void *arg2)
union T_primitives *tprim = (union T_primitives *)mp->b_rptr;
uchar_t *rptr;
t_scalar_t type;
- int len;
cred_t *cr = DB_CREDDEF(mp, tcp->tcp_cred);
/*
@@ -22566,34 +22427,16 @@ tcp_wput_proto(void *arg, mblk_t *mp, void *arg2)
if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) {
type = ((union T_primitives *)rptr)->type;
if (type == T_EXDATA_REQ) {
- uint32_t msize = msgdsize(mp->b_cont);
-
- len = msize - 1;
- if (len < 0) {
- freemsg(mp);
- return;
- }
- /*
- * Try to force urgent data out on the wire.
- * Even if we have unsent data this will
- * at least send the urgent flag.
- * XXX does not handle more flag correctly.
- */
- len += tcp->tcp_unsent;
- len += tcp->tcp_snxt;
- tcp->tcp_urg = len;
- tcp->tcp_valid_bits |= TCP_URG_VALID;
-
- /* Bypass tcp protocol for fused tcp loopback */
- if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize))
- return;
+ tcp_output_urgent(connp, mp->b_cont, arg2);
+ freeb(mp);
} else if (type != T_DATA_REQ) {
goto non_urgent_data;
+ } else {
+ /* TODO: options, flags, ... from user */
+ /* Set length to zero for reclamation below */
+ tcp_wput_data(tcp, mp->b_cont, B_TRUE);
+ freeb(mp);
}
- /* TODO: options, flags, ... from user */
- /* Set length to zero for reclamation below */
- tcp_wput_data(tcp, mp->b_cont, B_TRUE);
- freeb(mp);
return;
} else {
if (tcp->tcp_debug) {
@@ -22631,17 +22474,17 @@ non_urgent_data:
/* FALLTHROUGH */
case O_T_BIND_REQ: /* bind request */
case T_BIND_REQ: /* new semantics bind request */
- tcp_bind(tcp, mp);
+ tcp_tpi_bind(tcp, mp);
break;
case T_UNBIND_REQ: /* unbind request */
- tcp_unbind(tcp, mp);
+ tcp_tpi_unbind(tcp, mp);
break;
case O_T_CONN_RES: /* old connection response XXX */
case T_CONN_RES: /* connection response */
- tcp_accept(tcp, mp);
+ tcp_tli_accept(tcp, mp);
break;
case T_CONN_REQ: /* connection request */
- tcp_connect(tcp, mp);
+ tcp_tpi_connect(tcp, mp);
break;
case T_DISCON_REQ: /* disconnect request */
tcp_disconnect(tcp, mp);
@@ -23278,6 +23121,7 @@ tcp_xmit_end(tcp_t *tcp)
ipic->ipic_rtt_sd = tcp->tcp_rtt_sd;
CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp);
+
return (0);
}
@@ -23798,14 +23642,15 @@ tcp_push_timer(void *arg)
{
conn_t *connp = (conn_t *)arg;
tcp_t *tcp = connp->conn_tcp;
- tcp_stack_t *tcps = tcp->tcp_tcps;
uint_t flags;
sodirect_t *sodp;
- TCP_DBGSTAT(tcps, tcp_push_timer_cnt);
+ TCP_DBGSTAT(tcp->tcp_tcps, tcp_push_timer_cnt);
ASSERT(tcp->tcp_listener == NULL);
+ ASSERT(!IPCL_IS_NONSTR(connp));
+
/*
* We need to plug synchronous streams during our drain to prevent
* a race with tcp_fuse_rrw() or tcp_fusion_rinfop().
@@ -23818,7 +23663,7 @@ tcp_push_timer(void *arg)
flags = tcp_rcv_sod_wakeup(tcp, sodp);
/* sod_wakeup() does the mutex_exit() */
} else if (tcp->tcp_rcv_list != NULL) {
- flags = tcp_rcv_drain(tcp->tcp_rq, tcp);
+ flags = tcp_rcv_drain(tcp);
}
if (flags == TH_ACK_NEEDED)
tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
@@ -24030,15 +23875,19 @@ tcp_ack_mp(tcp_t *tcp)
}
/*
- * Hash list insertion routine for tcp_t structures.
- * Inserts entries with the ones bound to a specific IP address first
- * followed by those bound to INADDR_ANY.
+ * Hash list insertion routine for tcp_t structures. Each hash bucket
+ * contains a list of tcp_t entries, and each entry is bound to a unique
+ * port. If there are multiple tcp_t's that are bound to the same port, then
+ * one of them will be linked into the hash bucket list, and the rest will
+ * hang off of that one entry. For each port, entries bound to a specific IP
+ * address will be inserted before those bound to INADDR_ANY.
*/
static void
tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock)
{
tcp_t **tcpp;
tcp_t *tcpnext;
+ tcp_t *tcphash;
if (tcp->tcp_ptpbhn != NULL) {
ASSERT(!caller_holds_lock);
@@ -24050,9 +23899,22 @@ tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock)
} else {
ASSERT(MUTEX_HELD(&tbf->tf_lock));
}
- tcpnext = tcpp[0];
- if (tcpnext) {
+ tcphash = tcpp[0];
+ tcpnext = NULL;
+ if (tcphash != NULL) {
+ /* Look for an entry using the same port */
+ while ((tcphash = tcpp[0]) != NULL &&
+ tcp->tcp_lport != tcphash->tcp_lport)
+ tcpp = &(tcphash->tcp_bind_hash);
+
+ /* The port was not found, just add to the end */
+ if (tcphash == NULL)
+ goto insert;
+
/*
+ * OK, there already exists an entry bound to the
+ * same port.
+ *
* If the new tcp bound to the INADDR_ANY address
* and the first one in the list is not bound to
* INADDR_ANY we skip all entries until we find the
@@ -24061,17 +23923,36 @@ tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock)
* specific address get preference over those binding to
* INADDR_ANY.
*/
+ tcpnext = tcphash;
+ tcphash = NULL;
if (V6_OR_V4_INADDR_ANY(tcp->tcp_bound_source_v6) &&
!V6_OR_V4_INADDR_ANY(tcpnext->tcp_bound_source_v6)) {
while ((tcpnext = tcpp[0]) != NULL &&
!V6_OR_V4_INADDR_ANY(tcpnext->tcp_bound_source_v6))
- tcpp = &(tcpnext->tcp_bind_hash);
- if (tcpnext)
- tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash;
- } else
- tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash;
+ tcpp = &(tcpnext->tcp_bind_hash_port);
+
+ if (tcpnext) {
+ tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
+ tcphash = tcpnext->tcp_bind_hash;
+ if (tcphash != NULL) {
+ tcphash->tcp_ptpbhn =
+ &(tcp->tcp_bind_hash);
+ tcpnext->tcp_bind_hash = NULL;
+ }
+ }
+ } else {
+ tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
+ tcphash = tcpnext->tcp_bind_hash;
+ if (tcphash != NULL) {
+ tcphash->tcp_ptpbhn =
+ &(tcp->tcp_bind_hash);
+ tcpnext->tcp_bind_hash = NULL;
+ }
+ }
}
- tcp->tcp_bind_hash = tcpnext;
+insert:
+ tcp->tcp_bind_hash_port = tcpnext;
+ tcp->tcp_bind_hash = tcphash;
tcp->tcp_ptpbhn = tcpp;
tcpp[0] = tcp;
if (!caller_holds_lock)
@@ -24101,8 +23982,17 @@ tcp_bind_hash_remove(tcp_t *tcp)
ASSERT(lockp != NULL);
mutex_enter(lockp);
if (tcp->tcp_ptpbhn) {
- tcpnext = tcp->tcp_bind_hash;
- if (tcpnext) {
+ tcpnext = tcp->tcp_bind_hash_port;
+ if (tcpnext != NULL) {
+ tcp->tcp_bind_hash_port = NULL;
+ tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
+ tcpnext->tcp_bind_hash = tcp->tcp_bind_hash;
+ if (tcpnext->tcp_bind_hash != NULL) {
+ tcpnext->tcp_bind_hash->tcp_ptpbhn =
+ &(tcpnext->tcp_bind_hash);
+ tcp->tcp_bind_hash = NULL;
+ }
+ } else if ((tcpnext = tcp->tcp_bind_hash) != NULL) {
tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
tcp->tcp_bind_hash = NULL;
}
@@ -24507,36 +24397,6 @@ tcp_random(void)
return (i);
}
-/*
- * XXX This will go away when TPI is extended to send
- * info reqs to sockfs/timod .....
- * Given a queue, set the max packet size for the write
- * side of the queue below stream head. This value is
- * cached on the stream head.
- * Returns 1 on success, 0 otherwise.
- */
-static int
-setmaxps(queue_t *q, int maxpsz)
-{
- struct stdata *stp;
- queue_t *wq;
- stp = STREAM(q);
-
- /*
- * At this point change of a queue parameter is not allowed
- * when a multiplexor is sitting on top.
- */
- if (stp->sd_flag & STPLEX)
- return (0);
-
- claimstr(stp->sd_wrq);
- wq = stp->sd_wrq->q_next;
- ASSERT(wq != NULL);
- (void) strqset(wq, QMAXPSZ, 0, maxpsz);
- releasestr(stp->sd_wrq);
- return (1);
-}
-
static int
tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, int *do_disconnectp,
int *t_errorp, int *sys_errorp)
@@ -24964,6 +24824,8 @@ tcp_ddi_g_init(void)
}
+#define INET_NAME "ip"
+
/*
* Initialize the TCP stack instance.
*/
@@ -24973,6 +24835,8 @@ tcp_stack_init(netstackid_t stackid, netstack_t *ns)
tcp_stack_t *tcps;
tcpparam_t *pa;
int i;
+ int error = 0;
+ major_t major;
tcps = (tcp_stack_t *)kmem_zalloc(sizeof (*tcps), KM_SLEEP);
tcps->tcps_netstack = ns;
@@ -25038,6 +24902,9 @@ tcp_stack_init(netstackid_t stackid, netstack_t *ns)
tcps->tcps_kstat = tcp_kstat2_init(stackid, &tcps->tcps_statistics);
tcps->tcps_mibkp = tcp_kstat_init(stackid, tcps);
+ major = mod_name_to_major(INET_NAME);
+ error = ldi_ident_from_major(major, &tcps->tcps_ldi_ident);
+ ASSERT(error == 0);
return (tcps);
}
@@ -25125,6 +24992,7 @@ tcp_stack_fini(netstackid_t stackid, void *arg)
tcp_kstat_fini(stackid, tcps->tcps_mibkp);
tcps->tcps_mibkp = NULL;
+ ldi_ident_release(tcps->tcps_ldi_ident);
kmem_free(tcps, sizeof (*tcps));
}
@@ -25922,44 +25790,6 @@ done:
}
/*
- * Allocate a T_SVR4_OPTMGMT_REQ.
- * The caller needs to increment tcp_drop_opt_ack_cnt when sending these so
- * that tcp_rput_other can drop the acks.
- */
-static mblk_t *
-tcp_setsockopt_mp(int level, int cmd, char *opt, int optlen)
-{
- mblk_t *mp;
- struct T_optmgmt_req *tor;
- struct opthdr *oh;
- uint_t size;
- char *optptr;
-
- size = sizeof (*tor) + sizeof (*oh) + optlen;
- mp = allocb(size, BPRI_MED);
- if (mp == NULL)
- return (NULL);
-
- mp->b_wptr += size;
- mp->b_datap->db_type = M_PROTO;
- tor = (struct T_optmgmt_req *)mp->b_rptr;
- tor->PRIM_type = T_SVR4_OPTMGMT_REQ;
- tor->MGMT_flags = T_NEGOTIATE;
- tor->OPT_length = sizeof (*oh) + optlen;
- tor->OPT_offset = (t_scalar_t)sizeof (*tor);
-
- oh = (struct opthdr *)&tor[1];
- oh->level = level;
- oh->name = cmd;
- oh->len = optlen;
- if (optlen != 0) {
- optptr = (char *)&oh[1];
- bcopy(opt, optptr, optlen);
- }
- return (mp);
-}
-
-/*
* TCP Timers Implementation.
*/
timeout_id_t
@@ -25968,16 +25798,15 @@ tcp_timeout(conn_t *connp, void (*f)(void *), clock_t tim)
mblk_t *mp;
tcp_timer_t *tcpt;
tcp_t *tcp = connp->conn_tcp;
- tcp_stack_t *tcps = tcp->tcp_tcps;
ASSERT(connp->conn_sqp != NULL);
- TCP_DBGSTAT(tcps, tcp_timeout_calls);
+ TCP_DBGSTAT(tcp->tcp_tcps, tcp_timeout_calls);
if (tcp->tcp_timercache == NULL) {
mp = tcp_timermp_alloc(KM_NOSLEEP | KM_PANIC);
} else {
- TCP_DBGSTAT(tcps, tcp_timeout_cached_alloc);
+ TCP_DBGSTAT(tcp->tcp_tcps, tcp_timeout_cached_alloc);
mp = tcp->tcp_timercache;
tcp->tcp_timercache = mp->b_next;
mp->b_next = NULL;
@@ -26052,9 +25881,8 @@ tcp_timeout_cancel(conn_t *connp, timeout_id_t id)
mblk_t *mp = (mblk_t *)id;
tcp_timer_t *tcpt;
clock_t delta;
- tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps;
- TCP_DBGSTAT(tcps, tcp_timeout_cancel_reqs);
+ TCP_DBGSTAT(connp->conn_tcp->tcp_tcps, tcp_timeout_cancel_reqs);
if (mp == NULL)
return (-1);
@@ -26065,7 +25893,7 @@ tcp_timeout_cancel(conn_t *connp, timeout_id_t id)
delta = untimeout_default(tcpt->tcpt_tid, 0);
if (delta >= 0) {
- TCP_DBGSTAT(tcps, tcp_timeout_canceled);
+ TCP_DBGSTAT(connp->conn_tcp->tcp_tcps, tcp_timeout_canceled);
tcp_timer_free(connp->conn_tcp, mp);
CONN_DEC_REF(connp);
}
@@ -26156,7 +25984,6 @@ static void
tcp_timer_free(tcp_t *tcp, mblk_t *mp)
{
mblk_t *mp1 = tcp->tcp_timercache;
- tcp_stack_t *tcps = tcp->tcp_tcps;
if (mp->b_wptr != NULL) {
/*
@@ -26174,7 +26001,7 @@ tcp_timer_free(tcp_t *tcp, mblk_t *mp)
tcp->tcp_timercache = mp;
} else {
kmem_cache_free(tcp_timercache, mp);
- TCP_DBGSTAT(tcps, tcp_timermp_freed);
+ TCP_DBGSTAT(tcp->tcp_tcps, tcp_timermp_freed);
}
}
@@ -26188,23 +26015,33 @@ tcp_timer_free(tcp_t *tcp, mblk_t *mp)
* decision to call based on the tcp_t.tcp_flow_stopped value which
* when check outside the q's lock is only an advisory check ...
*/
-
void
tcp_setqfull(tcp_t *tcp)
{
- queue_t *q = tcp->tcp_wq;
tcp_stack_t *tcps = tcp->tcp_tcps;
+ conn_t *connp = tcp->tcp_connp;
+
+ /* A closed endpoint is left untouched. */
+ if (tcp->tcp_closed)
+ return;
+
+ /*
+ * Non-STREAMS socket: report the full transmit queue through the
+ * su_txq_full upcall instead of touching a STREAMS queue.
+ * NOTE(review): unlike the STREAMS path below, this path does not
+ * bump the tcp_flwctl_on statistic -- confirm that is intended.
+ */
+ if (IPCL_IS_NONSTR(connp)) {
+ (*connp->conn_upcalls->su_txq_full)
+ (tcp->tcp_connp->conn_upper_handle, B_TRUE);
+ tcp->tcp_flow_stopped = B_TRUE;
+ } else {
+ queue_t *q = tcp->tcp_wq;
- if (!(q->q_flag & QFULL)) {
- mutex_enter(QLOCK(q));
+ /*
+ * The first QFULL test is made without QLOCK, so it is
+ * repeated once the lock is held.
+ */
if (!(q->q_flag & QFULL)) {
- /* still need to set QFULL */
- q->q_flag |= QFULL;
- tcp->tcp_flow_stopped = B_TRUE;
- mutex_exit(QLOCK(q));
- TCP_STAT(tcps, tcp_flwctl_on);
- } else {
- mutex_exit(QLOCK(q));
+ mutex_enter(QLOCK(q));
+ if (!(q->q_flag & QFULL)) {
+ /* still need to set QFULL */
+ q->q_flag |= QFULL;
+ tcp->tcp_flow_stopped = B_TRUE;
+ mutex_exit(QLOCK(q));
+ TCP_STAT(tcps, tcp_flwctl_on);
+ } else {
+ mutex_exit(QLOCK(q));
+ }
}
}
}
@@ -26212,23 +26049,33 @@ tcp_setqfull(tcp_t *tcp)
+/*
+ * Counterpart of tcp_setqfull(): mark the transmit side as no longer
+ * flow-controlled. Nothing is done once tcp_closed is set.
+ */
void
tcp_clrqfull(tcp_t *tcp)
{
- queue_t *q = tcp->tcp_wq;
+ conn_t *connp = tcp->tcp_connp;
+
+ if (tcp->tcp_closed)
+ return;
+
+ /*
+ * Non-STREAMS socket: tell the upper layer through su_txq_full that
+ * the transmit queue is no longer full.
+ */
+ if (IPCL_IS_NONSTR(connp)) {
+ (*connp->conn_upcalls->su_txq_full)
+ (tcp->tcp_connp->conn_upper_handle, B_FALSE);
+ tcp->tcp_flow_stopped = B_FALSE;
+ } else {
+ queue_t *q = tcp->tcp_wq;
- if (q->q_flag & QFULL) {
- mutex_enter(QLOCK(q));
+ /*
+ * The first QFULL test is made without QLOCK, so it is
+ * repeated once the lock is held.
+ */
if (q->q_flag & QFULL) {
- q->q_flag &= ~QFULL;
- tcp->tcp_flow_stopped = B_FALSE;
- mutex_exit(QLOCK(q));
- if (q->q_flag & QWANTW)
- qbackenable(q, 0);
- } else {
- mutex_exit(QLOCK(q));
+ mutex_enter(QLOCK(q));
+ if (q->q_flag & QFULL) {
+ q->q_flag &= ~QFULL;
+ tcp->tcp_flow_stopped = B_FALSE;
+ mutex_exit(QLOCK(q));
+ /* Back-enable only if a writer is waiting. */
+ if (q->q_flag & QWANTW)
+ qbackenable(q, 0);
+ } else {
+ mutex_exit(QLOCK(q));
+ }
}
}
}
-
/*
* kstats related to squeues i.e. not per IP instance
*/
@@ -26681,3 +26528,1626 @@ tcp_squeue_add(squeue_t *sqp)
}
tcp_time_wait->tcp_free_list_cnt = 0;
}
+
+/*
+ * Finish a bind or connect after the bind request has been processed.
+ *
+ * error is the result handed in by the caller. mp may be NULL; when it is
+ * not, it is searched for Multidata, LSO and IRE mblks.
+ *
+ * With error == 0 the capability and IRE information found in mp is
+ * applied. If a hard bind was pending (tcp_hard_binding), the connection
+ * parameters (receive window, window scale, timestamp, SACK, ECN) are set
+ * up and the SYN is sent; otherwise only the remaining mblk is freed.
+ *
+ * With error != 0, or when one of the checks made after a successful bind
+ * fails, the bind is undone: the conn is removed from the classifier and
+ * from the bind hash, the local address and port are cleared and the
+ * state goes back to TCPS_IDLE.
+ *
+ * Returns 0 on success, otherwise the error passed in or one of
+ * ENETUNREACH, EADDRNOTAVAIL or EHOSTUNREACH.
+ */
+static int
+tcp_post_ip_bind(tcp_t *tcp, mblk_t *mp, int error)
+{
+ mblk_t *ire_mp = NULL;
+ mblk_t *syn_mp;
+ mblk_t *mdti;
+ mblk_t *lsoi;
+ int retval;
+ tcph_t *tcph;
+ uint32_t mss;
+ queue_t *q = tcp->tcp_rq;
+ conn_t *connp = tcp->tcp_connp;
+ tcp_stack_t *tcps = tcp->tcp_tcps;
+
+ if (error == 0) {
+ /*
+ * Adapt Multidata information, if any. The
+ * following tcp_mdt_update routine will free
+ * the message.
+ */
+ if (mp != NULL && ((mdti = tcp_mdt_info_mp(mp)) != NULL)) {
+ tcp_mdt_update(tcp, &((ip_mdt_info_t *)mdti->
+ b_rptr)->mdt_capab, B_TRUE);
+ freemsg(mdti);
+ }
+
+ /*
+ * Check to update LSO information with tcp, and
+ * tcp_lso_update routine will free the message.
+ */
+ if (mp != NULL && ((lsoi = tcp_lso_info_mp(mp)) != NULL)) {
+ tcp_lso_update(tcp, &((ip_lso_info_t *)lsoi->
+ b_rptr)->lso_capab);
+ freemsg(lsoi);
+ }
+
+ /* Get the IRE, if we had requested for it */
+ if (mp != NULL)
+ ire_mp = tcp_ire_mp(&mp);
+
+ if (tcp->tcp_hard_binding) {
+ tcp->tcp_hard_binding = B_FALSE;
+ tcp->tcp_hard_bound = B_TRUE;
+ CL_INET_CONNECT(tcp);
+ } else {
+ if (ire_mp != NULL)
+ freeb(ire_mp);
+ goto after_syn_sent;
+ }
+
+ retval = tcp_adapt_ire(tcp, ire_mp);
+ if (ire_mp != NULL)
+ freeb(ire_mp);
+ if (retval == 0) {
+ error = (int)((tcp->tcp_state >= TCPS_SYN_SENT) ?
+ ENETUNREACH : EADDRNOTAVAIL);
+ goto ipcl_rm;
+ }
+ /*
+ * Don't let an endpoint connect to itself.
+ * Also checked in tcp_connect() but that
+ * check can't handle the case when the
+ * local IP address is INADDR_ANY.
+ */
+ if (tcp->tcp_ipversion == IPV4_VERSION) {
+ if ((tcp->tcp_ipha->ipha_dst ==
+ tcp->tcp_ipha->ipha_src) &&
+ (BE16_EQL(tcp->tcp_tcph->th_lport,
+ tcp->tcp_tcph->th_fport))) {
+ error = EADDRNOTAVAIL;
+ goto ipcl_rm;
+ }
+ } else {
+ if (IN6_ARE_ADDR_EQUAL(
+ &tcp->tcp_ip6h->ip6_dst,
+ &tcp->tcp_ip6h->ip6_src) &&
+ (BE16_EQL(tcp->tcp_tcph->th_lport,
+ tcp->tcp_tcph->th_fport))) {
+ error = EADDRNOTAVAIL;
+ goto ipcl_rm;
+ }
+ }
+ ASSERT(tcp->tcp_state == TCPS_SYN_SENT);
+ /*
+ * This should not be possible! Just for
+ * defensive coding...
+ */
+ if (tcp->tcp_state != TCPS_SYN_SENT)
+ goto after_syn_sent;
+
+ if (is_system_labeled() &&
+ !tcp_update_label(tcp, CONN_CRED(tcp->tcp_connp))) {
+ error = EHOSTUNREACH;
+ goto ipcl_rm;
+ }
+
+ /*
+ * tcp_adapt_ire() does not adjust
+ * for TCP/IP header length.
+ */
+ mss = tcp->tcp_mss - tcp->tcp_hdr_len;
+
+ /*
+ * Just make sure our rwnd is at
+ * least tcp_recv_hiwat_mss * MSS
+ * large, and round up to the nearest
+ * MSS.
+ *
+ * We do the round up here because
+ * we need to get the interface
+ * MTU first before we can do the
+ * round up.
+ */
+ tcp->tcp_rwnd = MAX(MSS_ROUNDUP(tcp->tcp_rwnd, mss),
+ tcps->tcps_recv_hiwat_minmss * mss);
+ if (!IPCL_IS_NONSTR(connp))
+ q->q_hiwat = tcp->tcp_rwnd;
+ tcp->tcp_recv_hiwater = tcp->tcp_rwnd;
+ tcp_set_ws_value(tcp);
+ U32_TO_ABE16((tcp->tcp_rwnd >> tcp->tcp_rcv_ws),
+ tcp->tcp_tcph->th_win);
+ if (tcp->tcp_rcv_ws > 0 || tcps->tcps_wscale_always)
+ tcp->tcp_snd_ws_ok = B_TRUE;
+
+ /*
+ * Set tcp_snd_ts_ok to true
+ * so that tcp_xmit_mp will
+ * include the timestamp
+ * option in the SYN segment.
+ */
+ if (tcps->tcps_tstamp_always ||
+ (tcp->tcp_rcv_ws && tcps->tcps_tstamp_if_wscale)) {
+ tcp->tcp_snd_ts_ok = B_TRUE;
+ }
+
+ /*
+ * tcp_snd_sack_ok can be set in
+ * tcp_adapt_ire() if the sack metric
+ * is set. So check it here also.
+ */
+ if (tcps->tcps_sack_permitted == 2 ||
+ tcp->tcp_snd_sack_ok) {
+ if (tcp->tcp_sack_info == NULL) {
+ tcp->tcp_sack_info =
+ kmem_cache_alloc(tcp_sack_info_cache,
+ KM_SLEEP);
+ }
+ tcp->tcp_snd_sack_ok = B_TRUE;
+ }
+
+ /*
+ * Should we use ECN? Note that the current
+ * default value (SunOS 5.9) of tcp_ecn_permitted
+ * is 1. The reason for doing this is that there
+ * are equipments out there that will drop ECN
+ * enabled IP packets. Setting it to 1 avoids
+ * compatibility problems.
+ */
+ if (tcps->tcps_ecn_permitted == 2)
+ tcp->tcp_ecn_ok = B_TRUE;
+
+ TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
+ syn_mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL,
+ tcp->tcp_iss, B_FALSE, NULL, B_FALSE);
+ if (syn_mp) {
+ cred_t *cr;
+ pid_t pid;
+
+ /*
+ * Obtain the credential from the
+ * thread calling connect().
+ * If none can be found, default to
+ * the creator of the socket.
+ */
+ if (mp == NULL ||
+ (cr = DB_CRED(mp)) == NULL) {
+ cr = tcp->tcp_cred;
+ pid = tcp->tcp_cpid;
+ } else {
+ pid = DB_CPID(mp);
+ }
+
+ mblk_setcred(syn_mp, cr);
+ DB_CPID(syn_mp) = pid;
+ tcp_send_data(tcp, tcp->tcp_wq, syn_mp);
+ }
+ after_syn_sent:
+ /*
+ * Whatever is left of mp is a single mblk by now (see the
+ * ASSERT). It is freed here; the outcome is reported to the
+ * caller through the return value rather than by sending a
+ * reply message upstream.
+ */
+ if (mp != NULL) {
+ ASSERT(mp->b_cont == NULL);
+ freeb(mp);
+ }
+ return (error);
+ } else {
+ /* error */
+ if (tcp->tcp_debug) {
+ (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
+ "tcp_post_ip_bind: error == %d", error);
+ }
+ if (mp != NULL) {
+ freeb(mp);
+ }
+ }
+
+ipcl_rm:
+ /*
+ * Failure cleanup. This point is reached both when the caller
+ * handed in a non-zero error (fall-through from the branch above)
+ * and when the bind itself succeeded but one of the checks above
+ * failed. Either way, undo the bind with the classifier.
+ */
+ tcp->tcp_hard_bound = B_FALSE;
+ tcp->tcp_hard_binding = B_FALSE;
+
+ ipcl_hash_remove(connp);
+
+ tcp->tcp_state = TCPS_IDLE;
+ if (tcp->tcp_ipversion == IPV4_VERSION)
+ tcp->tcp_ipha->ipha_src = 0;
+ else
+ V6_SET_ZERO(tcp->tcp_ip6h->ip6_src);
+ /*
+ * Copy of the src addr. in tcp_t is needed since
+ * the lookup funcs. can only look at tcp_t
+ */
+ V6_SET_ZERO(tcp->tcp_ip_src_v6);
+
+ tcph = tcp->tcp_tcph;
+ tcph->th_lport[0] = 0;
+ tcph->th_lport[1] = 0;
+ tcp_bind_hash_remove(tcp);
+ bzero(&connp->u_port, sizeof (connp->u_port));
+ /* blow away saved option results if any */
+ if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
+ tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
+
+ conn_delete_ire(tcp->tcp_connp, NULL);
+
+ return (error);
+}
+
+/*
+ * Choose and reserve the local port for a bind.
+ *
+ * On entry *requested_port_ptr is the port asked for; 0 requests an
+ * anonymous port (taken from the privileged range when tcp_anon_priv_bind
+ * is set). The visible caller passes the port in host byte order. A NULL
+ * cr is replaced by the credential stored in the tcp_t.
+ *
+ * A non-zero port below tcps_smallest_nonpriv_port, or one listed in
+ * tcps_g_epriv_ports, requires secpolicy_net_privaddr() to succeed. On a
+ * labeled system the multilevel-port (MLP) type of the address/port is
+ * looked up and, if it is not mlptSingle, secpolicy_net_bindmlp() must
+ * succeed as well.
+ *
+ * Returns 0 with *requested_port_ptr set to the port returned by
+ * tcp_bindi(). On failure returns a negative TLI error (-TNOADDR,
+ * -TACCES or -TADDRBUSY) or the value returned by tsol_mlp_anon().
+ */
+static int
+tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr,
+ boolean_t bind_to_req_port_only, cred_t *cr)
+{
+ in_port_t mlp_port;
+ mlp_type_t addrtype, mlptype;
+ boolean_t user_specified;
+ in_port_t allocated_port;
+ in_port_t requested_port = *requested_port_ptr;
+ conn_t *connp;
+ zone_t *zone;
+ tcp_stack_t *tcps = tcp->tcp_tcps;
+ in6_addr_t v6addr = tcp->tcp_ip_src_v6;
+
+ /*
+ * XXX It's up to the caller to specify bind_to_req_port_only or not.
+ */
+ if (cr == NULL)
+ cr = tcp->tcp_cred;
+ /*
+ * Get a valid port (within the anonymous range and should not
+ * be a privileged one) to use if the user has not given a port.
+ * If multiple threads are here, they may all start with
+ * the same initial port. But, it should be fine as long as
+ * tcp_bindi will ensure that no two threads will be assigned
+ * the same port.
+ *
+ * NOTE: XXX If a privileged process asks for an anonymous port, we
+ * still check for ports only in the range > tcp_smallest_non_priv_port,
+ * unless TCP_ANONPRIVBIND option is set.
+ */
+ mlptype = mlptSingle;
+ mlp_port = requested_port;
+ if (requested_port == 0) {
+ requested_port = tcp->tcp_anon_priv_bind ?
+ tcp_get_next_priv_port(tcp) :
+ tcp_update_next_port(tcps->tcps_next_port_to_try,
+ tcp, B_TRUE);
+ if (requested_port == 0) {
+ return (-TNOADDR);
+ }
+ user_specified = B_FALSE;
+
+ /*
+ * If the user went through one of the RPC interfaces to create
+ * this socket and RPC is MLP in this zone, then give him an
+ * anonymous MLP.
+ */
+ connp = tcp->tcp_connp;
+ if (connp->conn_anon_mlp && is_system_labeled()) {
+ zone = crgetzone(cr);
+ addrtype = tsol_mlp_addr_type(zone->zone_id,
+ IPV6_VERSION, &v6addr,
+ tcps->tcps_netstack->netstack_ip);
+ if (addrtype == mlptSingle) {
+ return (-TNOADDR);
+ }
+ mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
+ PMAPPORT, addrtype);
+ mlp_port = PMAPPORT;
+ }
+ } else {
+ int i;
+ boolean_t priv = B_FALSE;
+
+ /*
+ * If the requested_port is in the well-known privileged range,
+ * verify that the stream was opened by a privileged user.
+ * Note: No locks are held when inspecting tcp_g_*epriv_ports
+ * but instead the code relies on:
+ * - the fact that the address of the array and its size never
+ * changes
+ * - the atomic assignment of the elements of the array
+ */
+ if (requested_port < tcps->tcps_smallest_nonpriv_port) {
+ priv = B_TRUE;
+ } else {
+ for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
+ if (requested_port ==
+ tcps->tcps_g_epriv_ports[i]) {
+ priv = B_TRUE;
+ break;
+ }
+ }
+ }
+ if (priv) {
+ if (secpolicy_net_privaddr(cr, requested_port,
+ IPPROTO_TCP) != 0) {
+ if (tcp->tcp_debug) {
+ (void) strlog(TCP_MOD_ID, 0, 1,
+ SL_ERROR|SL_TRACE,
+ "tcp_bind: no priv for port %d",
+ requested_port);
+ }
+ return (-TACCES);
+ }
+ }
+ user_specified = B_TRUE;
+
+ connp = tcp->tcp_connp;
+ if (is_system_labeled()) {
+ zone = crgetzone(cr);
+ addrtype = tsol_mlp_addr_type(zone->zone_id,
+ IPV6_VERSION, &v6addr,
+ tcps->tcps_netstack->netstack_ip);
+ if (addrtype == mlptSingle) {
+ return (-TNOADDR);
+ }
+ mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
+ requested_port, addrtype);
+ }
+ }
+
+ /*
+ * mlptype is changed from mlptSingle only in the labeled-system
+ * branches above, which also set zone and addrtype.
+ */
+ if (mlptype != mlptSingle) {
+ if (secpolicy_net_bindmlp(cr) != 0) {
+ if (tcp->tcp_debug) {
+ (void) strlog(TCP_MOD_ID, 0, 1,
+ SL_ERROR|SL_TRACE,
+ "tcp_bind: no priv for multilevel port %d",
+ requested_port);
+ }
+ return (-TACCES);
+ }
+
+ /*
+ * If we're specifically binding a shared IP address and the
+ * port is MLP on shared addresses, then check to see if this
+ * zone actually owns the MLP. Reject if not.
+ */
+ if (mlptype == mlptShared && addrtype == mlptShared) {
+ /*
+ * No need to handle exclusive-stack zones since
+ * ALL_ZONES only applies to the shared stack.
+ */
+ zoneid_t mlpzone;
+
+ mlpzone = tsol_mlp_findzone(IPPROTO_TCP,
+ htons(mlp_port));
+ if (connp->conn_zoneid != mlpzone) {
+ if (tcp->tcp_debug) {
+ (void) strlog(TCP_MOD_ID, 0, 1,
+ SL_ERROR|SL_TRACE,
+ "tcp_bind: attempt to bind port "
+ "%d on shared addr in zone %d "
+ "(should be %d)",
+ mlp_port, connp->conn_zoneid,
+ mlpzone);
+ }
+ return (-TACCES);
+ }
+ }
+
+ if (!user_specified) {
+ int err;
+ err = tsol_mlp_anon(zone, mlptype, connp->conn_ulp,
+ requested_port, B_TRUE);
+ if (err != 0) {
+ if (tcp->tcp_debug) {
+ (void) strlog(TCP_MOD_ID, 0, 1,
+ SL_ERROR|SL_TRACE,
+ "tcp_bind: cannot establish anon "
+ "MLP for port %d",
+ requested_port);
+ }
+ return (err);
+ }
+ connp->conn_anon_port = B_TRUE;
+ }
+ connp->conn_mlp_type = mlptype;
+ }
+
+ allocated_port = tcp_bindi(tcp, requested_port, &v6addr,
+ tcp->tcp_reuseaddr, B_FALSE, bind_to_req_port_only, user_specified);
+
+ if (allocated_port == 0) {
+ /* Undo the MLP bookkeeping done above before failing. */
+ connp->conn_mlp_type = mlptSingle;
+ if (connp->conn_anon_port) {
+ connp->conn_anon_port = B_FALSE;
+ (void) tsol_mlp_anon(zone, mlptype, connp->conn_ulp,
+ requested_port, B_FALSE);
+ }
+ if (bind_to_req_port_only) {
+ if (tcp->tcp_debug) {
+ (void) strlog(TCP_MOD_ID, 0, 1,
+ SL_ERROR|SL_TRACE,
+ "tcp_bind: requested addr busy");
+ }
+ return (-TADDRBUSY);
+ } else {
+ /* If we are out of ports, fail the bind. */
+ if (tcp->tcp_debug) {
+ (void) strlog(TCP_MOD_ID, 0, 1,
+ SL_ERROR|SL_TRACE,
+ "tcp_bind: out of ports?");
+ }
+ return (-TNOADDR);
+ }
+ }
+
+ /* Pass the allocated port back */
+ *requested_port_ptr = allocated_port;
+ return (0);
+}
+
+/*
+ * Validate the address handed to a bind request, record it in the tcp_t
+ * and let tcp_bind_select_lport() pick the port.
+ *
+ * len selects how sa is interpreted:
+ * 0 wildcard address of the endpoint's family, port 0
+ * sizeof (sin_t) IPv4 address; endpoint family must be AF_INET
+ * sizeof (sin6_t) IPv6 address; endpoint family must be AF_INET6.
+ * A v4-mapped address switches tcp_ipversion to IPv4.
+ *
+ * Returns 0 if the endpoint is already in TCPS_BOUND or the bind succeeds.
+ * Otherwise returns -TOUTSTATE (state beyond TCPS_BOUND), -TPROTO
+ * (misaligned sa), EAFNOSUPPORT (family mismatch or unexpected len),
+ * ENOMEM (header re-initialization failed) or the error returned by
+ * tcp_bind_select_lport().
+ *
+ * NOTE(review): sa is dereferenced for the sin_t/sin6_t lengths without a
+ * NULL check here; presumably callers guarantee sa != NULL whenever
+ * len != 0 -- confirm.
+ */
+static int
+tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
+ boolean_t bind_to_req_port_only)
+{
+ tcp_t *tcp = connp->conn_tcp;
+
+ sin_t *sin;
+ sin6_t *sin6;
+ sin6_t sin6addr;
+ in_port_t requested_port;
+ ipaddr_t v4addr;
+ in6_addr_t v6addr;
+ uint_t origipversion;
+ int error = 0;
+
+ ASSERT((uintptr_t)len <= (uintptr_t)INT_MAX);
+
+ if (tcp->tcp_state == TCPS_BOUND) {
+ return (0);
+ } else if (tcp->tcp_state > TCPS_BOUND) {
+ if (tcp->tcp_debug) {
+ (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
+ "tcp_bind: bad state, %d", tcp->tcp_state);
+ }
+ return (-TOUTSTATE);
+ }
+ /* Remembered so a change of IP version can be detected below. */
+ origipversion = tcp->tcp_ipversion;
+
+ if (sa != NULL && !OK_32PTR((char *)sa)) {
+ if (tcp->tcp_debug) {
+ (void) strlog(TCP_MOD_ID, 0, 1,
+ SL_ERROR|SL_TRACE,
+ "tcp_bind: bad address parameter, "
+ "address %p, len %d",
+ (void *)sa, len);
+ }
+ return (-TPROTO);
+ }
+
+ switch (len) {
+ case 0: /* request for a generic port */
+ if (tcp->tcp_family == AF_INET) {
+ sin = (sin_t *)&sin6addr;
+ *sin = sin_null;
+ sin->sin_family = AF_INET;
+ tcp->tcp_ipversion = IPV4_VERSION;
+ IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &v6addr);
+ } else {
+ ASSERT(tcp->tcp_family == AF_INET6);
+ sin6 = (sin6_t *)&sin6addr;
+ *sin6 = sin6_null;
+ sin6->sin6_family = AF_INET6;
+ tcp->tcp_ipversion = IPV6_VERSION;
+ V6_SET_ZERO(v6addr);
+ }
+ requested_port = 0;
+ break;
+
+ case sizeof (sin_t): /* Complete IPv4 address */
+ sin = (sin_t *)sa;
+ /*
+ * With sockets sockfs will accept bogus sin_family in
+ * bind() and replace it with the family used in the socket
+ * call.
+ */
+ if (sin->sin_family != AF_INET ||
+ tcp->tcp_family != AF_INET) {
+ return (EAFNOSUPPORT);
+ }
+ requested_port = ntohs(sin->sin_port);
+ tcp->tcp_ipversion = IPV4_VERSION;
+ v4addr = sin->sin_addr.s_addr;
+ IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr);
+ break;
+
+ case sizeof (sin6_t): /* Complete IPv6 address */
+ sin6 = (sin6_t *)sa;
+ if (sin6->sin6_family != AF_INET6 ||
+ tcp->tcp_family != AF_INET6) {
+ return (EAFNOSUPPORT);
+ }
+ requested_port = ntohs(sin6->sin6_port);
+ tcp->tcp_ipversion = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) ?
+ IPV4_VERSION : IPV6_VERSION;
+ v6addr = sin6->sin6_addr;
+ break;
+
+ default:
+ if (tcp->tcp_debug) {
+ (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
+ "tcp_bind: bad address length, %d", len);
+ }
+ return (EAFNOSUPPORT);
+ /* return (-TBADADDR); */
+ }
+
+ tcp->tcp_bound_source_v6 = v6addr;
+
+ /* Check for change in ipversion */
+ if (origipversion != tcp->tcp_ipversion) {
+ ASSERT(tcp->tcp_family == AF_INET6);
+ error = tcp->tcp_ipversion == IPV6_VERSION ?
+ tcp_header_init_ipv6(tcp) : tcp_header_init_ipv4(tcp);
+ if (error) {
+ return (ENOMEM);
+ }
+ }
+
+ /*
+ * Initialize family specific fields. Copy of the src addr.
+ * in tcp_t is needed for the lookup funcs.
+ */
+ if (tcp->tcp_ipversion == IPV6_VERSION) {
+ tcp->tcp_ip6h->ip6_src = v6addr;
+ } else {
+ IN6_V4MAPPED_TO_IPADDR(&v6addr, tcp->tcp_ipha->ipha_src);
+ }
+ tcp->tcp_ip_src_v6 = v6addr;
+
+ /* A request for port 0 is never restricted to the requested port. */
+ bind_to_req_port_only = requested_port != 0 && bind_to_req_port_only;
+
+ error = tcp_bind_select_lport(tcp, &requested_port,
+ bind_to_req_port_only, cr);
+
+ return (error);
+}
+
+/*
+ * Bind `connp' to the local address described by `sa'/`len'.
+ *
+ * Return unix error if tli error is TSYSERR, otherwise return a negative
+ * tli error.
+ *
+ * The endpoint must not yet be bound (-TOUTSTATE otherwise). The address
+ * is validated and a port chosen by tcp_bind_check(); the local address is
+ * then handed to IP and the result is post-processed by tcp_post_ip_bind().
+ */
+int
+tcp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
+ boolean_t bind_to_req_port_only)
+{
+ int error;
+ tcp_t *tcp = connp->conn_tcp;
+
+ /* Unlike tcp_bind_check(), an already-bound endpoint is an error here. */
+ if (tcp->tcp_state >= TCPS_BOUND) {
+ if (tcp->tcp_debug) {
+ (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
+ "tcp_bind: bad state, %d", tcp->tcp_state);
+ }
+ return (-TOUTSTATE);
+ }
+
+ error = tcp_bind_check(connp, sa, len, cr, bind_to_req_port_only);
+ if (error != 0)
+ return (error);
+
+ ASSERT(tcp->tcp_state == TCPS_BOUND);
+
+ /* A plain bind does not accept connections; see tcp_do_listen(). */
+ tcp->tcp_conn_req_max = 0;
+
+ /*
+ * We need to make sure that the conn_recv is set to a non-null
+ * value before we insert the conn into the classifier table.
+ * This is to avoid a race with an incoming packet which does an
+ * ipcl_classify().
+ */
+ connp->conn_recv = tcp_conn_request;
+
+ if (tcp->tcp_family == AF_INET6) {
+ ASSERT(tcp->tcp_connp->conn_af_isv6);
+ error = ip_proto_bind_laddr_v6(connp, NULL, IPPROTO_TCP,
+ &tcp->tcp_bound_source_v6, 0, B_FALSE);
+ } else {
+ ASSERT(!tcp->tcp_connp->conn_af_isv6);
+ error = ip_proto_bind_laddr_v4(connp, NULL, IPPROTO_TCP,
+ tcp->tcp_ipha->ipha_src, 0, B_FALSE);
+ }
+ return (tcp_post_ip_bind(tcp, NULL, error));
+}
+
+/*
+ * Socket downcall for bind(). A NULL `sa' is treated as an unbind request,
+ * which is only permitted before the endpoint reaches TCPS_LISTEN.
+ * Always returns a UNIX errno (0 on success); TLI errors produced by the
+ * lower layers are translated before returning.
+ */
+int
+tcp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
+    socklen_t len, cred_t *cr)
+{
+	conn_t *connp = (conn_t *)proto_handle;
+	squeue_t *sqp = connp->conn_sqp;
+	int rc;
+
+	ASSERT(sqp != NULL);
+
+	/* Enter the connection's squeue before touching bind state. */
+	rc = squeue_synch_enter(sqp, connp, 0);
+	if (rc != 0) {
+		/* failed to enter */
+		return (ENOSR);
+	}
+
+	if (sa != NULL) {
+		rc = tcp_do_bind(connp, sa, len, cr, B_TRUE);
+	} else if (connp->conn_tcp->tcp_state < TCPS_LISTEN) {
+		/* binding to a NULL address really means unbind */
+		rc = tcp_do_unbind(connp);
+	} else {
+		rc = EINVAL;
+	}
+
+	squeue_synch_exit(sqp, connp);
+
+	/* Non-negative values are already UNIX errnos (or success). */
+	if (rc >= 0)
+		return (rc);
+
+	/* Negative values are TLI errors; map them to errnos. */
+	return ((rc == -TOUTSTATE) ? EINVAL : proto_tlitosyserr(-rc));
+}
+
+/*
+ * Start an active open to the destination described by `sa'/`len'.
+ *
+ * If the return value from this function is positive, it's a UNIX error.
+ * Otherwise, if it's negative, then the absolute value is a TLI error.
+ * the TPI routine tcp_tpi_connect() is a wrapper function for this.
+ *
+ * NOTE(review): the IPv6 path and the early error returns leave the
+ * function directly, whereas the IPv4 path always falls through the
+ * connect_failed cleanup (even when tcp_connect_ipv4() succeeds) --
+ * confirm that this asymmetry is intended.
+ */
+int
+tcp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len,
+ cred_t *cr, pid_t pid)
+{
+ tcp_t *tcp = connp->conn_tcp;
+ sin_t *sin = (sin_t *)sa;
+ sin6_t *sin6 = (sin6_t *)sa;
+ ipaddr_t *dstaddrp;
+ in_port_t dstport;
+ uint_t srcid;
+ int error = 0;
+
+ /* The address length selects the sockaddr flavour; port 0 is invalid. */
+ switch (len) {
+ default:
+ /*
+ * Should never happen
+ */
+ return (EINVAL);
+
+ case sizeof (sin_t):
+ sin = (sin_t *)sa;
+ if (sin->sin_port == 0) {
+ return (-TBADADDR);
+ }
+ /* An IPv4 destination is refused on a v6-only endpoint. */
+ if (tcp->tcp_connp && tcp->tcp_connp->conn_ipv6_v6only) {
+ return (EAFNOSUPPORT);
+ }
+ break;
+
+ case sizeof (sin6_t):
+ sin6 = (sin6_t *)sa;
+ if (sin6->sin6_port == 0) {
+ return (-TBADADDR);
+ }
+ break;
+ }
+ /*
+ * If we're connecting to an IPv4-mapped IPv6 address, we need to
+ * make sure that the template IP header in the tcp structure is an
+ * IPv4 header, and that the tcp_ipversion is IPV4_VERSION. We
+ * need to do this before we call tcp_bindi() so that the port lookup
+ * code will look for ports in the correct port space (IPv4 and
+ * IPv6 have separate port spaces).
+ */
+ if (tcp->tcp_family == AF_INET6 && tcp->tcp_ipversion == IPV6_VERSION &&
+ IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
+ int err = 0;
+
+ err = tcp_header_init_ipv4(tcp);
+ if (err != 0) {
+ error = ENOMEM;
+ goto connect_failed;
+ }
+ /* Carry an already-assigned local port into the new header. */
+ if (tcp->tcp_lport != 0)
+ *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport;
+ }
+
+ switch (tcp->tcp_state) {
+ case TCPS_LISTEN:
+ /*
+ * Listening sockets are not allowed to issue connect().
+ */
+ if (IPCL_IS_NONSTR(connp))
+ return (EOPNOTSUPP);
+ /* FALLTHRU */
+ case TCPS_IDLE:
+ /*
+ * We support quick connect, refer to comments in
+ * tcp_connect_*()
+ */
+ /* FALLTHRU */
+ case TCPS_BOUND:
+ /*
+ * We must bump the generation before the operation start.
+ * This is done to ensure that any upcall made later on sends
+ * up the right generation to the socket.
+ */
+ SOCK_CONNID_BUMP(tcp->tcp_connid);
+
+ if (tcp->tcp_family == AF_INET6) {
+ if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
+ return (tcp_connect_ipv6(tcp,
+ &sin6->sin6_addr,
+ sin6->sin6_port, sin6->sin6_flowinfo,
+ sin6->__sin6_src_id, sin6->sin6_scope_id,
+ cr, pid));
+ }
+ /*
+ * Destination address is mapped IPv6 address.
+ * Source bound address should be unspecified or
+ * IPv6 mapped address as well.
+ */
+ if (!IN6_IS_ADDR_UNSPECIFIED(
+ &tcp->tcp_bound_source_v6) &&
+ !IN6_IS_ADDR_V4MAPPED(&tcp->tcp_bound_source_v6)) {
+ return (EADDRNOTAVAIL);
+ }
+ /* Extract the embedded IPv4 address for the v4 connect path. */
+ dstaddrp = &V4_PART_OF_V6((sin6->sin6_addr));
+ dstport = sin6->sin6_port;
+ srcid = sin6->__sin6_src_id;
+ } else {
+ dstaddrp = &sin->sin_addr.s_addr;
+ dstport = sin->sin_port;
+ srcid = 0;
+ }
+
+ error = tcp_connect_ipv4(tcp, dstaddrp, dstport, srcid, cr,
+ pid);
+ break;
+ default:
+ return (-TOUTSTATE);
+ }
+ /*
+ * Note: Code below is the "failure" case
+ */
+connect_failed:
+ /* Discard any connection-request options held on the tcp_t. */
+ if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
+ tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
+ return (error);
+}
+
+/*
+ * Socket downcall for connect().
+ *
+ * The address is first validated against the endpoint's family, then the
+ * connect is issued from inside the connection's squeue via
+ * tcp_do_connect(). On success the current connection id is stored in
+ * `*id' and EINPROGRESS is returned, since the handshake completes
+ * asynchronously. Otherwise a UNIX errno is returned; TLI errors from
+ * tcp_do_connect() are translated, with -TOUTSTATE mapped according to
+ * the current TCP state.
+ */
+int
+tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
+    socklen_t len, sock_connid_t *id, cred_t *cr)
+{
+	conn_t *connp = (conn_t *)proto_handle;
+	tcp_t *tcp = connp->conn_tcp;
+	squeue_t *sqp = connp->conn_sqp;
+	int error;
+
+	error = proto_verify_ip_addr(tcp->tcp_family, sa, len);
+	if (error != 0) {
+		return (error);
+	}
+
+	error = squeue_synch_enter(sqp, connp, 0);
+	if (error != 0) {
+		/* failed to enter */
+		return (ENOSR);
+	}
+
+	/*
+	 * TCP supports quick connect, so no need to do an implicit bind
+	 */
+	error = tcp_do_connect(connp, sa, len, cr, curproc->p_pid);
+	if (error == 0) {
+		*id = connp->conn_tcp->tcp_connid;
+	} else if (error < 0) {
+		if (error == -TOUTSTATE) {
+			/* Pick the errno that matches why we are out of state. */
+			switch (connp->conn_tcp->tcp_state) {
+			case TCPS_SYN_SENT:
+				error = EALREADY;
+				break;
+			case TCPS_ESTABLISHED:
+				error = EISCONN;
+				break;
+			case TCPS_LISTEN:
+				error = EOPNOTSUPP;
+				break;
+			default:
+				error = EINVAL;
+				break;
+			}
+		} else {
+			error = proto_tlitosyserr(-error);
+		}
+	}
+
+	/* The former `done:' label had no goto targeting it and was dropped. */
+	squeue_synch_exit(sqp, connp);
+
+	return ((error == 0) ? EINPROGRESS : error);
+}
+
+/*
+ * Socket-module entry point that creates a new non-STREAMS TCP endpoint.
+ *
+ * Only SOCK_STREAM sockets of family AF_INET/AF_INET6 with protocol 0 or
+ * IPPROTO_TCP are accepted; anything else fails with EPROTONOSUPPORT.
+ * On success the downcall vector and socket mode flags are returned through
+ * `sock_downcalls' and `smodep', `*errorp' is set to 0, and the new conn_t
+ * is returned as the lower handle. On failure NULL is returned and
+ * `*errorp' holds the error.
+ */
+/* ARGSUSED */
+sock_lower_handle_t
+tcp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
+    uint_t *smodep, int *errorp, int flags, cred_t *credp)
+{
+	conn_t *connp;
+	boolean_t isv6 = family == AF_INET6;
+
+	/* errorp is written on every path, so check it before first use. */
+	ASSERT(errorp != NULL);
+
+	if (type != SOCK_STREAM || (family != AF_INET && family != AF_INET6) ||
+	    (proto != 0 && proto != IPPROTO_TCP)) {
+		*errorp = EPROTONOSUPPORT;
+		return (NULL);
+	}
+
+	connp = tcp_create_common(NULL, credp, isv6, B_TRUE, errorp);
+	if (connp == NULL) {
+		return (NULL);
+	}
+
+	/*
+	 * Put the ref for TCP. Ref for IP was already put
+	 * by ipcl_conn_create. Also Make the conn_t globally
+	 * visible to walkers
+	 */
+	mutex_enter(&connp->conn_lock);
+	CONN_INC_REF_LOCKED(connp);
+	ASSERT(connp->conn_ref == 2);
+	connp->conn_state_flags &= ~CONN_INCIPIENT;
+
+	/* Mark the endpoint as a direct (non-STREAMS) socket. */
+	connp->conn_flags |= IPCL_NONSTR;
+	mutex_exit(&connp->conn_lock);
+
+	*errorp = 0;
+	*sock_downcalls = &sock_tcp_downcalls;
+	*smodep = SM_CONNREQUIRED | SM_EXDATA | SM_ACCEPTSUPP;
+
+	return ((sock_lower_handle_t)connp);
+}
+
+/*
+ * Socket downcall that attaches the upper (socket) layer to this endpoint:
+ * the upcall vector and upper handle are recorded on the conn_t, and the
+ * socket is told TCP's initial protocol properties through the
+ * su_set_proto_props upcall.
+ */
+/* ARGSUSED */
+void
+tcp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle,
+    sock_upcalls_t *sock_upcalls, int flags, cred_t *cr)
+{
+	conn_t *connp = (conn_t *)proto_handle;
+	struct sock_proto_props props;
+
+	/* Remember how to reach the socket layer. */
+	connp->conn_upper_handle = sock_handle;
+	connp->conn_upcalls = sock_upcalls;
+
+	/* Declare which of the fields below carry valid values. */
+	props.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT |
+	    SOCKOPT_MAXPSZ | SOCKOPT_MAXBLK | SOCKOPT_RCVTIMER |
+	    SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ;
+
+	props.sopp_rxhiwat = SOCKET_RECVHIWATER;
+	props.sopp_rxlowat = SOCKET_RECVLOWATER;
+	props.sopp_rcvthresh = SOCKET_RECVHIWATER >> 3;
+	props.sopp_rcvtimer = SOCKET_TIMER_INTERVAL;
+	props.sopp_maxblk = INFPSZ;
+	props.sopp_maxpsz = INFPSZ;
+	props.sopp_maxaddrlen = sizeof (sin6_t);
+
+	/* A module minimum packet size of 1 is reported as 0. */
+	if (tcp_rinfo.mi_minpsz == 1)
+		props.sopp_minpsz = 0;
+	else
+		props.sopp_minpsz = tcp_rinfo.mi_minpsz;
+
+	(*sock_upcalls->su_set_proto_props)(sock_handle, &props);
+}
+
+/*
+ * Socket downcall for close(): runs the common TCP close processing,
+ * closes the IP helper stream, and drops one reference on the conn_t.
+ * Always returns 0.
+ */
+/* ARGSUSED */
+int
+tcp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
+{
+ conn_t *connp = (conn_t *)proto_handle;
+
+ tcp_close_common(connp, flags);
+
+ ip_close_helper_stream(connp);
+
+ /*
+ * Drop IP's reference on the conn. This is the last reference
+ * on the connp if the state was less than established. If the
+ * connection has gone into timewait state, then we will have
+ * one ref for the TCP and one more ref (total of two) for the
+ * classifier connected hash list (a timewait connections stays
+ * in connected hash till closed).
+ *
+ * We can't assert the references because there might be other
+ * transient reference places because of some walkers or queued
+ * packets in squeue for the timewait state.
+ */
+ CONN_DEC_REF(connp);
+ return (0);
+}
+
+/*
+ * Socket downcall for sending data on a connected endpoint.
+ *
+ * `mp' is the data to send and is consumed on every return path: it is
+ * either queued to the connection's squeue or freed. `msg' supplies
+ * the send flags (MSG_OOB selects the urgent-data path); any address in it
+ * is ignored and ancillary data is rejected.
+ *
+ * Returns 0 once the data has been queued, EOPNOTSUPP if control data was
+ * supplied, ENOTCONN if the connection is not yet established, or EPIPE if
+ * the state is past TCPS_CLOSE_WAIT.
+ */
+/* ARGSUSED */
+int
+tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
+    cred_t *cr)
+{
+	tcp_t *tcp;
+	uint32_t msize;
+	conn_t *connp = (conn_t *)proto_handle;
+	int32_t tcpstate;
+
+	ASSERT(connp->conn_ref >= 2);
+
+	/*
+	 * Ancillary data is not supported. Free the message here, as the
+	 * other error paths do, so that it is not leaked.
+	 */
+	if (msg != NULL && msg->msg_controllen != 0) {
+		freemsg(mp);
+		return (EOPNOTSUPP);
+	}
+
+	switch (DB_TYPE(mp)) {
+	case M_DATA:
+		tcp = connp->conn_tcp;
+		ASSERT(tcp != NULL);
+
+		tcpstate = tcp->tcp_state;
+		if (tcpstate < TCPS_ESTABLISHED) {
+			freemsg(mp);
+			return (ENOTCONN);
+		} else if (tcpstate > TCPS_CLOSE_WAIT) {
+			freemsg(mp);
+			return (EPIPE);
+		}
+
+		if (is_system_labeled())
+			msg_setcredpid(mp, cr, curproc->p_pid);
+
+		/* XXX pass the size down and to the squeue */
+		msize = msgdsize(mp);
+
+		/* Account for bytes that are queued but not yet processed. */
+		mutex_enter(&tcp->tcp_non_sq_lock);
+		tcp->tcp_squeue_bytes += msize;
+		/*
+		 * Squeue Flow Control
+		 */
+		if (TCP_UNSENT_BYTES(tcp) > tcp->tcp_xmit_hiwater) {
+			tcp_setqfull(tcp);
+		}
+		mutex_exit(&tcp->tcp_non_sq_lock);
+
+		/*
+		 * The application may pass in an address in the msghdr, but
+		 * we ignore the address on connection-oriented sockets.
+		 * Just like BSD this code does not generate an error for
+		 * TCP (a CONNREQUIRED socket) when sending to an address
+		 * passed in with sendto/sendmsg. Instead the data is
+		 * delivered on the connection as if no address had been
+		 * supplied.
+		 */
+		CONN_INC_REF(connp);
+
+		if (msg != NULL && (msg->msg_flags & MSG_OOB)) {
+			SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
+			    tcp_output_urgent, connp, tcp_squeue_flag,
+			    SQTAG_TCP_OUTPUT);
+		} else {
+			SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output,
+			    connp, tcp_squeue_flag, SQTAG_TCP_OUTPUT);
+		}
+
+		return (0);
+
+	default:
+		/* Only M_DATA is expected here. */
+		ASSERT(0);
+	}
+
+	freemsg(mp);
+	return (0);
+}
+
+/*
+ * Squeue handler for MSG_OOB sends queued by tcp_sendmsg(). Records the
+ * urgent pointer for the data in `mp' and passes the data on for
+ * transmission (or to the fused loopback path). An empty message is
+ * simply freed.
+ */
+/* ARGSUSED */
+void
+tcp_output_urgent(void *arg, mblk_t *mp, void *arg2)
+{
+ int len;
+ uint32_t msize;
+ conn_t *connp = (conn_t *)arg;
+ tcp_t *tcp = connp->conn_tcp;
+
+ msize = msgdsize(mp);
+
+ /* Offset of the last byte of this message; negative when it is empty. */
+ len = msize - 1;
+ if (len < 0) {
+ freemsg(mp);
+ return;
+ }
+
+ /*
+ * Try to force urgent data out on the wire.
+ * Even if we have unsent data this will
+ * at least send the urgent flag.
+ * XXX does not handle more flag correctly.
+ */
+ /* Urgent pointer = snxt + bytes still unsent + offset in this message. */
+ len += tcp->tcp_unsent;
+ len += tcp->tcp_snxt;
+ tcp->tcp_urg = len;
+ tcp->tcp_valid_bits |= TCP_URG_VALID;
+
+ /* Bypass tcp protocol for fused tcp loopback */
+ if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize))
+ return;
+ tcp_wput_data(tcp, mp, B_TRUE);
+}
+
+/*
+ * Socket downcall for getpeername().
+ *
+ * Fills `addr' with the remote address and port of the connection and sets
+ * `*addrlen' to the size of the sockaddr written (sockaddr_in for AF_INET
+ * endpoints, sockaddr_in6 for AF_INET6 endpoints).
+ *
+ * Returns 0 on success, ENOTCONN if the state is below TCPS_SYN_RCVD, or
+ * EINVAL if the caller's buffer (`*addrlen' on entry) is too small. The
+ * buffer is not written to unless it is large enough.
+ */
+/* ARGSUSED */
+int
+tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr,
+    socklen_t *addrlen, cred_t *cr)
+{
+	sin_t *sin;
+	sin6_t *sin6;
+	conn_t *connp = (conn_t *)proto_handle;
+	tcp_t *tcp = connp->conn_tcp;
+
+	ASSERT(tcp != NULL);
+	if (tcp->tcp_state < TCPS_SYN_RCVD)
+		return (ENOTCONN);
+
+	switch (tcp->tcp_family) {
+	case AF_INET:
+		/* Validate the buffer size before storing anything in it. */
+		if (*addrlen < sizeof (sin_t))
+			return (EINVAL);
+
+		sin = (sin_t *)addr;
+		*sin = sin_null;
+		sin->sin_family = AF_INET;
+		if (tcp->tcp_ipversion == IPV4_VERSION) {
+			IN6_V4MAPPED_TO_IPADDR(&tcp->tcp_remote_v6,
+			    sin->sin_addr.s_addr);
+		}
+		sin->sin_port = tcp->tcp_fport;
+		*addrlen = sizeof (struct sockaddr_in);
+		break;
+	case AF_INET6:
+		/*
+		 * The size check must come before the structure
+		 * assignment below; otherwise a short caller buffer
+		 * would be overrun.
+		 */
+		if (*addrlen < sizeof (struct sockaddr_in6))
+			return (EINVAL);
+
+		sin6 = (sin6_t *)addr;
+		*sin6 = sin6_null;
+		sin6->sin6_family = AF_INET6;
+
+		if (tcp->tcp_ipversion == IPV6_VERSION) {
+			sin6->sin6_flowinfo = tcp->tcp_ip6h->ip6_vcf &
+			    ~IPV6_VERS_AND_FLOW_MASK;
+		}
+
+		sin6->sin6_addr = tcp->tcp_remote_v6;
+		sin6->sin6_port = tcp->tcp_fport;
+		*addrlen = sizeof (struct sockaddr_in6);
+		break;
+	}
+	return (0);
+}
+
+/*
+ * Socket downcall for getsockname().
+ *
+ * Writes a zeroed sockaddr of the endpoint's family into `addr' and sets
+ * `*addrlenp' to its size. The local address and port are filled in only
+ * once the endpoint is at least TCPS_BOUND. Returns EINVAL when the
+ * caller's buffer is too small, 0 otherwise.
+ */
+/* ARGSUSED */
+int
+tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr,
+    socklen_t *addrlenp, cred_t *cr)
+{
+	conn_t *connp = (conn_t *)proto_handle;
+	tcp_t *tcp = connp->conn_tcp;
+
+	if (tcp->tcp_family == AF_INET) {
+		sin_t *v4 = (sin_t *)addr;
+
+		ASSERT(tcp->tcp_ipversion == IPV4_VERSION);
+		if (*addrlenp < sizeof (sin_t))
+			return (EINVAL);
+
+		*v4 = sin_null;
+		v4->sin_family = AF_INET;
+		*addrlenp = sizeof (sin_t);
+
+		/* Nothing more to report until the endpoint is bound. */
+		if (tcp->tcp_state < TCPS_BOUND)
+			return (0);
+
+		v4->sin_addr.s_addr = tcp->tcp_ipha->ipha_src;
+		v4->sin_port = tcp->tcp_lport;
+		return (0);
+	}
+
+	if (tcp->tcp_family == AF_INET6) {
+		sin6_t *v6 = (sin6_t *)addr;
+
+		if (*addrlenp < sizeof (sin6_t))
+			return (EINVAL);
+
+		*v6 = sin6_null;
+		v6->sin6_family = AF_INET6;
+		*addrlenp = sizeof (sin6_t);
+
+		if (tcp->tcp_state < TCPS_BOUND)
+			return (0);
+
+		v6->sin6_port = tcp->tcp_lport;
+		if (tcp->tcp_ipversion != IPV4_VERSION) {
+			v6->sin6_addr = tcp->tcp_ip6h->ip6_src;
+		} else {
+			/* IPv4 traffic on a v6 socket: report a mapped address. */
+			IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src,
+			    &v6->sin6_addr);
+		}
+		return (0);
+	}
+
+	return (0);
+}
+
+/*
+ * tcp_fallback
+ *
+ * A direct socket is falling back to using STREAMS. Hanging
+ * off of the queue is a temporary tcp_t, which was created using
+ * tcp_open(). The tcp_open() was called as part of the regular
+ * sockfs create path, i.e., the SO_SOCKSTR flag is passed down,
+ * and therefore the temporary tcp_t is marked to be a socket
+ * (i.e., IPCL_SOCKET, tcp_issocket). So the optimizations
+ * introduced by FireEngine will be used.
+ *
+ * The tcp_t associated with the socket falling back will
+ * still be marked as a socket, although the direct socket flag
+ * (IPCL_NONSTR) is removed. A fall back to true TPI semantics
+ * will not take place until a _SIOCSOCKFALLBACK ioctl is issued.
+ *
+ * If the above mentioned behavior, i.e., the tmp tcp_t is created
+ * as a STREAMS/TPI endpoint, then we will need to do more work here.
+ * Such as inserting the direct socket into the acceptor hash.
+ *
+ * Parameters:
+ *   proto_handle   the conn_t of the socket that is falling back
+ *   q              queue of the stream the endpoint is attached to
+ *   direct_sockfs  B_FALSE disables direct sockfs on the tcp_t
+ *   quiesced_cb    called once the protocol state has been collected, so
+ *                  the socket can move its data to the stream head
+ *
+ * All mblks are allocated up front with allocb_wait(); if the squeue
+ * cannot be entered they are freed and the function returns without
+ * changing the endpoint.
+ */
+void
+tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
+    boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb)
+{
+	tcp_t *tcp, *eager;
+	conn_t *connp = (conn_t *)proto_handle;
+	int error;
+	struct T_capability_ack tca;
+	struct sockaddr_in6 laddr, faddr;
+	socklen_t laddrlen, faddrlen;
+	short opts;
+	struct stroptions *stropt;
+	mblk_t *stropt_mp;
+	mblk_t *mp;
+	mblk_t *conn_ind_head = NULL;
+	mblk_t *conn_ind_tail = NULL;
+	mblk_t *ordrel_mp;
+	mblk_t *fused_sigurg_mp;
+
+	tcp = connp->conn_tcp;
+	/*
+	 * No support for acceptor fallback
+	 */
+	ASSERT(q->q_qinfo != &tcp_acceptor_rinit);
+
+	stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, STR_NOSIG, NULL);
+
+	/* Pre-allocate the T_ordrel_ind mblk. */
+	ASSERT(tcp->tcp_ordrel_mp == NULL);
+	ordrel_mp = allocb_wait(sizeof (struct T_ordrel_ind), BPRI_HI,
+	    STR_NOSIG, NULL);
+	ordrel_mp->b_datap->db_type = M_PROTO;
+	((struct T_ordrel_ind *)ordrel_mp->b_rptr)->PRIM_type = T_ORDREL_IND;
+	ordrel_mp->b_wptr += sizeof (struct T_ordrel_ind);
+
+	/* Pre-allocate the M_PCSIG anyway */
+	fused_sigurg_mp = allocb_wait(1, BPRI_HI, STR_NOSIG, NULL);
+
+	/*
+	 * Enter the squeue so that no new packets can come in
+	 */
+	error = squeue_synch_enter(connp->conn_sqp, connp, 0);
+	if (error != 0) {
+		/* failed to enter, free all the pre-allocated messages. */
+		freeb(stropt_mp);
+		freeb(ordrel_mp);
+		freeb(fused_sigurg_mp);
+		return;
+	}
+
+	/* Disable I/OAT during fallback */
+	tcp->tcp_sodirect = NULL;
+
+	/* Take over the device and minor arena stashed in the queue pair. */
+	connp->conn_dev = (dev_t)RD(q)->q_ptr;
+	connp->conn_minor_arena = WR(q)->q_ptr;
+
+	RD(q)->q_ptr = WR(q)->q_ptr = connp;
+
+	connp->conn_tcp->tcp_rq = connp->conn_rq = RD(q);
+	connp->conn_tcp->tcp_wq = connp->conn_wq = WR(q);
+
+	WR(q)->q_qinfo = &tcp_sock_winit;
+
+	if (!direct_sockfs)
+		tcp_disable_direct_sockfs(tcp);
+
+	/*
+	 * free the helper stream
+	 */
+	ip_close_helper_stream(connp);
+
+	/*
+	 * Notify the STREAM head about options
+	 */
+	DB_TYPE(stropt_mp) = M_SETOPTS;
+	stropt = (struct stroptions *)stropt_mp->b_rptr;
+	stropt_mp->b_wptr += sizeof (struct stroptions);
+	/*
+	 * The buffer comes straight from allocb_wait() and has not been
+	 * initialized, so assign so_flags rather than OR-ing into it.
+	 */
+	stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
+
+	stropt->so_wroff = tcp->tcp_hdr_len + (tcp->tcp_loopback ? 0 :
+	    tcp->tcp_tcps->tcps_wroff_xtra);
+	if (tcp->tcp_snd_sack_ok)
+		stropt->so_wroff += TCPOPT_MAX_SACK_LEN;
+	stropt->so_hiwat = tcp->tcp_fused ?
+	    tcp_fuse_set_rcv_hiwat(tcp, tcp->tcp_recv_hiwater) :
+	    MAX(tcp->tcp_recv_hiwater, tcp->tcp_tcps->tcps_sth_rcv_hiwat);
+	stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE);
+
+	putnext(RD(q), stropt_mp);
+
+	/*
+	 * Collect the information needed to sync with the sonode
+	 */
+	tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID);
+
+	laddrlen = faddrlen = sizeof (sin6_t);
+	(void) tcp_getsockname(proto_handle, (struct sockaddr *)&laddr,
+	    &laddrlen, CRED());
+	error = tcp_getpeername(proto_handle, (struct sockaddr *)&faddr,
+	    &faddrlen, CRED());
+	/* No peer (e.g. not connected): report an empty foreign address. */
+	if (error != 0)
+		faddrlen = 0;
+
+	opts = 0;
+	if (tcp->tcp_oobinline)
+		opts |= SO_OOBINLINE;
+	if (tcp->tcp_dontroute)
+		opts |= SO_DONTROUTE;
+
+	/*
+	 * Notify the socket that the protocol is now quiescent,
+	 * and it's therefore safe move data from the socket
+	 * to the stream head.
+	 */
+	(*quiesced_cb)(connp->conn_upper_handle, q, &tca,
+	    (struct sockaddr *)&laddr, laddrlen,
+	    (struct sockaddr *)&faddr, faddrlen, opts);
+
+	/* Push any data still held on the TCP receive list upstream. */
+	while ((mp = tcp->tcp_rcv_list) != NULL) {
+		tcp->tcp_rcv_list = mp->b_next;
+		mp->b_next = NULL;
+		putnext(q, mp);
+	}
+	tcp->tcp_rcv_last_head = NULL;
+	tcp->tcp_rcv_last_tail = NULL;
+	tcp->tcp_rcv_cnt = 0;
+
+	/*
+	 * No longer a direct socket
+	 */
+	connp->conn_flags &= ~IPCL_NONSTR;
+
+	tcp->tcp_ordrel_mp = ordrel_mp;
+
+	/* The M_PCSIG mblk is only kept for fused endpoints. */
+	if (tcp->tcp_fused) {
+		ASSERT(tcp->tcp_fused_sigurg_mp == NULL);
+		tcp->tcp_fused_sigurg_mp = fused_sigurg_mp;
+	} else {
+		freeb(fused_sigurg_mp);
+	}
+
+	/*
+	 * Send T_CONN_IND messages for all ESTABLISHED connections.
+	 */
+	mutex_enter(&tcp->tcp_eager_lock);
+	for (eager = tcp->tcp_eager_next_q; eager != NULL;
+	    eager = eager->tcp_eager_next_q) {
+		mp = eager->tcp_conn.tcp_eager_conn_ind;
+
+		eager->tcp_conn.tcp_eager_conn_ind = NULL;
+		ASSERT(mp != NULL);
+		/*
+		 * TLI/XTI applications will get confused by
+		 * sending eager as an option since it violates
+		 * the option semantics. So remove the eager as
+		 * option since TLI/XTI app doesn't need it anyway.
+		 */
+		if (!TCP_IS_SOCKET(tcp)) {
+			struct T_conn_ind *conn_ind;
+
+			conn_ind = (struct T_conn_ind *)mp->b_rptr;
+			conn_ind->OPT_length = 0;
+			conn_ind->OPT_offset = 0;
+		}
+		/* Chain the indications; they are sent after the lock drops. */
+		if (conn_ind_head == NULL) {
+			conn_ind_head = mp;
+		} else {
+			conn_ind_tail->b_next = mp;
+		}
+		conn_ind_tail = mp;
+	}
+	mutex_exit(&tcp->tcp_eager_lock);
+
+	mp = conn_ind_head;
+	while (mp != NULL) {
+		mblk_t *nmp = mp->b_next;
+		mp->b_next = NULL;
+
+		putnext(tcp->tcp_rq, mp);
+		mp = nmp;
+	}
+
+	/*
+	 * There should be atleast two ref's (IP + TCP)
+	 */
+	ASSERT(connp->conn_ref >= 2);
+	squeue_synch_exit(connp->conn_sqp, connp);
+}
+
+/*
+ * Squeue handler queued by tcp_shutdown() to close the send side.
+ * `mp' carries no data and is only the vehicle for the squeue entry, so it
+ * is freed immediately. A fused endpoint is unfused before
+ * tcp_xmit_end() is called.
+ */
+/* ARGSUSED */
+static void
+tcp_shutdown_output(void *arg, mblk_t *mp, void *arg2)
+{
+ conn_t *connp = (conn_t *)arg;
+ tcp_t *tcp = connp->conn_tcp;
+
+ freemsg(mp);
+
+ if (tcp->tcp_fused)
+ tcp_unfuse(tcp);
+
+ /* A failure here is only logged (when debugging is on), not reported. */
+ if (tcp_xmit_end(tcp) != 0) {
+ /*
+ * We were crossing FINs and got a reset from
+ * the other side. Just ignore it.
+ */
+ if (tcp->tcp_debug) {
+ (void) strlog(TCP_MOD_ID, 0, 1,
+ SL_ERROR|SL_TRACE,
+ "tcp_shutdown_output() out of state %s",
+ tcp_display(tcp, NULL, DISP_ADDR_AND_PORT));
+ }
+ }
+}
+
+/*
+ * Socket downcall for shutdown().
+ *
+ * Returns ENOTCONN if the state is below TCPS_SYN_SENT, otherwise 0.
+ * Unless `how' is SHUT_RD, tcp_shutdown_output() is queued on the squeue
+ * and the socket is told the send side is shut. Unless `how' is SHUT_WR,
+ * the socket is told the receive side is shut.
+ */
+/* ARGSUSED */
+int
+tcp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
+{
+ conn_t *connp = (conn_t *)proto_handle;
+ tcp_t *tcp = connp->conn_tcp;
+
+ /*
+ * X/Open requires that we check the connected state.
+ */
+ if (tcp->tcp_state < TCPS_SYN_SENT)
+ return (ENOTCONN);
+
+ /* shutdown the send side */
+ if (how != SHUT_RD) {
+ mblk_t *bp;
+
+ /* Zero-length mblk used only to carry the squeue request. */
+ bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL);
+ /* Reference taken for the queued squeue entry. */
+ CONN_INC_REF(connp);
+ SQUEUE_ENTER_ONE(connp->conn_sqp, bp, tcp_shutdown_output,
+ connp, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT);
+
+ (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
+ SOCK_OPCTL_SHUT_SEND, 0);
+ }
+
+ /* shutdown the recv side */
+ if (how != SHUT_WR)
+ (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
+ SOCK_OPCTL_SHUT_RECV, 0);
+
+ return (0);
+}
+
+/*
+ * SOP_LISTEN() calls into tcp_listen().
+ *
+ * Runs tcp_do_listen() from inside the connection's squeue. On success
+ * the socket layer is told to enable accept with the given backlog.
+ * Returns a UNIX errno (0 on success); negative TLI errors from
+ * tcp_do_listen() are translated, with -TOUTSTATE mapped to EINVAL.
+ *
+ * NOTE(review): a failed squeue entry is reported as ENOBUFS here while
+ * tcp_bind() and tcp_connect() report ENOSR -- confirm this is intended.
+ */
+/* ARGSUSED */
+int
+tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
+{
+ conn_t *connp = (conn_t *)proto_handle;
+ int error;
+ squeue_t *sqp = connp->conn_sqp;
+
+ error = squeue_synch_enter(sqp, connp, 0);
+ if (error != 0) {
+ /* failed to enter */
+ return (ENOBUFS);
+ }
+
+ error = tcp_do_listen(connp, backlog, cr);
+ if (error == 0) {
+ /* Upcall is made while still inside the squeue. */
+ (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
+ SOCK_OPCTL_ENAB_ACCEPT, (uintptr_t)backlog);
+ } else if (error < 0) {
+ if (error == -TOUTSTATE)
+ error = EINVAL;
+ else
+ error = proto_tlitosyserr(-error);
+ }
+ squeue_synch_exit(sqp, connp);
+ return (error);
+}
+
+/*
+ * Put `connp' into the listening state with the given backlog.
+ *
+ * An endpoint that is not yet bound is first bound implicitly to the
+ * wildcard address of its family with any port. An endpoint that is
+ * already TCPS_BOUND or TCPS_LISTEN is accepted only when `backlog' is
+ * positive; any other bound-or-later state yields -TOUTSTATE.
+ *
+ * Returns 0, a positive UNIX errno, or a negative TLI error; the final
+ * value comes from tcp_post_ip_bind().
+ */
+static int
+tcp_do_listen(conn_t *connp, int backlog, cred_t *cr)
+{
+ tcp_t *tcp = connp->conn_tcp;
+ sin_t *sin;
+ sin6_t *sin6;
+ int error = 0;
+ tcp_stack_t *tcps = tcp->tcp_tcps;
+
+ if (tcp->tcp_state >= TCPS_BOUND) {
+ if ((tcp->tcp_state == TCPS_BOUND ||
+ tcp->tcp_state == TCPS_LISTEN) &&
+ backlog > 0) {
+ /*
+ * Handle listen() increasing backlog.
+ * This is more "liberal" then what the TPI spec
+ * requires but is needed to avoid a t_unbind
+ * when handling listen() since the port number
+ * might be "stolen" between the unbind and bind.
+ */
+ goto do_listen;
+ }
+ /* The log text says "tcp_bind" although this is the listen path. */
+ if (tcp->tcp_debug) {
+ (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
+ "tcp_bind: bad state, %d", tcp->tcp_state);
+ }
+ return (-TOUTSTATE);
+ } else {
+ int32_t len;
+ sin6_t addr;
+
+ /* Do an implicit bind: Request for a generic port. */
+ if (tcp->tcp_family == AF_INET) {
+ len = sizeof (sin_t);
+ sin = (sin_t *)&addr;
+ *sin = sin_null;
+ sin->sin_family = AF_INET;
+ tcp->tcp_ipversion = IPV4_VERSION;
+ } else {
+ ASSERT(tcp->tcp_family == AF_INET6);
+ len = sizeof (sin6_t);
+ sin6 = (sin6_t *)&addr;
+ *sin6 = sin6_null;
+ sin6->sin6_family = AF_INET6;
+ tcp->tcp_ipversion = IPV6_VERSION;
+ }
+
+ /* B_FALSE: any port will do for the implicit bind. */
+ error = tcp_bind_check(connp, (struct sockaddr *)&addr, len,
+ cr, B_FALSE);
+ if (error)
+ return (error);
+ /* Fall through and do the fanout insertion */
+ }
+
+do_listen:
+ ASSERT(tcp->tcp_state == TCPS_BOUND || tcp->tcp_state == TCPS_LISTEN);
+ tcp->tcp_conn_req_max = backlog;
+ if (tcp->tcp_conn_req_max) {
+ /* Clamp the backlog to the per-stack minimum and maximum. */
+ if (tcp->tcp_conn_req_max < tcps->tcps_conn_req_min)
+ tcp->tcp_conn_req_max = tcps->tcps_conn_req_min;
+ if (tcp->tcp_conn_req_max > tcps->tcps_conn_req_max_q)
+ tcp->tcp_conn_req_max = tcps->tcps_conn_req_max_q;
+ /*
+ * If this is a listener, do not reset the eager list
+ * and other stuffs. Note that we don't check if the
+ * existing eager list meets the new tcp_conn_req_max
+ * requirement.
+ */
+ if (tcp->tcp_state != TCPS_LISTEN) {
+ tcp->tcp_state = TCPS_LISTEN;
+ /* Initialize the chain. Don't need the eager_lock */
+ tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp;
+ tcp->tcp_eager_next_drop_q0 = tcp;
+ tcp->tcp_eager_prev_drop_q0 = tcp;
+ tcp->tcp_second_ctimer_threshold =
+ tcps->tcps_ip_abort_linterval;
+ }
+ }
+
+ /*
+ * We can call ip_bind directly which returns a T_BIND_ACK mp. The
+ * processing continues in tcp_rput_other().
+ *
+ * NOTE(review): the code below calls ip_proto_bind_laddr_v4/v6() and
+ * tcp_post_ip_bind() rather than ip_bind; the paragraph above may be
+ * stale -- confirm.
+ *
+ * We need to make sure that the conn_recv is set to a non-null
+ * value before we insert the conn into the classifier table.
+ * This is to avoid a race with an incoming packet which does an
+ * ipcl_classify().
+ */
+ connp->conn_recv = tcp_conn_request;
+ if (tcp->tcp_family == AF_INET) {
+ error = ip_proto_bind_laddr_v4(connp, NULL,
+ IPPROTO_TCP, tcp->tcp_bound_source, tcp->tcp_lport, B_TRUE);
+ } else {
+ error = ip_proto_bind_laddr_v6(connp, NULL, IPPROTO_TCP,
+ &tcp->tcp_bound_source_v6, tcp->tcp_lport, B_TRUE);
+ }
+ return (tcp_post_ip_bind(tcp, NULL, error));
+}
+
+/*
+ * Socket downcall invoked when receive-side flow control is lifted.
+ * Resets the receive window to tcp_recv_hiwater and, for a connection at
+ * or beyond TCPS_ESTABLISHED, sends an immediate window update (ACK) when
+ * the window the peer knows about would grow by at least one MSS.
+ *
+ * NOTE(review): the result of squeue_synch_enter() is discarded, so the
+ * body and squeue_synch_exit() run even if entering the squeue failed --
+ * other downcalls in this file bail out in that case; confirm.
+ */
+void
+tcp_clr_flowctrl(sock_lower_handle_t proto_handle)
+{
+ conn_t *connp = (conn_t *)proto_handle;
+ tcp_t *tcp = connp->conn_tcp;
+ tcp_stack_t *tcps = tcp->tcp_tcps;
+ uint_t thwin;
+
+ (void) squeue_synch_enter(connp->conn_sqp, connp, 0);
+
+ /* Flow control condition has been removed. */
+ tcp->tcp_rwnd = tcp->tcp_recv_hiwater;
+ /* Window from the template header, scaled, less data received since. */
+ thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win))
+ << tcp->tcp_rcv_ws;
+ thwin -= tcp->tcp_rnxt - tcp->tcp_rack;
+ /*
+ * Send back a window update immediately if TCP is above
+ * ESTABLISHED state and the increase of the rcv window
+ * that the other side knows is at least 1 MSS after flow
+ * control is lifted.
+ */
+ if (tcp->tcp_state >= TCPS_ESTABLISHED &&
+ (tcp->tcp_recv_hiwater - thwin >= tcp->tcp_mss)) {
+ tcp_xmit_ctl(NULL, tcp,
+ (tcp->tcp_swnd == 0) ? tcp->tcp_suna :
+ tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
+ BUMP_MIB(&tcps->tcps_mib, tcpOutWinUpdate);
+ }
+
+ squeue_synch_exit(connp->conn_sqp, connp);
+}
+
+/*
+ * Socket downcall for ioctl(). A fixed set of commands is rejected with
+ * EINVAL; every other command is forwarded to IP through the helper
+ * stream and that call's result is returned.
+ */
+/* ARGSUSED */
+int
+tcp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
+    int mode, int32_t *rvalp, cred_t *cr)
+{
+	conn_t *connp = (conn_t *)proto_handle;
+
+	/* Commands that are refused on this path. */
+	switch (cmd) {
+	case ND_SET:
+	case ND_GET:
+	case TCP_IOC_DEFAULT_Q:
+	case _SIOCSOCKFALLBACK:
+	case TCP_IOC_ABORT_CONN:
+	case TI_GETPEERNAME:
+	case TI_GETMYNAME:
+		ip1dbg(("tcp_ioctl: cmd 0x%x on non sreams socket",
+		    cmd));
+		return (EINVAL);
+	default:
+		break;
+	}
+
+	/*
+	 * Pass on to IP using helper stream
+	 */
+	return (ldi_ioctl(connp->conn_helper_info->ip_helper_stream_handle,
+	    cmd, arg, mode, cr, rvalp));
+}
+
+/*
+ * Downcall vector returned to the socket layer by tcp_create().
+ * The initializer is positional, so the order of entries must match the
+ * member order of sock_downcalls_t. NULL slots are operations for which
+ * no TCP handler is supplied here.
+ */
+sock_downcalls_t sock_tcp_downcalls = {
+ tcp_activate,
+ tcp_accept,
+ tcp_bind,
+ tcp_listen,
+ tcp_connect,
+ tcp_getpeername,
+ tcp_getsockname,
+ tcp_getsockopt,
+ tcp_setsockopt,
+ tcp_sendmsg,
+ NULL,
+ NULL,
+ NULL,
+ tcp_shutdown,
+ tcp_clr_flowctrl,
+ tcp_ioctl,
+ tcp_close,
+};
diff --git a/usr/src/uts/common/inet/tcp/tcp_fusion.c b/usr/src/uts/common/inet/tcp/tcp_fusion.c
index a192c7ad07..15b5d04d61 100644
--- a/usr/src/uts/common/inet/tcp/tcp_fusion.c
+++ b/usr/src/uts/common/inet/tcp/tcp_fusion.c
@@ -261,10 +261,9 @@ tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph)
tcp->tcp_kssl_ent == NULL &&
!IPP_ENABLED(IPP_LOCAL_OUT|IPP_LOCAL_IN, ipst)) {
mblk_t *mp;
- struct stroptions *stropt;
queue_t *peer_rq = peer_tcp->tcp_rq;
- ASSERT(!TCP_IS_DETACHED(peer_tcp) && peer_rq != NULL);
+ ASSERT(!TCP_IS_DETACHED(peer_tcp));
ASSERT(tcp->tcp_fused_sigurg_mp == NULL);
ASSERT(peer_tcp->tcp_fused_sigurg_mp == NULL);
ASSERT(tcp->tcp_kssl_ctx == NULL);
@@ -276,19 +275,25 @@ tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph)
* This is why we pre-allocate the M_PCSIG mblks for both
* endpoints which will only be used during/after unfuse.
*/
- if ((mp = allocb(1, BPRI_HI)) == NULL)
- goto failed;
+ if (!IPCL_IS_NONSTR(tcp->tcp_connp)) {
+ if ((mp = allocb(1, BPRI_HI)) == NULL)
+ goto failed;
- tcp->tcp_fused_sigurg_mp = mp;
+ tcp->tcp_fused_sigurg_mp = mp;
+ }
- if ((mp = allocb(1, BPRI_HI)) == NULL)
- goto failed;
+ if (!IPCL_IS_NONSTR(peer_tcp->tcp_connp)) {
+ if ((mp = allocb(1, BPRI_HI)) == NULL)
+ goto failed;
- peer_tcp->tcp_fused_sigurg_mp = mp;
+ peer_tcp->tcp_fused_sigurg_mp = mp;
+ }
- /* Allocate M_SETOPTS mblk */
- if ((mp = allocb(sizeof (*stropt), BPRI_HI)) == NULL)
+ if (!IPCL_IS_NONSTR(peer_tcp->tcp_connp) &&
+ (mp = allocb(sizeof (struct stroptions),
+ BPRI_HI)) == NULL) {
goto failed;
+ }
/* If either tcp or peer_tcp sodirect enabled then disable */
if (tcp->tcp_sodirect != NULL) {
@@ -329,12 +334,12 @@ tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph)
* us data as soon as fusion is finished, and we need to be
* able to flow control it in case it sends down huge amount
* of data while we're still detached. To prevent that we
- * inherit the listener's q_hiwat value; this is temporary
- * since we'll repeat the process in tcp_accept_finish().
+ * inherit the listener's recv_hiwater value; this is temporary
+ * since we'll repeat the process in tcp_accept_finish().
*/
if (!tcp->tcp_refuse) {
(void) tcp_fuse_set_rcv_hiwat(tcp,
- tcp->tcp_saved_listener->tcp_rq->q_hiwat);
+ tcp->tcp_saved_listener->tcp_recv_hiwater);
/*
* Set the stream head's write offset value to zero
@@ -342,30 +347,53 @@ tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph)
* headers; tell it to not break up the writes (this
* would reduce the amount of work done by kmem); and
* configure our receive buffer. Note that we can only
- * do this for the active connect tcp since our eager
- * is still detached; it will be dealt with later in
+ * do this for the active connect tcp since our eager is
+ * still detached; it will be dealt with later in
* tcp_accept_finish().
*/
- DB_TYPE(mp) = M_SETOPTS;
- mp->b_wptr += sizeof (*stropt);
-
- stropt = (struct stroptions *)mp->b_rptr;
- stropt->so_flags = SO_MAXBLK | SO_WROFF | SO_HIWAT;
- stropt->so_maxblk = tcp_maxpsz_set(peer_tcp, B_FALSE);
- stropt->so_wroff = 0;
-
- /*
- * Record the stream head's high water mark for
- * peer endpoint; this is used for flow-control
- * purposes in tcp_fuse_output().
- */
- stropt->so_hiwat = tcp_fuse_set_rcv_hiwat(peer_tcp,
- peer_rq->q_hiwat);
-
- tcp->tcp_refuse = B_FALSE;
- peer_tcp->tcp_refuse = B_FALSE;
- /* Send the options up */
- putnext(peer_rq, mp);
+ if (!IPCL_IS_NONSTR(peer_tcp->tcp_connp)) {
+ struct stroptions *stropt;
+
+ DB_TYPE(mp) = M_SETOPTS;
+ mp->b_wptr += sizeof (*stropt);
+
+ stropt = (struct stroptions *)mp->b_rptr;
+ stropt->so_flags = SO_MAXBLK|SO_WROFF|SO_HIWAT;
+ stropt->so_maxblk = tcp_maxpsz_set(peer_tcp,
+ B_FALSE);
+ stropt->so_wroff = 0;
+
+ /*
+ * Record the stream head's high water mark for
+ * peer endpoint; this is used for flow-control
+ * purposes in tcp_fuse_output().
+ */
+ stropt->so_hiwat = tcp_fuse_set_rcv_hiwat(
+ peer_tcp, peer_rq->q_hiwat);
+
+ tcp->tcp_refuse = B_FALSE;
+ peer_tcp->tcp_refuse = B_FALSE;
+ /* Send the options up */
+ putnext(peer_rq, mp);
+ } else {
+ struct sock_proto_props sopp;
+
+ /* The peer is a non-STREAMS end point */
+ ASSERT(IPCL_IS_TCP(peer_connp));
+
+ (void) tcp_fuse_set_rcv_hiwat(tcp,
+ tcp->tcp_saved_listener->tcp_recv_hiwater);
+
+ sopp.sopp_flags = SOCKOPT_MAXBLK |
+ SOCKOPT_WROFF | SOCKOPT_RCVHIWAT;
+ sopp.sopp_maxblk = tcp_maxpsz_set(peer_tcp,
+ B_FALSE);
+ sopp.sopp_wroff = 0;
+ sopp.sopp_rxhiwat = tcp_fuse_set_rcv_hiwat(
+ peer_tcp, peer_tcp->tcp_recv_hiwater);
+ (*peer_connp->conn_upcalls->su_set_proto_props)
+ (peer_connp->conn_upper_handle, &sopp);
+ }
}
tcp->tcp_refuse = B_FALSE;
peer_tcp->tcp_refuse = B_FALSE;
@@ -399,8 +427,6 @@ tcp_unfuse(tcp_t *tcp)
ASSERT(peer_tcp->tcp_fused && peer_tcp->tcp_loopback_peer == tcp);
ASSERT(tcp->tcp_connp->conn_sqp == peer_tcp->tcp_connp->conn_sqp);
ASSERT(tcp->tcp_unsent == 0 && peer_tcp->tcp_unsent == 0);
- ASSERT(tcp->tcp_fused_sigurg_mp != NULL);
- ASSERT(peer_tcp->tcp_fused_sigurg_mp != NULL);
/*
* We disable synchronous streams, drain any queued data and
@@ -420,10 +446,16 @@ tcp_unfuse(tcp_t *tcp)
/* Unfuse the endpoints */
peer_tcp->tcp_fused = tcp->tcp_fused = B_FALSE;
peer_tcp->tcp_loopback_peer = tcp->tcp_loopback_peer = NULL;
- freeb(peer_tcp->tcp_fused_sigurg_mp);
- freeb(tcp->tcp_fused_sigurg_mp);
- peer_tcp->tcp_fused_sigurg_mp = NULL;
- tcp->tcp_fused_sigurg_mp = NULL;
+ if (!IPCL_IS_NONSTR(peer_tcp->tcp_connp)) {
+ ASSERT(peer_tcp->tcp_fused_sigurg_mp != NULL);
+ freeb(peer_tcp->tcp_fused_sigurg_mp);
+ peer_tcp->tcp_fused_sigurg_mp = NULL;
+ }
+ if (!IPCL_IS_NONSTR(tcp->tcp_connp)) {
+ ASSERT(tcp->tcp_fused_sigurg_mp != NULL);
+ freeb(tcp->tcp_fused_sigurg_mp);
+ tcp->tcp_fused_sigurg_mp = NULL;
+ }
}
/*
@@ -527,6 +559,7 @@ tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size)
uint_t max_unread;
boolean_t flow_stopped, peer_data_queued = B_FALSE;
boolean_t urgent = (DB_TYPE(mp) != M_DATA);
+ boolean_t push = B_FALSE;
mblk_t *mp1 = mp;
ill_t *ilp, *olp;
ipif_t *iifp, *oifp;
@@ -546,7 +579,6 @@ tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size)
ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_PROTO ||
DB_TYPE(mp) == M_PCPROTO);
-
/* If this connection requires IP, unfuse and use regular path */
if (tcp_loopback_needs_ip(tcp, ns) ||
tcp_loopback_needs_ip(peer_tcp, ns) ||
@@ -749,7 +781,38 @@ tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size)
* Enqueue data into the peer's receive list; we may or may not
* drain the contents depending on the conditions below.
*/
- tcp_rcv_enqueue(peer_tcp, mp, recv_size);
+ if (IPCL_IS_NONSTR(peer_tcp->tcp_connp) &&
+ peer_tcp->tcp_connp->conn_upper_handle != NULL) {
+ int error;
+ int flags = 0;
+
+ if ((tcp->tcp_valid_bits & TCP_URG_VALID) &&
+ (tcp->tcp_urg == tcp->tcp_snxt)) {
+ flags = MSG_OOB;
+ (*peer_tcp->tcp_connp->conn_upcalls->su_signal_oob)
+ (peer_tcp->tcp_connp->conn_upper_handle, 0);
+ tcp->tcp_valid_bits &= ~TCP_URG_VALID;
+ }
+ (*peer_tcp->tcp_connp->conn_upcalls->su_recv)(
+ peer_tcp->tcp_connp->conn_upper_handle, mp, recv_size,
+ flags, &error, &push);
+ } else {
+ if (IPCL_IS_NONSTR(peer_tcp->tcp_connp) &&
+ (tcp->tcp_valid_bits & TCP_URG_VALID) &&
+ (tcp->tcp_urg == tcp->tcp_snxt)) {
+ /*
+ * Cannot deal with urgent data that arrives
+ * before the connection has been accept()ed;
+ * drop the message instead of queueing it.
+ */
+ tcp->tcp_valid_bits &= ~TCP_URG_VALID;
+ freemsg(mp);
+ mutex_exit(&peer_tcp->tcp_non_sq_lock);
+ return (B_TRUE);
+ }
+
+ tcp_rcv_enqueue(peer_tcp, mp, recv_size);
+ }
/* In case it wrapped around and also to keep it constant */
peer_tcp->tcp_rwnd += recv_size;
@@ -797,6 +860,7 @@ tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size)
(peer_tcp->tcp_rcv_cnt >= peer_tcp->tcp_fuse_rcv_hiwater ||
peer_tcp->tcp_fuse_rcv_unread_cnt >= max_unread)) ||
(!peer_tcp->tcp_direct_sockfs && !TCP_IS_DETACHED(peer_tcp) &&
+ !IPCL_IS_NONSTR(peer_tcp->tcp_connp) &&
!canputnext(peer_tcp->tcp_rq))) {
peer_data_queued = B_TRUE;
}
@@ -861,7 +925,8 @@ tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size)
* will pull the data via tcp_fuse_rrw().
*/
if (urgent || (!flow_stopped && !peer_tcp->tcp_direct_sockfs)) {
- ASSERT(peer_tcp->tcp_rcv_list != NULL);
+ ASSERT(IPCL_IS_NONSTR(peer_tcp->tcp_connp) ||
+ peer_tcp->tcp_rcv_list != NULL);
/*
* For TLI-based streams, a thread in tcp_accept_swap()
* can race with us. That thread will ensure that the
@@ -897,6 +962,8 @@ boolean_t
tcp_fuse_rcv_drain(queue_t *q, tcp_t *tcp, mblk_t **sigurg_mpp)
{
mblk_t *mp;
+ conn_t *connp = tcp->tcp_connp;
+
#ifdef DEBUG
uint_t cnt = 0;
#endif
@@ -907,7 +974,7 @@ tcp_fuse_rcv_drain(queue_t *q, tcp_t *tcp, mblk_t **sigurg_mpp)
ASSERT(tcp->tcp_loopback);
ASSERT(tcp->tcp_fused || tcp->tcp_fused_sigurg);
ASSERT(!tcp->tcp_fused || tcp->tcp_loopback_peer != NULL);
- ASSERT(sigurg_mpp != NULL || tcp->tcp_fused);
+ ASSERT(IPCL_IS_NONSTR(connp) || sigurg_mpp != NULL || tcp->tcp_fused);
/* No need for the push timer now, in case it was scheduled */
if (tcp->tcp_push_tid != 0) {
@@ -921,34 +988,41 @@ tcp_fuse_rcv_drain(queue_t *q, tcp_t *tcp, mblk_t **sigurg_mpp)
* works properly.
*/
if (tcp->tcp_fused_sigurg) {
- /*
- * sigurg_mpp is normally NULL, i.e. when we're still
- * fused and didn't get here because of tcp_unfuse().
- * In this case try hard to allocate the M_PCSIG mblk.
- */
- if (sigurg_mpp == NULL &&
- (mp = allocb(1, BPRI_HI)) == NULL &&
- (mp = allocb_tryhard(1)) == NULL) {
- /* Alloc failed; try again next time */
- tcp->tcp_push_tid = TCP_TIMER(tcp, tcp_push_timer,
- MSEC_TO_TICK(tcps->tcps_push_timer_interval));
- return (B_TRUE);
- } else if (sigurg_mpp != NULL) {
+ tcp->tcp_fused_sigurg = B_FALSE;
+ if (IPCL_IS_NONSTR(connp)) {
+ (*connp->conn_upcalls->su_signal_oob)
+ (connp->conn_upper_handle, 0);
+ } else {
/*
- * Use the supplied M_PCSIG mblk; it means we're
- * either unfused or in the process of unfusing,
- * and the drain must happen now.
+ * sigurg_mpp is normally NULL, i.e. when we're still
+ * fused and didn't get here because of tcp_unfuse().
+ * In this case try hard to allocate the M_PCSIG mblk.
*/
- mp = *sigurg_mpp;
- *sigurg_mpp = NULL;
- }
- ASSERT(mp != NULL);
+ if (sigurg_mpp == NULL &&
+ (mp = allocb(1, BPRI_HI)) == NULL &&
+ (mp = allocb_tryhard(1)) == NULL) {
+ /* Alloc failed; try again next time */
+ tcp->tcp_push_tid = TCP_TIMER(tcp,
+ tcp_push_timer,
+ MSEC_TO_TICK(
+ tcps->tcps_push_timer_interval));
+ return (B_TRUE);
+ } else if (sigurg_mpp != NULL) {
+ /*
+ * Use the supplied M_PCSIG mblk; it means we're
+ * either unfused or in the process of unfusing,
+ * and the drain must happen now.
+ */
+ mp = *sigurg_mpp;
+ *sigurg_mpp = NULL;
+ }
+ ASSERT(mp != NULL);
- tcp->tcp_fused_sigurg = B_FALSE;
- /* Send up the signal */
- DB_TYPE(mp) = M_PCSIG;
- *mp->b_wptr++ = (uchar_t)SIGURG;
- putnext(q, mp);
+ /* Send up the signal */
+ DB_TYPE(mp) = M_PCSIG;
+ *mp->b_wptr++ = (uchar_t)SIGURG;
+ putnext(q, mp);
+ }
/*
* Let the regular tcp_rcv_drain() path handle
* draining the data if we're no longer fused.
@@ -980,6 +1054,7 @@ tcp_fuse_rcv_drain(queue_t *q, tcp_t *tcp, mblk_t **sigurg_mpp)
#ifdef DEBUG
cnt += msgdsize(mp);
#endif
+ ASSERT(!IPCL_IS_NONSTR(connp));
if (sd_rd_eof) {
freemsg(mp);
} else {
@@ -991,12 +1066,14 @@ tcp_fuse_rcv_drain(queue_t *q, tcp_t *tcp, mblk_t **sigurg_mpp)
if (tcp->tcp_direct_sockfs && !sd_rd_eof)
(void) strrput_sig(q, B_TRUE);
+#ifdef DEBUG
ASSERT(cnt == tcp->tcp_rcv_cnt);
+#endif
tcp->tcp_rcv_last_head = NULL;
tcp->tcp_rcv_last_tail = NULL;
tcp->tcp_rcv_cnt = 0;
tcp->tcp_fuse_rcv_unread_cnt = 0;
- tcp->tcp_rwnd = q->q_hiwat;
+ tcp->tcp_rwnd = tcp->tcp_recv_hiwater;
if (peer_tcp->tcp_flow_stopped && (TCP_UNSENT_BYTES(peer_tcp) <=
peer_tcp->tcp_xmit_lowater)) {
@@ -1409,8 +1486,10 @@ tcp_fuse_disable_pair(tcp_t *tcp, boolean_t unfusing)
}
/* Disable synchronous streams */
- tcp_fuse_syncstr_disable(tcp);
- tcp_fuse_syncstr_disable(peer_tcp);
+ if (!IPCL_IS_NONSTR(tcp->tcp_connp))
+ tcp_fuse_syncstr_disable(tcp);
+ if (!IPCL_IS_NONSTR(peer_tcp->tcp_connp))
+ tcp_fuse_syncstr_disable(peer_tcp);
}
/*
diff --git a/usr/src/uts/common/inet/tcp/tcp_opt_data.c b/usr/src/uts/common/inet/tcp/tcp_opt_data.c
index 4f0d767774..d977c27e53 100644
--- a/usr/src/uts/common/inet/tcp/tcp_opt_data.c
+++ b/usr/src/uts/common/inet/tcp/tcp_opt_data.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/stream.h>
#define _SUN_TPI_VERSION 2
@@ -43,8 +41,8 @@
extern int tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr);
-extern int tcp_opt_get(queue_t *q, int level, int name, uchar_t *ptr);
-extern int tcp_opt_set(queue_t *q, uint_t optset_context, int level,
+extern int tcp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr);
+extern int tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level,
int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
void *thisdg_attrs, cred_t *cr, mblk_t *mblk);
@@ -125,10 +123,10 @@ opdes_t tcp_opt_arr[] = {
{ IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
(OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT),
- 40, -1 /* not initialized */ },
+ IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
{ T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
(OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT),
- 40, -1 /* not initialized */ },
+ IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
{ IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
{ T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
@@ -244,8 +242,8 @@ uint_t tcp_max_optsize; /* initialized when TCP driver is loaded */
optdb_obj_t tcp_opt_obj = {
tcp_opt_default, /* TCP default value function pointer */
- tcp_opt_get, /* TCP get function pointer */
- tcp_opt_set, /* TCP set function pointer */
+ tcp_tpi_opt_get, /* TCP get function pointer */
+ tcp_tpi_opt_set, /* TCP set function pointer */
B_TRUE, /* TCP is tpi provider */
TCP_OPT_ARR_CNT, /* TCP option database count of entries */
tcp_opt_arr, /* TCP option database */
diff --git a/usr/src/uts/common/inet/tcp/tcpddi.c b/usr/src/uts/common/inet/tcp/tcpddi.c
index ee5b0181b6..91da903826 100644
--- a/usr/src/uts/common/inet/tcp/tcpddi.c
+++ b/usr/src/uts/common/inet/tcp/tcpddi.c
@@ -29,12 +29,18 @@
#include <sys/modctl.h>
#include <inet/common.h>
#include <inet/ip.h>
+#include <inet/tcp_impl.h>
+#include <sys/strsubr.h>
+#include <sys/socketvar.h>
#define INET_NAME "tcp"
#define INET_MODSTRTAB dummymodinfo
#define INET_DEVSTRTAB tcpinfov4
#define INET_DEVDESC "TCP STREAMS driver"
#define INET_MODDESC "TCP dummy STREAMS module"
+#define INET_SOCKDESC "TCP socket module"
+#define INET_SOCK_PROTO_CREATE_FUNC (*tcp_create)
+#define INET_SOCK_PROTO_FB_FUNC (*tcp_fallback)
#define INET_DEVMINOR 0
#define INET_MODMTFLAGS D_MP
/*
diff --git a/usr/src/uts/common/inet/tcp_impl.h b/usr/src/uts/common/inet/tcp_impl.h
index 98d8d17f61..97374be482 100644
--- a/usr/src/uts/common/inet/tcp_impl.h
+++ b/usr/src/uts/common/inet/tcp_impl.h
@@ -39,6 +39,7 @@ extern "C" {
#ifdef _KERNEL
+#include <inet/optcom.h>
#include <inet/tcp.h>
#define TCP_MOD_ID 5105
@@ -274,6 +275,14 @@ extern int tcp_fuse_maxpsz_set(tcp_t *);
extern optdb_obj_t tcp_opt_obj;
extern uint_t tcp_max_optsize;
+extern sock_lower_handle_t tcp_create(int, int, int, sock_downcalls_t **,
+ uint_t *, int *, int, cred_t *);
+extern void tcp_fallback(sock_lower_handle_t, queue_t *, boolean_t,
+ so_proto_quiesced_cb_t);
+
+extern sock_downcalls_t sock_tcp_downcalls;
+
+
#endif /* _KERNEL */
#ifdef __cplusplus
diff --git a/usr/src/uts/common/inet/tcp_stack.h b/usr/src/uts/common/inet/tcp_stack.h
index 43da079d5a..173875f0da 100644
--- a/usr/src/uts/common/inet/tcp_stack.h
+++ b/usr/src/uts/common/inet/tcp_stack.h
@@ -30,6 +30,8 @@
#include <sys/netstack.h>
#include <inet/ip.h>
#include <inet/ipdrop.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
#ifdef __cplusplus
extern "C" {
@@ -232,6 +234,7 @@ struct tcp_stack {
uint32_t tcps_rst_cnt;
/* The number of RST not sent because of the rate limit. */
uint32_t tcps_rst_unsent;
+ ldi_ident_t tcps_ldi_ident;
};
typedef struct tcp_stack tcp_stack_t;
diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c
index 70677c86d8..5f819f1285 100644
--- a/usr/src/uts/common/inet/udp/udp.c
+++ b/usr/src/uts/common/inet/udp/udp.c
@@ -40,13 +40,13 @@
#include <sys/strsubr.h>
#include <sys/suntpi.h>
#include <sys/xti_inet.h>
-#include <sys/cmn_err.h>
#include <sys/kmem.h>
#include <sys/policy.h>
#include <sys/ucred.h>
#include <sys/zone.h>
#include <sys/socket.h>
+#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/vtrace.h>
#include <sys/sdt.h>
@@ -68,7 +68,7 @@
#include <inet/ip_if.h>
#include <inet/ip_multi.h>
#include <inet/ip_ndp.h>
-#include <inet/mi.h>
+#include <inet/proto_set.h>
#include <inet/mib2.h>
#include <inet/nd.h>
#include <inet/optcom.h>
@@ -150,17 +150,14 @@ typedef struct udpattrs_s {
} udpattrs_t;
static void udp_addr_req(queue_t *q, mblk_t *mp);
-static void udp_bind(queue_t *q, mblk_t *mp);
+static void udp_tpi_bind(queue_t *q, mblk_t *mp);
static void udp_bind_hash_insert(udp_fanout_t *uf, udp_t *udp);
static void udp_bind_hash_remove(udp_t *udp, boolean_t caller_holds_lock);
-static void udp_bind_result(conn_t *, mblk_t *);
-static void udp_bind_ack(conn_t *, mblk_t *mp);
-static void udp_bind_error(conn_t *, mblk_t *mp);
static int udp_build_hdrs(udp_t *udp);
static void udp_capability_req(queue_t *q, mblk_t *mp);
-static int udp_close(queue_t *q);
-static void udp_connect(queue_t *q, mblk_t *mp);
-static void udp_disconnect(queue_t *q, mblk_t *mp);
+static int udp_tpi_close(queue_t *q, int flags);
+static void udp_tpi_connect(queue_t *q, mblk_t *mp);
+static void udp_tpi_disconnect(queue_t *q, mblk_t *mp);
static void udp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error,
int sys_error);
static void udp_err_ack_prim(queue_t *q, mblk_t *mp, int primitive,
@@ -171,8 +168,8 @@ static int udp_extra_priv_ports_add(queue_t *q, mblk_t *mp,
char *value, caddr_t cp, cred_t *cr);
static int udp_extra_priv_ports_del(queue_t *q, mblk_t *mp,
char *value, caddr_t cp, cred_t *cr);
-static void udp_icmp_error(queue_t *q, mblk_t *mp);
-static void udp_icmp_error_ipv6(queue_t *q, mblk_t *mp);
+static void udp_icmp_error(conn_t *, mblk_t *);
+static void udp_icmp_error_ipv6(conn_t *, mblk_t *);
static void udp_info_req(queue_t *q, mblk_t *mp);
static void udp_input(void *, mblk_t *, void *);
static mblk_t *udp_ip_bind_mp(udp_t *udp, t_scalar_t bind_prim,
@@ -201,15 +198,16 @@ static void udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp,
ipha_t *ipha);
static void udp_ud_err(queue_t *q, mblk_t *mp, uchar_t *destaddr,
t_scalar_t destlen, t_scalar_t err);
-static void udp_unbind(queue_t *q, mblk_t *mp);
+static void udp_tpi_unbind(queue_t *q, mblk_t *mp);
static in_port_t udp_update_next_port(udp_t *udp, in_port_t port,
boolean_t random);
static mblk_t *udp_output_v4(conn_t *, mblk_t *, ipaddr_t, uint16_t, uint_t,
- int *, boolean_t);
+ int *, boolean_t, struct nmsghdr *, cred_t *, pid_t);
static mblk_t *udp_output_v6(conn_t *connp, mblk_t *mp, sin6_t *sin6,
- int *error);
+ int *error, struct nmsghdr *msg, cred_t *cr, pid_t pid);
static void udp_wput_other(queue_t *q, mblk_t *mp);
static void udp_wput_iocdata(queue_t *q, mblk_t *mp);
+static void udp_wput_fallback(queue_t *q, mblk_t *mp);
static size_t udp_set_rcv_hiwat(udp_t *udp, size_t size);
static void *udp_stack_init(netstackid_t stackid, netstack_t *ns);
@@ -226,6 +224,25 @@ static void udp_rcv_enqueue(queue_t *q, udp_t *udp, mblk_t *mp,
static void udp_rcv_drain(queue_t *q, udp_t *udp, boolean_t closing);
static void udp_xmit(queue_t *, mblk_t *, ire_t *ire, conn_t *, zoneid_t);
+static int udp_send_connected(conn_t *, mblk_t *, struct nmsghdr *,
+ cred_t *, pid_t);
+
+/* Common routines for the TPI and socket module paths */
+static conn_t *udp_do_open(cred_t *, boolean_t, int);
+static void udp_do_close(conn_t *);
+static int udp_do_bind(conn_t *, struct sockaddr *, socklen_t, cred_t *,
+ boolean_t);
+static int udp_do_unbind(conn_t *);
+static int udp_do_getsockname(udp_t *, struct sockaddr *, uint_t *);
+static int udp_do_getpeername(udp_t *, struct sockaddr *, uint_t *);
+
+int udp_getsockname(sock_lower_handle_t,
+ struct sockaddr *, socklen_t *, cred_t *);
+int udp_getpeername(sock_lower_handle_t,
+ struct sockaddr *, socklen_t *, cred_t *);
+static int udp_do_connect(conn_t *, const struct sockaddr *, socklen_t);
+static int udp_post_ip_bind_connect(udp_t *, mblk_t *, int);
+
#define UDP_RECV_HIWATER (56 * 1024)
#define UDP_RECV_LOWATER 128
#define UDP_XMIT_HIWATER (56 * 1024)
@@ -240,12 +257,12 @@ static struct module_info udp_mod_info = {
* We have separate open functions for the /dev/udp and /dev/udp6 devices.
*/
static struct qinit udp_rinitv4 = {
- NULL, NULL, udp_openv4, udp_close, NULL,
+ NULL, NULL, udp_openv4, udp_tpi_close, NULL,
&udp_mod_info, NULL, udp_rrw, udp_rinfop, STRUIOT_STANDARD
};
static struct qinit udp_rinitv6 = {
- NULL, NULL, udp_openv6, udp_close, NULL,
+ NULL, NULL, udp_openv6, udp_tpi_close, NULL,
&udp_mod_info, NULL, udp_rrw, udp_rinfop, STRUIOT_STANDARD
};
@@ -254,17 +271,22 @@ static struct qinit udp_winit = {
&udp_mod_info, NULL, NULL, NULL, STRUIOT_NONE
};
+/* UDP entry point during fallback */
+struct qinit udp_fallback_sock_winit = {
+ (pfi_t)udp_wput_fallback, NULL, NULL, NULL, NULL, &udp_mod_info
+};
+
/*
* UDP needs to handle I_LINK and I_PLINK since ifconfig
* likes to use it as a place to hang the various streams.
*/
static struct qinit udp_lrinit = {
- (pfi_t)udp_lrput, NULL, udp_openv4, udp_close, NULL,
+ (pfi_t)udp_lrput, NULL, udp_openv4, udp_tpi_close, NULL,
&udp_mod_info
};
static struct qinit udp_lwinit = {
- (pfi_t)udp_lwput, NULL, udp_openv4, udp_close, NULL,
+ (pfi_t)udp_lwput, NULL, udp_openv4, udp_tpi_close, NULL,
&udp_mod_info
};
@@ -559,30 +581,19 @@ udp_bind_hash_insert(udp_fanout_t *uf, udp_t *udp)
* duplicating the us->us_next_port_to_try.
*/
static void
-udp_bind(queue_t *q, mblk_t *mp)
+udp_tpi_bind(queue_t *q, mblk_t *mp)
{
sin_t *sin;
sin6_t *sin6;
mblk_t *mp1;
- in_port_t port; /* Host byte order */
- in_port_t requested_port; /* Host byte order */
struct T_bind_req *tbr;
- int count;
- in6_addr_t v6src;
- boolean_t bind_to_req_port_only;
- int loopmax;
- udp_fanout_t *udpf;
- in_port_t lport; /* Network byte order */
- zoneid_t zoneid;
conn_t *connp;
udp_t *udp;
- boolean_t is_inaddr_any;
- mlp_type_t addrtype, mlptype;
- udp_stack_t *us;
+ int error;
+ struct sockaddr *sa;
connp = Q_TO_CONN(q);
udp = connp->conn_udp;
- us = udp->udp_us;
if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
"udp_bind: bad req, len %u",
@@ -607,6 +618,10 @@ udp_bind(queue_t *q, mblk_t *mp)
}
mp = mp1;
+
+ /* Reset the message type in preparation for shipping it back. */
+ DB_TYPE(mp) = M_PCPROTO;
+
tbr = (struct T_bind_req *)mp->b_rptr;
switch (tbr->ADDR_length) {
case 0: /* Request for a generic port */
@@ -617,6 +632,7 @@ udp_bind(queue_t *q, mblk_t *mp)
*sin = sin_null;
sin->sin_family = AF_INET;
mp->b_wptr = (uchar_t *)&sin[1];
+ sa = (struct sockaddr *)sin;
} else {
ASSERT(udp->udp_family == AF_INET6);
tbr->ADDR_length = sizeof (sin6_t);
@@ -624,38 +640,36 @@ udp_bind(queue_t *q, mblk_t *mp)
*sin6 = sin6_null;
sin6->sin6_family = AF_INET6;
mp->b_wptr = (uchar_t *)&sin6[1];
+ sa = (struct sockaddr *)sin6;
}
- port = 0;
break;
case sizeof (sin_t): /* Complete IPv4 address */
- sin = (sin_t *)mi_offset_param(mp, tbr->ADDR_offset,
+ sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
sizeof (sin_t));
- if (sin == NULL || !OK_32PTR((char *)sin)) {
+ if (sa == NULL || !OK_32PTR((char *)sa)) {
udp_err_ack(q, mp, TSYSERR, EINVAL);
return;
}
if (udp->udp_family != AF_INET ||
- sin->sin_family != AF_INET) {
+ sa->sa_family != AF_INET) {
udp_err_ack(q, mp, TSYSERR, EAFNOSUPPORT);
return;
}
- port = ntohs(sin->sin_port);
break;
case sizeof (sin6_t): /* complete IPv6 address */
- sin6 = (sin6_t *)mi_offset_param(mp, tbr->ADDR_offset,
+ sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
sizeof (sin6_t));
- if (sin6 == NULL || !OK_32PTR((char *)sin6)) {
+ if (sa == NULL || !OK_32PTR((char *)sa)) {
udp_err_ack(q, mp, TSYSERR, EINVAL);
return;
}
if (udp->udp_family != AF_INET6 ||
- sin6->sin6_family != AF_INET6) {
+ sa->sa_family != AF_INET6) {
udp_err_ack(q, mp, TSYSERR, EAFNOSUPPORT);
return;
}
- port = ntohs(sin6->sin6_port);
break;
default: /* Invalid request */
@@ -665,503 +679,21 @@ udp_bind(queue_t *q, mblk_t *mp)
return;
}
- requested_port = port;
-
- if (requested_port == 0 || tbr->PRIM_type == O_T_BIND_REQ)
- bind_to_req_port_only = B_FALSE;
- else /* T_BIND_REQ and requested_port != 0 */
- bind_to_req_port_only = B_TRUE;
-
- if (requested_port == 0) {
- /*
- * If the application passed in zero for the port number, it
- * doesn't care which port number we bind to. Get one in the
- * valid range.
- */
- if (udp->udp_anon_priv_bind) {
- port = udp_get_next_priv_port(udp);
- } else {
- port = udp_update_next_port(udp,
- us->us_next_port_to_try, B_TRUE);
- }
- } else {
- /*
- * If the port is in the well-known privileged range,
- * make sure the caller was privileged.
- */
- int i;
- boolean_t priv = B_FALSE;
-
- if (port < us->us_smallest_nonpriv_port) {
- priv = B_TRUE;
- } else {
- for (i = 0; i < us->us_num_epriv_ports; i++) {
- if (port == us->us_epriv_ports[i]) {
- priv = B_TRUE;
- break;
- }
- }
- }
-
- if (priv) {
- cred_t *cr = DB_CREDDEF(mp, connp->conn_cred);
-
- if (secpolicy_net_privaddr(cr, port,
- IPPROTO_UDP) != 0) {
- udp_err_ack(q, mp, TACCES, 0);
- return;
- }
- }
- }
-
- if (port == 0) {
- udp_err_ack(q, mp, TNOADDR, 0);
- return;
- }
-
- /*
- * The state must be TS_UNBND. TPI mandates that users must send
- * TPI primitives only 1 at a time and wait for the response before
- * sending the next primitive.
- */
- rw_enter(&udp->udp_rwlock, RW_WRITER);
- if (udp->udp_state != TS_UNBND || udp->udp_pending_op != -1) {
- rw_exit(&udp->udp_rwlock);
- (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
- "udp_bind: bad state, %u", udp->udp_state);
- udp_err_ack(q, mp, TOUTSTATE, 0);
- return;
- }
- udp->udp_pending_op = tbr->PRIM_type;
- /*
- * Copy the source address into our udp structure. This address
- * may still be zero; if so, IP will fill in the correct address
- * each time an outbound packet is passed to it. Since the udp is
- * not yet in the bind hash list, we don't grab the uf_lock to
- * change udp_ipversion
- */
- if (udp->udp_family == AF_INET) {
- ASSERT(sin != NULL);
- ASSERT(udp->udp_ipversion == IPV4_VERSION);
- udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + UDPH_SIZE +
- udp->udp_ip_snd_options_len;
- IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6src);
- } else {
- ASSERT(sin6 != NULL);
- v6src = sin6->sin6_addr;
- if (IN6_IS_ADDR_V4MAPPED(&v6src)) {
- /*
- * no need to hold the uf_lock to set the udp_ipversion
- * since we are not yet in the fanout list
- */
- udp->udp_ipversion = IPV4_VERSION;
- udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH +
- UDPH_SIZE + udp->udp_ip_snd_options_len;
- } else {
- udp->udp_ipversion = IPV6_VERSION;
- udp->udp_max_hdr_len = udp->udp_sticky_hdrs_len;
- }
- }
-
- /*
- * If udp_reuseaddr is not set, then we have to make sure that
- * the IP address and port number the application requested
- * (or we selected for the application) is not being used by
- * another stream. If another stream is already using the
- * requested IP address and port, the behavior depends on
- * "bind_to_req_port_only". If set the bind fails; otherwise we
- * search for any an unused port to bind to the the stream.
- *
- * As per the BSD semantics, as modified by the Deering multicast
- * changes, if udp_reuseaddr is set, then we allow multiple binds
- * to the same port independent of the local IP address.
- *
- * This is slightly different than in SunOS 4.X which did not
- * support IP multicast. Note that the change implemented by the
- * Deering multicast code effects all binds - not only binding
- * to IP multicast addresses.
- *
- * Note that when binding to port zero we ignore SO_REUSEADDR in
- * order to guarantee a unique port.
- */
- count = 0;
- if (udp->udp_anon_priv_bind) {
- /*
- * loopmax = (IPPORT_RESERVED-1) -
- * us->us_min_anonpriv_port + 1
- */
- loopmax = IPPORT_RESERVED - us->us_min_anonpriv_port;
- } else {
- loopmax = us->us_largest_anon_port -
- us->us_smallest_anon_port + 1;
- }
-
- is_inaddr_any = V6_OR_V4_INADDR_ANY(v6src);
- zoneid = connp->conn_zoneid;
-
- for (;;) {
- udp_t *udp1;
- boolean_t found_exclbind = B_FALSE;
-
- /*
- * Walk through the list of udp streams bound to
- * requested port with the same IP address.
- */
- lport = htons(port);
- udpf = &us->us_bind_fanout[UDP_BIND_HASH(lport,
- us->us_bind_fanout_size)];
- mutex_enter(&udpf->uf_lock);
- for (udp1 = udpf->uf_udp; udp1 != NULL;
- udp1 = udp1->udp_bind_hash) {
- if (lport != udp1->udp_port)
- continue;
-
- /*
- * On a labeled system, we must treat bindings to ports
- * on shared IP addresses by sockets with MAC exemption
- * privilege as being in all zones, as there's
- * otherwise no way to identify the right receiver.
- */
- if (!(IPCL_ZONE_MATCH(udp1->udp_connp, zoneid) ||
- IPCL_ZONE_MATCH(connp,
- udp1->udp_connp->conn_zoneid)) &&
- !connp->conn_mac_exempt && \
- !udp1->udp_connp->conn_mac_exempt)
- continue;
+ cred_t *cr = DB_CREDDEF(mp, connp->conn_cred);
+ error = udp_do_bind(connp, sa, tbr->ADDR_length, cr,
+ tbr->PRIM_type != O_T_BIND_REQ);
- /*
- * If UDP_EXCLBIND is set for either the bound or
- * binding endpoint, the semantics of bind
- * is changed according to the following chart.
- *
- * spec = specified address (v4 or v6)
- * unspec = unspecified address (v4 or v6)
- * A = specified addresses are different for endpoints
- *
- * bound bind to allowed?
- * -------------------------------------
- * unspec unspec no
- * unspec spec no
- * spec unspec no
- * spec spec yes if A
- *
- * For labeled systems, SO_MAC_EXEMPT behaves the same
- * as UDP_EXCLBIND, except that zoneid is ignored.
- */
- if (udp1->udp_exclbind || udp->udp_exclbind ||
- udp1->udp_connp->conn_mac_exempt ||
- connp->conn_mac_exempt) {
- if (V6_OR_V4_INADDR_ANY(
- udp1->udp_bound_v6src) ||
- is_inaddr_any ||
- IN6_ARE_ADDR_EQUAL(&udp1->udp_bound_v6src,
- &v6src)) {
- found_exclbind = B_TRUE;
- break;
- }
- continue;
- }
-
- /*
- * Check ipversion to allow IPv4 and IPv6 sockets to
- * have disjoint port number spaces.
- */
- if (udp->udp_ipversion != udp1->udp_ipversion) {
-
- /*
- * On the first time through the loop, if the
- * the user intentionally specified a
- * particular port number, then ignore any
- * bindings of the other protocol that may
- * conflict. This allows the user to bind IPv6
- * alone and get both v4 and v6, or bind both
- * both and get each seperately. On subsequent
- * times through the loop, we're checking a
- * port that we chose (not the user) and thus
- * we do not allow casual duplicate bindings.
- */
- if (count == 0 && requested_port != 0)
- continue;
- }
-
- /*
- * No difference depending on SO_REUSEADDR.
- *
- * If existing port is bound to a
- * non-wildcard IP address and
- * the requesting stream is bound to
- * a distinct different IP addresses
- * (non-wildcard, also), keep going.
- */
- if (!is_inaddr_any &&
- !V6_OR_V4_INADDR_ANY(udp1->udp_bound_v6src) &&
- !IN6_ARE_ADDR_EQUAL(&udp1->udp_bound_v6src,
- &v6src)) {
- continue;
- }
- break;
- }
-
- if (!found_exclbind &&
- (udp->udp_reuseaddr && requested_port != 0)) {
- break;
- }
-
- if (udp1 == NULL) {
- /*
- * No other stream has this IP address
- * and port number. We can use it.
- */
- break;
- }
- mutex_exit(&udpf->uf_lock);
- if (bind_to_req_port_only) {
- /*
- * We get here only when requested port
- * is bound (and only first of the for()
- * loop iteration).
- *
- * The semantics of this bind request
- * require it to fail so we return from
- * the routine (and exit the loop).
- *
- */
- udp->udp_pending_op = -1;
- rw_exit(&udp->udp_rwlock);
- udp_err_ack(q, mp, TADDRBUSY, 0);
- return;
- }
-
- if (udp->udp_anon_priv_bind) {
- port = udp_get_next_priv_port(udp);
- } else {
- if ((count == 0) && (requested_port != 0)) {
- /*
- * If the application wants us to find
- * a port, get one to start with. Set
- * requested_port to 0, so that we will
- * update us->us_next_port_to_try below.
- */
- port = udp_update_next_port(udp,
- us->us_next_port_to_try, B_TRUE);
- requested_port = 0;
- } else {
- port = udp_update_next_port(udp, port + 1,
- B_FALSE);
- }
- }
-
- if (port == 0 || ++count >= loopmax) {
- /*
- * We've tried every possible port number and
- * there are none available, so send an error
- * to the user.
- */
- udp->udp_pending_op = -1;
- rw_exit(&udp->udp_rwlock);
- udp_err_ack(q, mp, TNOADDR, 0);
- return;
- }
- }
-
- /*
- * Copy the source address into our udp structure. This address
- * may still be zero; if so, ip will fill in the correct address
- * each time an outbound packet is passed to it.
- * If we are binding to a broadcast or multicast address then
- * udp_bind_ack will clear the source address when it receives
- * the T_BIND_ACK.
- */
- udp->udp_v6src = udp->udp_bound_v6src = v6src;
- udp->udp_port = lport;
- /*
- * Now reset the the next anonymous port if the application requested
- * an anonymous port, or we handed out the next anonymous port.
- */
- if ((requested_port == 0) && (!udp->udp_anon_priv_bind)) {
- us->us_next_port_to_try = port + 1;
- }
-
- /* Initialize the O_T_BIND_REQ/T_BIND_REQ for ip. */
- if (udp->udp_family == AF_INET) {
- sin->sin_port = udp->udp_port;
- } else {
- int error;
-
- sin6->sin6_port = udp->udp_port;
- /* Rebuild the header template */
- error = udp_build_hdrs(udp);
- if (error != 0) {
- udp->udp_pending_op = -1;
- rw_exit(&udp->udp_rwlock);
- mutex_exit(&udpf->uf_lock);
+ if (error != 0) {
+ if (error > 0) {
udp_err_ack(q, mp, TSYSERR, error);
- return;
- }
- }
- udp->udp_state = TS_IDLE;
- udp_bind_hash_insert(udpf, udp);
- mutex_exit(&udpf->uf_lock);
- rw_exit(&udp->udp_rwlock);
-
- if (cl_inet_bind) {
- /*
- * Running in cluster mode - register bind information
- */
- if (udp->udp_ipversion == IPV4_VERSION) {
- (*cl_inet_bind)(IPPROTO_UDP, AF_INET,
- (uint8_t *)(&V4_PART_OF_V6(udp->udp_v6src)),
- (in_port_t)udp->udp_port);
} else {
- (*cl_inet_bind)(IPPROTO_UDP, AF_INET6,
- (uint8_t *)&(udp->udp_v6src),
- (in_port_t)udp->udp_port);
+ udp_err_ack(q, mp, -error, 0);
}
-
- }
-
- connp->conn_anon_port = (is_system_labeled() && requested_port == 0);
- if (is_system_labeled() && (!connp->conn_anon_port ||
- connp->conn_anon_mlp)) {
- uint16_t mlpport;
- cred_t *cr = connp->conn_cred;
- zone_t *zone;
-
- zone = crgetzone(cr);
- connp->conn_mlp_type = udp->udp_recvucred ? mlptBoth :
- mlptSingle;
- addrtype = tsol_mlp_addr_type(zone->zone_id, IPV6_VERSION,
- &v6src, us->us_netstack->netstack_ip);
- if (addrtype == mlptSingle) {
- rw_enter(&udp->udp_rwlock, RW_WRITER);
- udp->udp_pending_op = -1;
- rw_exit(&udp->udp_rwlock);
- udp_err_ack(q, mp, TNOADDR, 0);
- connp->conn_anon_port = B_FALSE;
- connp->conn_mlp_type = mlptSingle;
- return;
- }
- mlpport = connp->conn_anon_port ? PMAPPORT : port;
- mlptype = tsol_mlp_port_type(zone, IPPROTO_UDP, mlpport,
- addrtype);
- if (mlptype != mlptSingle &&
- (connp->conn_mlp_type == mlptSingle ||
- secpolicy_net_bindmlp(cr) != 0)) {
- if (udp->udp_debug) {
- (void) strlog(UDP_MOD_ID, 0, 1,
- SL_ERROR|SL_TRACE,
- "udp_bind: no priv for multilevel port %d",
- mlpport);
- }
- rw_enter(&udp->udp_rwlock, RW_WRITER);
- udp->udp_pending_op = -1;
- rw_exit(&udp->udp_rwlock);
- udp_err_ack(q, mp, TACCES, 0);
- connp->conn_anon_port = B_FALSE;
- connp->conn_mlp_type = mlptSingle;
- return;
- }
-
- /*
- * If we're specifically binding a shared IP address and the
- * port is MLP on shared addresses, then check to see if this
- * zone actually owns the MLP. Reject if not.
- */
- if (mlptype == mlptShared && addrtype == mlptShared) {
- /*
- * No need to handle exclusive-stack zones since
- * ALL_ZONES only applies to the shared stack.
- */
- zoneid_t mlpzone;
-
- mlpzone = tsol_mlp_findzone(IPPROTO_UDP,
- htons(mlpport));
- if (connp->conn_zoneid != mlpzone) {
- if (udp->udp_debug) {
- (void) strlog(UDP_MOD_ID, 0, 1,
- SL_ERROR|SL_TRACE,
- "udp_bind: attempt to bind port "
- "%d on shared addr in zone %d "
- "(should be %d)",
- mlpport, connp->conn_zoneid,
- mlpzone);
- }
- rw_enter(&udp->udp_rwlock, RW_WRITER);
- udp->udp_pending_op = -1;
- rw_exit(&udp->udp_rwlock);
- udp_err_ack(q, mp, TACCES, 0);
- connp->conn_anon_port = B_FALSE;
- connp->conn_mlp_type = mlptSingle;
- return;
- }
- }
- if (connp->conn_anon_port) {
- int error;
-
- error = tsol_mlp_anon(zone, mlptype, connp->conn_ulp,
- port, B_TRUE);
- if (error != 0) {
- if (udp->udp_debug) {
- (void) strlog(UDP_MOD_ID, 0, 1,
- SL_ERROR|SL_TRACE,
- "udp_bind: cannot establish anon "
- "MLP for port %d", port);
- }
- rw_enter(&udp->udp_rwlock, RW_WRITER);
- udp->udp_pending_op = -1;
- rw_exit(&udp->udp_rwlock);
- udp_err_ack(q, mp, TACCES, 0);
- connp->conn_anon_port = B_FALSE;
- connp->conn_mlp_type = mlptSingle;
- return;
- }
- }
- connp->conn_mlp_type = mlptype;
- }
-
- /* Pass the protocol number in the message following the address. */
- *mp->b_wptr++ = IPPROTO_UDP;
- if (!V6_OR_V4_INADDR_ANY(udp->udp_v6src)) {
- /*
- * Append a request for an IRE if udp_v6src not
- * zero (IPv4 - INADDR_ANY, or IPv6 - all-zeroes address).
- */
- mp->b_cont = allocb(sizeof (ire_t), BPRI_HI);
- if (!mp->b_cont) {
- rw_enter(&udp->udp_rwlock, RW_WRITER);
- udp->udp_pending_op = -1;
- rw_exit(&udp->udp_rwlock);
- udp_err_ack(q, mp, TSYSERR, ENOMEM);
- return;
- }
- mp->b_cont->b_wptr += sizeof (ire_t);
- mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE;
+ } else {
+ tbr->PRIM_type = T_BIND_ACK;
+ qreply(q, mp);
}
- if (udp->udp_family == AF_INET6)
- mp = ip_bind_v6(q, mp, connp, NULL);
- else
- mp = ip_bind_v4(q, mp, connp);
-
- /* The above return NULL if the bind needs to be deferred */
- if (mp != NULL)
- udp_bind_result(connp, mp);
- else
- CONN_INC_REF(connp);
-}
-
-/*
- * This is called from ip_wput_nondata to handle the results of a
- * deferred UDP bind. It is called once the bind has been completed.
- */
-void
-udp_resume_bind(conn_t *connp, mblk_t *mp)
-{
- ASSERT(connp != NULL && IPCL_IS_UDP(connp));
-
- udp_bind_result(connp, mp);
-
- CONN_OPER_PENDING_DONE(connp);
}
/*
@@ -1174,32 +706,25 @@ udp_resume_bind(conn_t *connp, mblk_t *mp)
* T_OK_ACK - for the T_CONN_REQ
* T_CONN_CON - to keep the TPI user happy
*
- * The connect completes in udp_bind_result.
+ * The connect completes in udp_do_connect.
* When a T_BIND_ACK is received information is extracted from the IRE
* and the two appended messages are sent to the TPI user.
* Should udp_bind_result receive T_ERROR_ACK for the T_BIND_REQ it will
* convert it to an error ack for the appropriate primitive.
*/
static void
-udp_connect(queue_t *q, mblk_t *mp)
+udp_tpi_connect(queue_t *q, mblk_t *mp)
{
- sin6_t *sin6;
- sin_t *sin;
+ mblk_t *mp1;
+ udp_t *udp;
+ conn_t *connp = Q_TO_CONN(q);
+ int error;
+ socklen_t len;
+ struct sockaddr *sa;
struct T_conn_req *tcr;
- in6_addr_t v6dst;
- ipaddr_t v4dst;
- uint16_t dstport;
- uint32_t flowinfo;
- mblk_t *mp1, *mp2;
- udp_fanout_t *udpf;
- udp_t *udp, *udp1;
- ushort_t ipversion;
- udp_stack_t *us;
- conn_t *connp = Q_TO_CONN(q);
udp = connp->conn_udp;
tcr = (struct T_conn_req *)mp->b_rptr;
- us = udp->udp_us;
/* A bit of sanity checking */
if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_req)) {
@@ -1218,285 +743,87 @@ udp_connect(queue_t *q, mblk_t *mp)
* Make sure that address family matches the type of
* family of the address passed down
*/
+ len = tcr->DEST_length;
switch (tcr->DEST_length) {
default:
udp_err_ack(q, mp, TBADADDR, 0);
return;
case sizeof (sin_t):
- sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset,
+ sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
sizeof (sin_t));
- if (sin == NULL || !OK_32PTR((char *)sin)) {
- udp_err_ack(q, mp, TSYSERR, EINVAL);
- return;
- }
- if (udp->udp_family != AF_INET ||
- sin->sin_family != AF_INET) {
- udp_err_ack(q, mp, TSYSERR, EAFNOSUPPORT);
- return;
- }
- v4dst = sin->sin_addr.s_addr;
- dstport = sin->sin_port;
- IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
- ASSERT(udp->udp_ipversion == IPV4_VERSION);
- ipversion = IPV4_VERSION;
break;
case sizeof (sin6_t):
- sin6 = (sin6_t *)mi_offset_param(mp, tcr->DEST_offset,
+ sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
sizeof (sin6_t));
- if (sin6 == NULL || !OK_32PTR((char *)sin6)) {
- udp_err_ack(q, mp, TSYSERR, EINVAL);
- return;
- }
- if (udp->udp_family != AF_INET6 ||
- sin6->sin6_family != AF_INET6) {
- udp_err_ack(q, mp, TSYSERR, EAFNOSUPPORT);
- return;
- }
- v6dst = sin6->sin6_addr;
- dstport = sin6->sin6_port;
- if (IN6_IS_ADDR_V4MAPPED(&v6dst)) {
- IN6_V4MAPPED_TO_IPADDR(&v6dst, v4dst);
- ipversion = IPV4_VERSION;
- flowinfo = 0;
- } else {
- ipversion = IPV6_VERSION;
- flowinfo = sin6->sin6_flowinfo;
- }
break;
}
- if (dstport == 0) {
- udp_err_ack(q, mp, TBADADDR, 0);
- return;
- }
-
- rw_enter(&udp->udp_rwlock, RW_WRITER);
- /*
- * This UDP must have bound to a port already before doing a connect.
- * TPI mandates that users must send TPI primitives only 1 at a time
- * and wait for the response before sending the next primitive.
- */
- if (udp->udp_state == TS_UNBND || udp->udp_pending_op != -1) {
- rw_exit(&udp->udp_rwlock);
- (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
- "udp_connect: bad state, %u", udp->udp_state);
- udp_err_ack(q, mp, TOUTSTATE, 0);
+ error = proto_verify_ip_addr(udp->udp_family, sa, len);
+ if (error != 0) {
+ udp_err_ack(q, mp, TSYSERR, error);
return;
}
- udp->udp_pending_op = T_CONN_REQ;
- ASSERT(udp->udp_port != 0 && udp->udp_ptpbhn != NULL);
-
- if (ipversion == IPV4_VERSION) {
- udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + UDPH_SIZE +
- udp->udp_ip_snd_options_len;
- } else {
- udp->udp_max_hdr_len = udp->udp_sticky_hdrs_len;
- }
-
- udpf = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port,
- us->us_bind_fanout_size)];
-
- mutex_enter(&udpf->uf_lock);
- if (udp->udp_state == TS_DATA_XFER) {
- /* Already connected - clear out state */
- udp->udp_v6src = udp->udp_bound_v6src;
- udp->udp_state = TS_IDLE;
- }
/*
- * Create a default IP header with no IP options.
+ * We have to send a connection confirmation to
+ * keep TLI happy.
*/
- udp->udp_dstport = dstport;
- udp->udp_ipversion = ipversion;
- if (ipversion == IPV4_VERSION) {
- /*
- * Interpret a zero destination to mean loopback.
- * Update the T_CONN_REQ (sin/sin6) since it is used to
- * generate the T_CONN_CON.
- */
- if (v4dst == INADDR_ANY) {
- v4dst = htonl(INADDR_LOOPBACK);
- IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
- if (udp->udp_family == AF_INET) {
- sin->sin_addr.s_addr = v4dst;
- } else {
- sin6->sin6_addr = v6dst;
- }
- }
- udp->udp_v6dst = v6dst;
- udp->udp_flowinfo = 0;
-
- /*
- * If the destination address is multicast and
- * an outgoing multicast interface has been set,
- * use the address of that interface as our
- * source address if no source address has been set.
- */
- if (V4_PART_OF_V6(udp->udp_v6src) == INADDR_ANY &&
- CLASSD(v4dst) &&
- udp->udp_multicast_if_addr != INADDR_ANY) {
- IN6_IPADDR_TO_V4MAPPED(udp->udp_multicast_if_addr,
- &udp->udp_v6src);
- }
+ if (udp->udp_family == AF_INET) {
+ mp1 = mi_tpi_conn_con(NULL, (char *)sa,
+ sizeof (sin_t), NULL, 0);
} else {
- ASSERT(udp->udp_ipversion == IPV6_VERSION);
- /*
- * Interpret a zero destination to mean loopback.
- * Update the T_CONN_REQ (sin/sin6) since it is used to
- * generate the T_CONN_CON.
- */
- if (IN6_IS_ADDR_UNSPECIFIED(&v6dst)) {
- v6dst = ipv6_loopback;
- sin6->sin6_addr = v6dst;
- }
- udp->udp_v6dst = v6dst;
- udp->udp_flowinfo = flowinfo;
- /*
- * If the destination address is multicast and
- * an outgoing multicast interface has been set,
- * then the ip bind logic will pick the correct source
- * address (i.e. matching the outgoing multicast interface).
- */
+ mp1 = mi_tpi_conn_con(NULL, (char *)sa,
+ sizeof (sin6_t), NULL, 0);
}
-
- /*
- * Verify that the src/port/dst/port is unique for all
- * connections in TS_DATA_XFER
- */
- for (udp1 = udpf->uf_udp; udp1 != NULL; udp1 = udp1->udp_bind_hash) {
- if (udp1->udp_state != TS_DATA_XFER)
- continue;
- if (udp->udp_port != udp1->udp_port ||
- udp->udp_ipversion != udp1->udp_ipversion ||
- dstport != udp1->udp_dstport ||
- !IN6_ARE_ADDR_EQUAL(&udp->udp_v6src, &udp1->udp_v6src) ||
- !IN6_ARE_ADDR_EQUAL(&v6dst, &udp1->udp_v6dst) ||
- !(IPCL_ZONE_MATCH(udp->udp_connp,
- udp1->udp_connp->conn_zoneid) ||
- IPCL_ZONE_MATCH(udp1->udp_connp,
- udp->udp_connp->conn_zoneid)))
- continue;
- mutex_exit(&udpf->uf_lock);
- udp->udp_pending_op = -1;
- rw_exit(&udp->udp_rwlock);
- udp_err_ack(q, mp, TBADADDR, 0);
- return;
- }
- udp->udp_state = TS_DATA_XFER;
- mutex_exit(&udpf->uf_lock);
-
- /*
- * Send down bind to IP to verify that there is a route
- * and to determine the source address.
- * This will come back as T_BIND_ACK with an IRE_DB_TYPE in rput.
- */
- if (udp->udp_family == AF_INET)
- mp1 = udp_ip_bind_mp(udp, O_T_BIND_REQ, sizeof (ipa_conn_t));
- else
- mp1 = udp_ip_bind_mp(udp, O_T_BIND_REQ, sizeof (ipa6_conn_t));
if (mp1 == NULL) {
-bind_failed:
- mutex_enter(&udpf->uf_lock);
- udp->udp_state = TS_IDLE;
- udp->udp_pending_op = -1;
- mutex_exit(&udpf->uf_lock);
- rw_exit(&udp->udp_rwlock);
udp_err_ack(q, mp, TSYSERR, ENOMEM);
return;
}
- rw_exit(&udp->udp_rwlock);
/*
- * We also have to send a connection confirmation to
- * keep TLI happy. Prepare it for udp_bind_result.
+ * ok_ack for T_CONN_REQ
*/
- if (udp->udp_family == AF_INET)
- mp2 = mi_tpi_conn_con(NULL, (char *)sin,
- sizeof (*sin), NULL, 0);
- else
- mp2 = mi_tpi_conn_con(NULL, (char *)sin6,
- sizeof (*sin6), NULL, 0);
- if (mp2 == NULL) {
- freemsg(mp1);
- rw_enter(&udp->udp_rwlock, RW_WRITER);
- goto bind_failed;
- }
-
mp = mi_tpi_ok_ack_alloc(mp);
if (mp == NULL) {
/* Unable to reuse the T_CONN_REQ for the ack. */
- freemsg(mp2);
- rw_enter(&udp->udp_rwlock, RW_WRITER);
- mutex_enter(&udpf->uf_lock);
- udp->udp_state = TS_IDLE;
- udp->udp_pending_op = -1;
- mutex_exit(&udpf->uf_lock);
- rw_exit(&udp->udp_rwlock);
+ freemsg(mp1);
udp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM);
return;
}
- /* Hang onto the T_OK_ACK and T_CONN_CON for later. */
- linkb(mp1, mp);
- linkb(mp1, mp2);
-
- mblk_setcred(mp1, connp->conn_cred);
- if (udp->udp_family == AF_INET)
- mp1 = ip_bind_v4(q, mp1, connp);
- else
- mp1 = ip_bind_v6(q, mp1, connp, NULL);
-
- /* The above return NULL if the bind needs to be deferred */
- if (mp1 != NULL)
- udp_bind_result(connp, mp1);
- else
- CONN_INC_REF(connp);
+ error = udp_do_connect(connp, sa, len);
+ if (error != 0) {
+ freeb(mp1);
+ if (error < 0)
+ udp_err_ack(q, mp, -error, 0);
+ else
+ udp_err_ack(q, mp, TSYSERR, error);
+ } else {
+ putnext(connp->conn_rq, mp);
+ putnext(connp->conn_rq, mp1);
+ }
}
static int
-udp_close(queue_t *q)
+udp_tpi_close(queue_t *q, int flags)
{
- conn_t *connp = (conn_t *)q->q_ptr;
- udp_t *udp;
-
- ASSERT(connp != NULL && IPCL_IS_UDP(connp));
- udp = connp->conn_udp;
-
- udp_quiesce_conn(connp);
- ip_quiesce_conn(connp);
- /*
- * Disable read-side synchronous stream
- * interface and drain any queued data.
- */
- udp_rcv_drain(q, udp, B_TRUE);
- ASSERT(!udp->udp_direct_sockfs);
-
- qprocsoff(q);
-
- ASSERT(udp->udp_rcv_cnt == 0);
- ASSERT(udp->udp_rcv_msgcnt == 0);
- ASSERT(udp->udp_rcv_list_head == NULL);
- ASSERT(udp->udp_rcv_list_tail == NULL);
-
- udp_close_free(connp);
+ conn_t *connp;
- /*
- * Now we are truly single threaded on this stream, and can
- * delete the things hanging off the connp, and finally the connp.
- * We removed this connp from the fanout list, it cannot be
- * accessed thru the fanouts, and we already waited for the
- * conn_ref to drop to 0. We are already in close, so
- * there cannot be any other thread from the top. qprocsoff
- * has completed, and service has completed or won't run in
- * future.
- */
- ASSERT(connp->conn_ref == 1);
- inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
- connp->conn_ref--;
- ipcl_conn_destroy(connp);
+ if (flags & SO_FALLBACK) {
+ /*
+ * The stream is being closed while in fallback;
+ * simply free the resources that were allocated.
+ */
+ inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr));
+ qprocsoff(q);
+ goto done;
+ }
+ connp = Q_TO_CONN(q);
+ udp_do_close(connp);
+done:
q->q_ptr = WR(q)->q_ptr = NULL;
return (0);
}
@@ -1567,39 +894,21 @@ udp_close_free(conn_t *connp)
udp->udp_connp = connp;
}
-/*
- * This routine handles each T_DISCON_REQ message passed to udp
- * as an indicating that UDP is no longer connected. This results
- * in sending a T_BIND_REQ to IP to restore the binding to just
- * the local address/port.
- *
- * This routine sends down a T_BIND_REQ to IP with the following mblks:
- * T_BIND_REQ - specifying just the local address/port
- * T_OK_ACK - for the T_DISCON_REQ
- *
- * The disconnect completes in udp_bind_result.
- * When a T_BIND_ACK is received the appended T_OK_ACK is sent to the TPI user.
- * Should udp_bind_result receive T_ERROR_ACK for the T_BIND_REQ it will
- * convert it to an error ack for the appropriate primitive.
- */
-static void
-udp_disconnect(queue_t *q, mblk_t *mp)
+static int
+udp_do_disconnect(conn_t *connp)
{
udp_t *udp;
- mblk_t *mp1;
+ mblk_t *ire_mp;
udp_fanout_t *udpf;
udp_stack_t *us;
- conn_t *connp = Q_TO_CONN(q);
+ int error;
udp = connp->conn_udp;
us = udp->udp_us;
rw_enter(&udp->udp_rwlock, RW_WRITER);
if (udp->udp_state != TS_DATA_XFER || udp->udp_pending_op != -1) {
rw_exit(&udp->udp_rwlock);
- (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
- "udp_disconnect: bad state, %u", udp->udp_state);
- udp_err_ack(q, mp, TOUTSTATE, 0);
- return;
+ return (-TOUTSTATE);
}
udp->udp_pending_op = T_DISCON_REQ;
udpf = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port,
@@ -1609,57 +918,85 @@ udp_disconnect(queue_t *q, mblk_t *mp)
udp->udp_state = TS_IDLE;
mutex_exit(&udpf->uf_lock);
- /*
- * Send down bind to IP to remove the full binding and revert
- * to the local address binding.
- */
- if (udp->udp_family == AF_INET)
- mp1 = udp_ip_bind_mp(udp, O_T_BIND_REQ, sizeof (sin_t));
- else
- mp1 = udp_ip_bind_mp(udp, O_T_BIND_REQ, sizeof (sin6_t));
- if (mp1 == NULL) {
+ if (udp->udp_family == AF_INET6) {
+ /* Rebuild the header template */
+ error = udp_build_hdrs(udp);
+ if (error != 0) {
+ udp->udp_pending_op = -1;
+ rw_exit(&udp->udp_rwlock);
+ return (error);
+ }
+ }
+
+ ire_mp = allocb(sizeof (ire_t), BPRI_HI);
+ if (ire_mp == NULL) {
+ mutex_enter(&udpf->uf_lock);
udp->udp_pending_op = -1;
+ mutex_exit(&udpf->uf_lock);
rw_exit(&udp->udp_rwlock);
- udp_err_ack(q, mp, TSYSERR, ENOMEM);
- return;
+ return (ENOMEM);
}
- mp = mi_tpi_ok_ack_alloc(mp);
+
+ rw_exit(&udp->udp_rwlock);
+
+ if (udp->udp_family == AF_INET6) {
+ error = ip_proto_bind_laddr_v6(connp, &ire_mp, IPPROTO_UDP,
+ &udp->udp_bound_v6src, udp->udp_port, B_TRUE);
+ } else {
+ error = ip_proto_bind_laddr_v4(connp, &ire_mp, IPPROTO_UDP,
+ V4_PART_OF_V6(udp->udp_bound_v6src), udp->udp_port, B_TRUE);
+ }
+
+ return (udp_post_ip_bind_connect(udp, ire_mp, error));
+}
+
+
+static void
+udp_tpi_disconnect(queue_t *q, mblk_t *mp)
+{
+ conn_t *connp = Q_TO_CONN(q);
+ int error;
+
+ /*
+ * Allocate the largest primitive we need to send back;
+ * T_error_ack is larger than T_ok_ack.
+ */
+ mp = reallocb(mp, sizeof (struct T_error_ack), 1);
if (mp == NULL) {
/* Unable to reuse the T_DISCON_REQ for the ack. */
- udp->udp_pending_op = -1;
- rw_exit(&udp->udp_rwlock);
- udp_err_ack_prim(q, mp1, T_DISCON_REQ, TSYSERR, ENOMEM);
+ udp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, ENOMEM);
return;
}
- if (udp->udp_family == AF_INET6) {
- int error;
+ error = udp_do_disconnect(connp);
- /* Rebuild the header template */
- error = udp_build_hdrs(udp);
- if (error != 0) {
- udp->udp_pending_op = -1;
- rw_exit(&udp->udp_rwlock);
- udp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, error);
- freemsg(mp1);
- return;
+ if (error != 0) {
+ if (error < 0) {
+ udp_err_ack(q, mp, -error, 0);
+ } else {
+ udp_err_ack(q, mp, TSYSERR, error);
}
+ } else {
+ mp = mi_tpi_ok_ack_alloc(mp);
+ ASSERT(mp != NULL);
+ qreply(q, mp);
}
+}
- rw_exit(&udp->udp_rwlock);
- /* Append the T_OK_ACK to the T_BIND_REQ for udp_bind_ack */
- linkb(mp1, mp);
+int
+udp_disconnect(conn_t *connp)
+{
+ int error;
+ udp_t *udp = connp->conn_udp;
- if (udp->udp_family == AF_INET6)
- mp1 = ip_bind_v6(q, mp1, connp, NULL);
- else
- mp1 = ip_bind_v4(q, mp1, connp);
+ udp->udp_dgram_errind = B_FALSE;
- /* The above return NULL if the bind needs to be deferred */
- if (mp1 != NULL)
- udp_bind_result(connp, mp1);
- else
- CONN_INC_REF(connp);
+ error = udp_do_disconnect(connp);
+
+ if (error < 0)
+ error = proto_tlitosyserr(-error);
+
+ return (error);
}
/* This routine creates a T_ERROR_ACK message and passes it upstream. */
@@ -1783,8 +1120,8 @@ udp_extra_priv_ports_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
* Assumes that IP has pulled up everything up to and including the ICMP header.
*/
static void
-udp_icmp_error(queue_t *q, mblk_t *mp)
-{
+udp_icmp_error(conn_t *connp, mblk_t *mp)
+ {
icmph_t *icmph;
ipha_t *ipha;
int iph_hdr_length;
@@ -1793,15 +1130,16 @@ udp_icmp_error(queue_t *q, mblk_t *mp)
sin6_t sin6;
mblk_t *mp1;
int error = 0;
- udp_t *udp = Q_TO_UDP(q);
+ udp_t *udp = connp->conn_udp;
+ mp1 = NULL;
ipha = (ipha_t *)mp->b_rptr;
ASSERT(OK_32PTR(mp->b_rptr));
if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) {
ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
- udp_icmp_error_ipv6(q, mp);
+ udp_icmp_error_ipv6(connp, mp);
return;
}
ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
@@ -1850,27 +1188,66 @@ udp_icmp_error(queue_t *q, mblk_t *mp)
return;
}
+
switch (udp->udp_family) {
case AF_INET:
sin = sin_null;
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = ipha->ipha_dst;
sin.sin_port = udpha->uha_dst_port;
- mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL, 0,
- error);
+ if (IPCL_IS_NONSTR(connp)) {
+ rw_enter(&udp->udp_rwlock, RW_WRITER);
+ if (udp->udp_state == TS_DATA_XFER) {
+ if (sin.sin_port == udp->udp_dstport &&
+ sin.sin_addr.s_addr ==
+ V4_PART_OF_V6(udp->udp_v6dst)) {
+
+ rw_exit(&udp->udp_rwlock);
+ (*connp->conn_upcalls->su_set_error)
+ (connp->conn_upper_handle, error);
+ goto done;
+ }
+ } else {
+ udp->udp_delayed_error = error;
+ *((sin_t *)&udp->udp_delayed_addr) = sin;
+ }
+ rw_exit(&udp->udp_rwlock);
+ } else {
+ mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t),
+ NULL, 0, error);
+ }
break;
case AF_INET6:
sin6 = sin6_null;
sin6.sin6_family = AF_INET6;
IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &sin6.sin6_addr);
sin6.sin6_port = udpha->uha_dst_port;
+ if (IPCL_IS_NONSTR(connp)) {
+ rw_enter(&udp->udp_rwlock, RW_WRITER);
+ if (udp->udp_state == TS_DATA_XFER) {
+ if (sin6.sin6_port == udp->udp_dstport &&
+ IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr,
+ &udp->udp_v6dst)) {
+ rw_exit(&udp->udp_rwlock);
+ (*connp->conn_upcalls->su_set_error)
+ (connp->conn_upper_handle, error);
+ goto done;
+ }
+ } else {
+ udp->udp_delayed_error = error;
+ *((sin6_t *)&udp->udp_delayed_addr) = sin6;
+ }
+ rw_exit(&udp->udp_rwlock);
+ } else {
- mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t),
- NULL, 0, error);
+ mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t),
+ NULL, 0, error);
+ }
break;
}
- if (mp1)
- putnext(q, mp1);
+ if (mp1 != NULL)
+ putnext(connp->conn_rq, mp1);
+done:
freemsg(mp);
}
@@ -1881,7 +1258,7 @@ udp_icmp_error(queue_t *q, mblk_t *mp)
* ICMPv6 header.
*/
static void
-udp_icmp_error_ipv6(queue_t *q, mblk_t *mp)
+udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp)
{
icmp6_t *icmp6;
ip6_t *ip6h, *outer_ip6h;
@@ -1891,7 +1268,7 @@ udp_icmp_error_ipv6(queue_t *q, mblk_t *mp)
sin6_t sin6;
mblk_t *mp1;
int error = 0;
- udp_t *udp = Q_TO_UDP(q);
+ udp_t *udp = connp->conn_udp;
udp_stack_t *us = udp->udp_us;
outer_ip6h = (ip6_t *)mp->b_rptr;
@@ -1982,7 +1359,13 @@ udp_icmp_error_ipv6(queue_t *q, mblk_t *mp)
* message. Free it, then send our empty message.
*/
freemsg(mp);
- putnext(q, newmp);
+ if (!IPCL_IS_NONSTR(connp)) {
+ putnext(connp->conn_rq, newmp);
+ } else {
+ (*connp->conn_upcalls->su_recv)
+ (connp->conn_upper_handle, newmp, 0, 0, &error,
+ NULL);
+ }
return;
}
case ICMP6_TIME_EXCEEDED:
@@ -2018,10 +1401,30 @@ udp_icmp_error_ipv6(queue_t *q, mblk_t *mp)
sin6.sin6_port = udpha->uha_dst_port;
sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
- mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t), NULL, 0,
- error);
- if (mp1)
- putnext(q, mp1);
+ if (IPCL_IS_NONSTR(connp)) {
+ rw_enter(&udp->udp_rwlock, RW_WRITER);
+ if (udp->udp_state == TS_DATA_XFER) {
+ if (sin6.sin6_port == udp->udp_dstport &&
+ IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr,
+ &udp->udp_v6dst)) {
+ rw_exit(&udp->udp_rwlock);
+ (*connp->conn_upcalls->su_set_error)
+ (connp->conn_upper_handle, error);
+ goto done;
+ }
+ } else {
+ udp->udp_delayed_error = error;
+ *((sin6_t *)&udp->udp_delayed_addr) = sin6;
+ }
+ rw_exit(&udp->udp_rwlock);
+ } else {
+ mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t),
+ NULL, 0, error);
+ if (mp1 != NULL)
+ putnext(connp->conn_rq, mp1);
+ }
+
+done:
freemsg(mp);
}
@@ -2166,6 +1569,18 @@ udp_copy_info(struct T_info_ack *tap, udp_t *udp)
tap->OPT_size = udp_max_optsize;
}
+static void
+udp_do_capability_ack(udp_t *udp, struct T_capability_ack *tcap,
+ t_uscalar_t cap_bits1)
+{
+ tcap->CAP_bits1 = 0;
+
+ if (cap_bits1 & TC1_INFO) {
+ udp_copy_info(&tcap->INFO_ack, udp);
+ tcap->CAP_bits1 |= TC1_INFO;
+ }
+}
+
/*
* This routine responds to T_CAPABILITY_REQ messages. It is called by
* udp_wput. Much of the T_CAPABILITY_ACK information is copied from
@@ -2187,12 +1602,7 @@ udp_capability_req(queue_t *q, mblk_t *mp)
return;
tcap = (struct T_capability_ack *)mp->b_rptr;
- tcap->CAP_bits1 = 0;
-
- if (cap_bits1 & TC1_INFO) {
- udp_copy_info(&tcap->INFO_ack, udp);
- tcap->CAP_bits1 |= TC1_INFO;
- }
+ udp_do_capability_ack(udp, tcap, cap_bits1);
qreply(q, mp);
}
@@ -2378,12 +1788,10 @@ static int
udp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
boolean_t isv6)
{
- int err;
+ int error;
udp_t *udp;
conn_t *connp;
dev_t conn_dev;
- zoneid_t zoneid;
- netstack_t *ns;
udp_stack_t *us;
vmem_t *minor_arena;
@@ -2396,20 +1804,6 @@ udp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
if (sflag == MODOPEN)
return (EINVAL);
- ns = netstack_find_by_cred(credp);
- ASSERT(ns != NULL);
- us = ns->netstack_udp;
- ASSERT(us != NULL);
-
- /*
- * For exclusive stacks we set the zoneid to zero
- * to make UDP operate as if in the global zone.
- */
- if (ns->netstack_stackid != GLOBAL_NETSTACKID)
- zoneid = GLOBAL_ZONEID;
- else
- zoneid = crgetzoneid(credp);
-
if ((ip_minor_arena_la != NULL) && (flag & SO_SOCKSTR) &&
((conn_dev = inet_minor_alloc(ip_minor_arena_la)) != 0)) {
minor_arena = ip_minor_arena_la;
@@ -2419,25 +1813,34 @@ udp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
* or a non socket application is doing the open.
* Try to allocate from the small arena.
*/
- if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) {
- netstack_rele(ns);
+ if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0)
return (EBUSY);
- }
+
minor_arena = ip_minor_arena_sa;
}
- *devp = makedevice(getemajor(*devp), (minor_t)conn_dev);
+ if (flag & SO_FALLBACK) {
+ /*
+ * A non-STREAMS socket needs a stream to fall back to.
+ */
+ RD(q)->q_ptr = (void *)conn_dev;
+ WR(q)->q_qinfo = &udp_fallback_sock_winit;
+ WR(q)->q_ptr = (void *)minor_arena;
+ qprocson(q);
+ return (0);
+ }
- connp = ipcl_conn_create(IPCL_UDPCONN, KM_SLEEP, ns);
- connp->conn_dev = conn_dev;
- connp->conn_minor_arena = minor_arena;
+ connp = udp_do_open(credp, isv6, KM_SLEEP);
+ if (connp == NULL) {
+ inet_minor_free(minor_arena, conn_dev);
+ return (ENOMEM);
+ }
udp = connp->conn_udp;
+ us = udp->udp_us;
- /*
- * ipcl_conn_create did a netstack_hold. Undo the hold that was
- * done by netstack_find_by_cred()
- */
- netstack_rele(ns);
+ *devp = makedevice(getemajor(*devp), (minor_t)conn_dev);
+ connp->conn_dev = conn_dev;
+ connp->conn_minor_arena = minor_arena;
/*
* Initialize the udp_t structure for this stream.
@@ -2452,79 +1855,39 @@ udp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
ASSERT(connp->conn_udp == udp);
ASSERT(udp->udp_connp == connp);
- /* Set the initial state of the stream and the privilege status. */
- udp->udp_state = TS_UNBND;
- if (isv6) {
- udp->udp_family = AF_INET6;
- udp->udp_ipversion = IPV6_VERSION;
- udp->udp_max_hdr_len = IPV6_HDR_LEN + UDPH_SIZE;
- udp->udp_ttl = us->us_ipv6_hoplimit;
- connp->conn_af_isv6 = B_TRUE;
- connp->conn_flags |= IPCL_ISV6;
- } else {
- udp->udp_family = AF_INET;
- udp->udp_ipversion = IPV4_VERSION;
- udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + UDPH_SIZE;
- udp->udp_ttl = us->us_ipv4_ttl;
- connp->conn_af_isv6 = B_FALSE;
- connp->conn_flags &= ~IPCL_ISV6;
- }
-
- udp->udp_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
- udp->udp_pending_op = -1;
- connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
- connp->conn_zoneid = zoneid;
-
- udp->udp_open_time = lbolt64;
- udp->udp_open_pid = curproc->p_pid;
-
- /*
- * If the caller has the process-wide flag set, then default to MAC
- * exempt mode. This allows read-down to unlabeled hosts.
- */
- if (getpflags(NET_MAC_AWARE, credp) != 0)
- connp->conn_mac_exempt = B_TRUE;
-
if (flag & SO_SOCKSTR) {
connp->conn_flags |= IPCL_SOCKET;
udp->udp_issocket = B_TRUE;
udp->udp_direct_sockfs = B_TRUE;
}
- connp->conn_ulp_labeled = is_system_labeled();
-
- udp->udp_us = us;
-
q->q_hiwat = us->us_recv_hiwat;
WR(q)->q_hiwat = us->us_xmit_hiwat;
WR(q)->q_lowat = us->us_xmit_lowat;
- connp->conn_recv = udp_input;
- crhold(credp);
- connp->conn_cred = credp;
-
- mutex_enter(&connp->conn_lock);
- connp->conn_state_flags &= ~CONN_INCIPIENT;
- mutex_exit(&connp->conn_lock);
-
qprocson(q);
if (udp->udp_family == AF_INET6) {
/* Build initial header template for transmit */
- if ((err = udp_build_hdrs(udp)) != 0) {
+ if ((error = udp_build_hdrs(udp)) != 0) {
rw_exit(&udp->udp_rwlock);
qprocsoff(q);
+ inet_minor_free(minor_arena, conn_dev);
ipcl_conn_destroy(connp);
- return (err);
+ return (error);
}
}
rw_exit(&udp->udp_rwlock);
/* Set the Stream head write offset and high watermark. */
- (void) mi_set_sth_wroff(q,
+ (void) proto_set_tx_wroff(q, connp,
udp->udp_max_hdr_len + us->us_wroff_extra);
- (void) mi_set_sth_hiwat(q, udp_set_rcv_hiwat(udp, q->q_hiwat));
+ /* XXX udp_set_rcv_hiwat() doesn't hold the lock, is it a bug??? */
+ (void) proto_set_rx_hiwat(q, connp, udp_set_rcv_hiwat(udp, q->q_hiwat));
+ mutex_enter(&connp->conn_lock);
+ connp->conn_state_flags &= ~CONN_INCIPIENT;
+ mutex_exit(&connp->conn_lock);
return (0);
}
@@ -2582,21 +1945,16 @@ udp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
* This routine retrieves the current status of socket options.
* It returns the size of the option retrieved.
*/
-int
-udp_opt_get_locked(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
+static int
+udp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
{
- int *i1 = (int *)ptr;
- conn_t *connp;
- udp_t *udp;
- ip6_pkt_t *ipp;
- int len;
- udp_stack_t *us;
-
- connp = Q_TO_CONN(q);
- udp = connp->conn_udp;
- ipp = &udp->udp_sticky_ipp;
- us = udp->udp_us;
+ udp_t *udp = connp->conn_udp;
+ udp_stack_t *us = udp->udp_us;
+ int *i1 = (int *)ptr;
+ ip6_pkt_t *ipp = &udp->udp_sticky_ipp;
+ int len;
+ ASSERT(RW_READ_HELD(&udp->udp_rwlock));
switch (level) {
case SOL_SOCKET:
switch (name) {
@@ -2625,10 +1983,10 @@ udp_opt_get_locked(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
break; /* goto sizeof (int) option return */
case SO_SNDBUF:
- *i1 = q->q_hiwat;
+ *i1 = udp->udp_xmit_hiwat;
break; /* goto sizeof (int) option return */
case SO_RCVBUF:
- *i1 = RD(q)->q_hiwat;
+ *i1 = udp->udp_rcv_disply_hiwat;
break; /* goto sizeof (int) option return */
case SO_DGRAM_ERRIND:
*i1 = udp->udp_dgram_errind;
@@ -2907,15 +2265,15 @@ udp_opt_get_locked(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
}
int
-udp_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
+udp_tpi_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
{
- udp_t *udp;
+ udp_t *udp;
int err;
udp = Q_TO_UDP(q);
rw_enter(&udp->udp_rwlock, RW_READER);
- err = udp_opt_get_locked(q, level, name, ptr);
+ err = udp_opt_get(Q_TO_CONN(q), level, name, ptr);
rw_exit(&udp->udp_rwlock);
return (err);
}
@@ -2924,83 +2282,34 @@ udp_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
* This routine sets socket options.
*/
/* ARGSUSED */
-int
-udp_opt_set_locked(queue_t *q, uint_t optset_context, int level,
- int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
- uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk)
+static int
+udp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen,
+ uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, cred_t *cr,
+ void *thisdg_attrs, boolean_t checkonly)
{
udpattrs_t *attrs = thisdg_attrs;
int *i1 = (int *)invalp;
boolean_t onoff = (*i1 == 0) ? 0 : 1;
- boolean_t checkonly;
+ udp_t *udp = connp->conn_udp;
+ udp_stack_t *us = udp->udp_us;
int error;
- conn_t *connp;
- udp_t *udp;
uint_t newlen;
- udp_stack_t *us;
size_t sth_wroff;
- connp = Q_TO_CONN(q);
- udp = connp->conn_udp;
- us = udp->udp_us;
-
- switch (optset_context) {
- case SETFN_OPTCOM_CHECKONLY:
- checkonly = B_TRUE;
- /*
- * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
- * inlen != 0 implies value supplied and
- * we have to "pretend" to set it.
- * inlen == 0 implies that there is no
- * value part in T_CHECK request and just validation
- * done elsewhere should be enough, we just return here.
- */
- if (inlen == 0) {
- *outlenp = 0;
- return (0);
- }
- break;
- case SETFN_OPTCOM_NEGOTIATE:
- checkonly = B_FALSE;
- break;
- case SETFN_UD_NEGOTIATE:
- case SETFN_CONN_NEGOTIATE:
- checkonly = B_FALSE;
- /*
- * Negotiating local and "association-related" options
- * through T_UNITDATA_REQ.
- *
- * Following routine can filter out ones we do not
- * want to be "set" this way.
- */
- if (!udp_opt_allow_udr_set(level, name)) {
- *outlenp = 0;
- return (EINVAL);
- }
- break;
- default:
- /*
- * We should never get here
- */
- *outlenp = 0;
- return (EINVAL);
- }
-
- ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
- (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
-
+ ASSERT(RW_WRITE_HELD(&udp->udp_rwlock));
/*
* For fixed length options, no sanity check
* of passed in length is done. It is assumed *_optcom_req()
* routines do the right thing.
*/
-
switch (level) {
case SOL_SOCKET:
switch (name) {
case SO_REUSEADDR:
- if (!checkonly)
+ if (!checkonly) {
udp->udp_reuseaddr = onoff;
+ PASS_OPT_TO_IP(connp);
+ }
break;
case SO_DEBUG:
if (!checkonly)
@@ -3011,16 +2320,22 @@ udp_opt_set_locked(queue_t *q, uint_t optset_context, int level,
* but are only meaningful to IP.
*/
case SO_DONTROUTE:
- if (!checkonly)
+ if (!checkonly) {
udp->udp_dontroute = onoff;
+ PASS_OPT_TO_IP(connp);
+ }
break;
case SO_USELOOPBACK:
- if (!checkonly)
+ if (!checkonly) {
udp->udp_useloopback = onoff;
+ PASS_OPT_TO_IP(connp);
+ }
break;
case SO_BROADCAST:
- if (!checkonly)
+ if (!checkonly) {
udp->udp_broadcast = onoff;
+ PASS_OPT_TO_IP(connp);
+ }
break;
case SO_SNDBUF:
@@ -3029,7 +2344,8 @@ udp_opt_set_locked(queue_t *q, uint_t optset_context, int level,
return (ENOBUFS);
}
if (!checkonly) {
- q->q_hiwat = *i1;
+ udp->udp_xmit_hiwat = *i1;
+ connp->conn_wq->q_hiwat = *i1;
}
break;
case SO_RCVBUF:
@@ -3038,10 +2354,13 @@ udp_opt_set_locked(queue_t *q, uint_t optset_context, int level,
return (ENOBUFS);
}
if (!checkonly) {
- RD(q)->q_hiwat = *i1;
+ int size;
+
+ udp->udp_rcv_disply_hiwat = *i1;
+ size = udp_set_rcv_hiwat(udp, *i1);
rw_exit(&udp->udp_rwlock);
- (void) mi_set_sth_hiwat(RD(q),
- udp_set_rcv_hiwat(udp, *i1));
+ (void) proto_set_rx_hiwat(connp->conn_rq, connp,
+ size);
rw_enter(&udp->udp_rwlock, RW_WRITER);
}
break;
@@ -3065,11 +2384,20 @@ udp_opt_set_locked(queue_t *q, uint_t optset_context, int level,
udp->udp_timestamp = onoff;
break;
case SO_ANON_MLP:
- /* Pass option along to IP level for handling */
- return (-EINVAL);
+ if (!checkonly) {
+ connp->conn_anon_mlp = onoff;
+ PASS_OPT_TO_IP(connp);
+ }
+ break;
case SO_MAC_EXEMPT:
- /* Pass option along to IP level for handling */
- return (-EINVAL);
+ if (secpolicy_net_mac_aware(cr) != 0 ||
+ udp->udp_state != TS_UNBND)
+ return (EACCES);
+ if (!checkonly) {
+ connp->conn_mac_exempt = onoff;
+ PASS_OPT_TO_IP(connp);
+ }
+ break;
case SCM_UCRED: {
struct ucred_s *ucr;
cred_t *cr, *newcr;
@@ -3149,7 +2477,8 @@ udp_opt_set_locked(queue_t *q, uint_t optset_context, int level,
UDPH_SIZE + udp->udp_ip_snd_options_len;
sth_wroff = udp->udp_max_hdr_len + us->us_wroff_extra;
rw_exit(&udp->udp_rwlock);
- (void) mi_set_sth_wroff(RD(q), sth_wroff);
+ (void) proto_set_tx_wroff(connp->conn_rq, connp,
+ sth_wroff);
rw_enter(&udp->udp_rwlock, RW_WRITER);
break;
@@ -3173,6 +2502,7 @@ udp_opt_set_locked(queue_t *q, uint_t optset_context, int level,
if (!checkonly) {
udp->udp_multicast_if_addr =
inap->s_addr;
+ PASS_OPT_TO_IP(connp);
}
break;
}
@@ -3181,8 +2511,10 @@ udp_opt_set_locked(queue_t *q, uint_t optset_context, int level,
udp->udp_multicast_ttl = *invalp;
break;
case IP_MULTICAST_LOOP:
- if (!checkonly)
+ if (!checkonly) {
connp->conn_multicast_loop = *invalp;
+ PASS_OPT_TO_IP(connp);
+ }
break;
case IP_RECVOPTS:
if (!checkonly)
@@ -3193,12 +2525,16 @@ udp_opt_set_locked(queue_t *q, uint_t optset_context, int level,
udp->udp_recvdstaddr = onoff;
break;
case IP_RECVIF:
- if (!checkonly)
+ if (!checkonly) {
udp->udp_recvif = onoff;
+ PASS_OPT_TO_IP(connp);
+ }
break;
case IP_RECVSLLA:
- if (!checkonly)
+ if (!checkonly) {
udp->udp_recvslla = onoff;
+ PASS_OPT_TO_IP(connp);
+ }
break;
case IP_RECVTTL:
if (!checkonly)
@@ -3278,12 +2614,16 @@ udp_opt_set_locked(queue_t *q, uint_t optset_context, int level,
*/
return (-EINVAL);
case IP_BOUND_IF:
- if (!checkonly)
+ if (!checkonly) {
udp->udp_bound_if = *i1;
+ PASS_OPT_TO_IP(connp);
+ }
break;
case IP_UNSPEC_SRC:
- if (!checkonly)
+ if (!checkonly) {
udp->udp_unspec_source = onoff;
+ PASS_OPT_TO_IP(connp);
+ }
break;
case IP_BROADCAST_TTL:
if (!checkonly)
@@ -3315,8 +2655,10 @@ udp_opt_set_locked(queue_t *q, uint_t optset_context, int level,
switch (name) {
case IPV6_MULTICAST_IF:
- if (!checkonly)
+ if (!checkonly) {
udp->udp_multicast_if_index = *i1;
+ PASS_OPT_TO_IP(connp);
+ }
break;
case IPV6_UNICAST_HOPS:
/* -1 means use default */
@@ -3371,8 +2713,10 @@ udp_opt_set_locked(queue_t *q, uint_t optset_context, int level,
*outlenp = 0;
return (EINVAL);
}
- if (!checkonly)
+ if (!checkonly) {
connp->conn_multicast_loop = *i1;
+ PASS_OPT_TO_IP(connp);
+ }
break;
case IPV6_JOIN_GROUP:
case IPV6_LEAVE_GROUP:
@@ -3389,53 +2733,71 @@ udp_opt_set_locked(queue_t *q, uint_t optset_context, int level,
*/
return (-EINVAL);
case IPV6_BOUND_IF:
- if (!checkonly)
+ if (!checkonly) {
udp->udp_bound_if = *i1;
+ PASS_OPT_TO_IP(connp);
+ }
break;
case IPV6_UNSPEC_SRC:
- if (!checkonly)
+ if (!checkonly) {
udp->udp_unspec_source = onoff;
+ PASS_OPT_TO_IP(connp);
+ }
break;
/*
* Set boolean switches for ancillary data delivery
*/
case IPV6_RECVPKTINFO:
- if (!checkonly)
+ if (!checkonly) {
udp->udp_ip_recvpktinfo = onoff;
+ PASS_OPT_TO_IP(connp);
+ }
break;
case IPV6_RECVTCLASS:
if (!checkonly) {
udp->udp_ipv6_recvtclass = onoff;
+ PASS_OPT_TO_IP(connp);
}
break;
case IPV6_RECVPATHMTU:
if (!checkonly) {
udp->udp_ipv6_recvpathmtu = onoff;
+ PASS_OPT_TO_IP(connp);
}
break;
case IPV6_RECVHOPLIMIT:
- if (!checkonly)
+ if (!checkonly) {
udp->udp_ipv6_recvhoplimit = onoff;
+ PASS_OPT_TO_IP(connp);
+ }
break;
case IPV6_RECVHOPOPTS:
- if (!checkonly)
+ if (!checkonly) {
udp->udp_ipv6_recvhopopts = onoff;
+ PASS_OPT_TO_IP(connp);
+ }
break;
case IPV6_RECVDSTOPTS:
- if (!checkonly)
+ if (!checkonly) {
udp->udp_ipv6_recvdstopts = onoff;
+ PASS_OPT_TO_IP(connp);
+ }
break;
case _OLD_IPV6_RECVDSTOPTS:
if (!checkonly)
udp->udp_old_ipv6_recvdstopts = onoff;
break;
case IPV6_RECVRTHDRDSTOPTS:
- if (!checkonly)
+ if (!checkonly) {
udp->udp_ipv6_recvrthdrdstopts = onoff;
+ PASS_OPT_TO_IP(connp);
+ }
break;
case IPV6_RECVRTHDR:
- if (!checkonly)
+ if (!checkonly) {
udp->udp_ipv6_recvrthdr = onoff;
+ PASS_OPT_TO_IP(connp);
+ }
break;
/*
* Set sticky options or ancillary data.
@@ -3477,6 +2839,7 @@ udp_opt_set_locked(queue_t *q, uint_t optset_context, int level,
error = udp_build_hdrs(udp);
if (error != 0)
return (error);
+ PASS_OPT_TO_IP(connp);
}
break;
case IPV6_HOPLIMIT:
@@ -3541,8 +2904,9 @@ udp_opt_set_locked(queue_t *q, uint_t optset_context, int level,
} else {
sin6_t *sin6 = (sin6_t *)invalp;
- if (sin6->sin6_family != AF_INET6)
+ if (sin6->sin6_family != AF_INET6) {
return (EAFNOSUPPORT);
+ }
if (IN6_IS_ADDR_V4MAPPED(
&sin6->sin6_addr))
return (EADDRNOTAVAIL);
@@ -3557,6 +2921,7 @@ udp_opt_set_locked(queue_t *q, uint_t optset_context, int level,
error = udp_build_hdrs(udp);
if (error != 0)
return (error);
+ PASS_OPT_TO_IP(connp);
}
break;
case IPV6_HOPOPTS: {
@@ -3785,6 +3150,8 @@ udp_opt_set_locked(queue_t *q, uint_t optset_context, int level,
}
if (!checkonly) {
+ int size;
+
udp->udp_nat_t_endpoint = onoff;
udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH +
@@ -3795,8 +3162,10 @@ udp_opt_set_locked(queue_t *q, uint_t optset_context, int level,
udp->udp_max_hdr_len +=
sizeof (uint32_t);
}
- (void) mi_set_sth_wroff(RD(q),
- udp->udp_max_hdr_len + us->us_wroff_extra);
+ size = udp->udp_max_hdr_len +
+ us->us_wroff_extra;
+ (void) proto_set_tx_wroff(connp->conn_rq, connp,
+ size);
}
break;
default:
@@ -3820,20 +3189,82 @@ udp_opt_set_locked(queue_t *q, uint_t optset_context, int level,
}
 int
-udp_opt_set(queue_t *q, uint_t optset_context, int level,
- int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
- uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk)
+udp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
+ uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
+ void *thisdg_attrs, cred_t *cr)
 {
- udp_t *udp;
- int err;
+ int error;
+ boolean_t checkonly;
- udp = Q_TO_UDP(q);
+ /*
+ * conn_t-based option-set entry point. Maps optset_context onto the
+ * checkonly flag, filters the request where the context requires it,
+ * and then hands the actual work to udp_do_opt_set().
+ *
+ * Returns 0 on success or an errno value. Every early exit below
+ * that reports an error (and the empty T_CHECK case) sets *outlenp
+ * to 0 before leaving.
+ *
+ * This function takes no lock itself; the callers visible in this
+ * file hold udp_rwlock as RW_WRITER around the call.
+ */
+ error = 0;
+ switch (optset_context) {
+ case SETFN_OPTCOM_CHECKONLY:
+ checkonly = B_TRUE;
+ /*
+ * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
+ * inlen != 0 implies value supplied and
+ * we have to "pretend" to set it.
+ * inlen == 0 implies that there is no
+ * value part in T_CHECK request and just validation
+ * done elsewhere should be enough, we just return here.
+ */
+ if (inlen == 0) {
+ *outlenp = 0;
+ goto done;
+ }
+ break;
+ case SETFN_OPTCOM_NEGOTIATE:
+ checkonly = B_FALSE;
+ break;
+ case SETFN_UD_NEGOTIATE:
+ case SETFN_CONN_NEGOTIATE:
+ checkonly = B_FALSE;
+ /*
+ * Negotiating local and "association-related" options
+ * through T_UNITDATA_REQ.
+ *
+ * Following routine can filter out ones we do not
+ * want to be "set" this way.
+ */
+ if (!udp_opt_allow_udr_set(level, name)) {
+ *outlenp = 0;
+ error = EINVAL;
+ goto done;
+ }
+ break;
+ default:
+ /*
+ * We should never get here
+ */
+ *outlenp = 0;
+ error = EINVAL;
+ goto done;
+ }
+
+ /*
+ * A check-only request can only reach this point with a value
+ * supplied (inlen != 0); the empty case returned above. The second
+ * clause repeats the context test, so the condition is equivalent to
+ * "not SETFN_OPTCOM_CHECKONLY, or inlen != 0".
+ */
+ ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
+ (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
+
+ error = udp_do_opt_set(connp, level, name, inlen, invalp, outlenp,
+ outvalp, cr, thisdg_attrs, checkonly);
+done:
+ return (error);
+}
+
+/*
+ * Queue-based wrapper around udp_opt_set(): resolves the conn_t and udp_t
+ * from q, takes udp_rwlock as RW_WRITER for the duration of the call, and
+ * returns whatever udp_opt_set() returns. The trailing mblk argument is
+ * accepted but not used here.
+ */
+/* ARGSUSED */
+int
+udp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
+ uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
+ void *thisdg_attrs, cred_t *cr, mblk_t *mblk)
+{
+ conn_t *connp = Q_TO_CONN(q);
+ int error;
+ udp_t *udp = connp->conn_udp;
 rw_enter(&udp->udp_rwlock, RW_WRITER);
- err = udp_opt_set_locked(q, optset_context, level, name, inlen, invalp,
- outlenp, outvalp, thisdg_attrs, cr, mblk);
+ error = udp_opt_set(connp, optset_context, level, name, inlen, invalp,
+ outlenp, outvalp, thisdg_attrs, cr);
 rw_exit(&udp->udp_rwlock);
- return (err);
+ return (error);
 }
/*
@@ -3853,8 +3284,11 @@ udp_build_hdrs(udp_t *udp)
udpha_t *udpha;
ip6_pkt_t *ipp = &udp->udp_sticky_ipp;
size_t sth_wroff;
+ conn_t *connp = udp->udp_connp;
ASSERT(RW_WRITE_HELD(&udp->udp_rwlock));
+ ASSERT(connp != NULL);
+
hdrs_len = ip_total_hdrs_len_v6(ipp) + UDPH_SIZE;
ASSERT(hdrs_len != 0);
if (hdrs_len != udp->udp_sticky_hdrs_len) {
@@ -3892,7 +3326,8 @@ udp_build_hdrs(udp_t *udp)
udp->udp_max_hdr_len = hdrs_len;
sth_wroff = udp->udp_max_hdr_len + us->us_wroff_extra;
rw_exit(&udp->udp_rwlock);
- (void) mi_set_sth_wroff(udp->udp_connp->conn_rq, sth_wroff);
+ (void) proto_set_tx_wroff(udp->udp_connp->conn_rq,
+ udp->udp_connp, sth_wroff);
rw_enter(&udp->udp_rwlock, RW_WRITER);
}
return (0);
@@ -4164,6 +3599,33 @@ udp_save_ip_rcv_opt(udp_t *udp, void *opt, int opt_len)
}
}
+/*
+ * Dispose of a received message that the socket upcall would not take.
+ *
+ * Must be entered with udp_recv_lock held (asserted); the lock is always
+ * released before this function returns, on both paths.
+ *
+ * While the conn is still marked non-STREAMS, mp is appended to the list
+ * kept in udp_fallback_queue_head/udp_fallback_queue_tail, chained through
+ * b_next. Otherwise mp is passed upstream with putnext() on conn_rq after
+ * the lock has been dropped.
+ */
+static void
+udp_queue_fallback(udp_t *udp, mblk_t *mp)
+{
+ ASSERT(MUTEX_HELD(&udp->udp_recv_lock));
+ if (IPCL_IS_NONSTR(udp->udp_connp)) {
+ /*
+ * fallback has started but messages have not been moved yet
+ */
+ if (udp->udp_fallback_queue_head == NULL) {
+ ASSERT(udp->udp_fallback_queue_tail == NULL);
+ udp->udp_fallback_queue_head = mp;
+ udp->udp_fallback_queue_tail = mp;
+ } else {
+ ASSERT(udp->udp_fallback_queue_tail != NULL);
+ udp->udp_fallback_queue_tail->b_next = mp;
+ udp->udp_fallback_queue_tail = mp;
+ }
+ mutex_exit(&udp->udp_recv_lock);
+ } else {
+ /*
+ * no more fallbacks possible, ok to drop lock.
+ */
+ mutex_exit(&udp->udp_recv_lock);
+ putnext(udp->udp_connp->conn_rq, mp);
+ }
+}
+
/* ARGSUSED2 */
static void
udp_input(void *arg1, mblk_t *mp, void *arg2)
@@ -4222,7 +3684,7 @@ udp_input(void *arg1, mblk_t *mp, void *arg2)
/*
* ICMP messages.
*/
- udp_icmp_error(connp->conn_rq, mp);
+ udp_icmp_error(connp, mp);
return;
}
}
@@ -4403,7 +3865,6 @@ udp_input(void *arg1, mblk_t *mp, void *arg2)
UDP_STAT(us, udp_in_recvucred);
}
- /* XXX FIXME: apply to AF_INET6 as well */
/*
* If SO_TIMESTAMP is set allocate the appropriate sized
* buffer. Since gethrestime() expects a pointer aligned
@@ -4873,7 +4334,6 @@ udp_input(void *arg1, mblk_t *mp, void *arg2)
dstopt += ipp.ipp_dstoptslen;
udi_size -= toh->len;
}
-
if (cr != NULL) {
struct T_opthdr *toh;
@@ -4915,23 +4375,37 @@ udp_input(void *arg1, mblk_t *mp, void *arg2)
if (options_mp != NULL)
freeb(options_mp);
- if (udp_bits.udpb_direct_sockfs) {
- /*
- * There is nothing above us except for the stream head;
- * use the read-side synchronous stream interface in
- * order to reduce the time spent in interrupt thread.
- */
- ASSERT(udp->udp_issocket);
- udp_rcv_enqueue(connp->conn_rq, udp, mp, mp_len);
+ if (IPCL_IS_NONSTR(connp)) {
+ int error;
+
+ if ((*connp->conn_upcalls->su_recv)
+ (connp->conn_upper_handle, mp, msgdsize(mp), 0, &error,
+ NULL) < 0) {
+ mutex_enter(&udp->udp_recv_lock);
+ if (error == ENOSPC) {
+ /*
+ * let's confirm while holding the lock
+ */
+ if ((*connp->conn_upcalls->su_recv)
+ (connp->conn_upper_handle, NULL, 0, 0,
+ &error, NULL) < 0) {
+ if (error == ENOSPC) {
+ connp->conn_flow_cntrld =
+ B_TRUE;
+ } else {
+ ASSERT(error == EOPNOTSUPP);
+ }
+ }
+ mutex_exit(&udp->udp_recv_lock);
+ } else {
+ ASSERT(error == EOPNOTSUPP);
+ udp_queue_fallback(udp, mp);
+ }
+ }
} else {
- /*
- * Use regular STREAMS interface to pass data upstream
- * if this is not a socket endpoint, or if we have
- * switched over to the slow mode due to sockmod being
- * popped or a module being pushed on top of us.
- */
putnext(connp->conn_rq, mp);
}
+ ASSERT(MUTEX_NOT_HELD(&udp->udp_recv_lock));
return;
tossit:
@@ -4942,243 +4416,6 @@ tossit:
}
/*
- * Handle the results of a T_BIND_REQ whether deferred by IP or handled
- * immediately.
- */
-static void
-udp_bind_result(conn_t *connp, mblk_t *mp)
-{
- struct T_error_ack *tea;
-
- switch (mp->b_datap->db_type) {
- case M_PROTO:
- case M_PCPROTO:
- /* M_PROTO messages contain some type of TPI message. */
- ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <=
- (uintptr_t)INT_MAX);
- if (mp->b_wptr - mp->b_rptr < sizeof (t_scalar_t)) {
- freemsg(mp);
- return;
- }
- tea = (struct T_error_ack *)mp->b_rptr;
-
- switch (tea->PRIM_type) {
- case T_ERROR_ACK:
- switch (tea->ERROR_prim) {
- case O_T_BIND_REQ:
- case T_BIND_REQ:
- udp_bind_error(connp, mp);
- return;
- default:
- break;
- }
- ASSERT(0);
- freemsg(mp);
- return;
-
- case T_BIND_ACK:
- udp_bind_ack(connp, mp);
- return;
-
- default:
- break;
- }
- freemsg(mp);
- return;
- default:
- /* FIXME: other cases? */
- ASSERT(0);
- freemsg(mp);
- return;
- }
-}
-
-/*
- * Process a T_BIND_ACK
- */
-static void
-udp_bind_ack(conn_t *connp, mblk_t *mp)
-{
- udp_t *udp = connp->conn_udp;
- mblk_t *mp1;
- ire_t *ire;
- struct T_bind_ack *tba;
- uchar_t *addrp;
- ipa_conn_t *ac;
- ipa6_conn_t *ac6;
- udp_fanout_t *udpf;
- udp_stack_t *us = udp->udp_us;
-
- ASSERT(udp->udp_pending_op != -1);
- rw_enter(&udp->udp_rwlock, RW_WRITER);
- /*
- * If a broadcast/multicast address was bound set
- * the source address to 0.
- * This ensures no datagrams with broadcast address
- * as source address are emitted (which would violate
- * RFC1122 - Hosts requirements)
- *
- * Note that when connecting the returned IRE is
- * for the destination address and we only perform
- * the broadcast check for the source address (it
- * is OK to connect to a broadcast/multicast address.)
- */
- mp1 = mp->b_cont;
- if (mp1 != NULL && mp1->b_datap->db_type == IRE_DB_TYPE) {
- ire = (ire_t *)mp1->b_rptr;
-
- /*
- * Note: we get IRE_BROADCAST for IPv6 to "mark" a multicast
- * local address.
- */
- udpf = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port,
- us->us_bind_fanout_size)];
- if (ire->ire_type == IRE_BROADCAST &&
- udp->udp_state != TS_DATA_XFER) {
- ASSERT(udp->udp_pending_op == T_BIND_REQ ||
- udp->udp_pending_op == O_T_BIND_REQ);
- /* This was just a local bind to a broadcast addr */
- mutex_enter(&udpf->uf_lock);
- V6_SET_ZERO(udp->udp_v6src);
- mutex_exit(&udpf->uf_lock);
- if (udp->udp_family == AF_INET6)
- (void) udp_build_hdrs(udp);
- } else if (V6_OR_V4_INADDR_ANY(udp->udp_v6src)) {
- /*
- * Local address not yet set - pick it from the
- * T_bind_ack
- */
- tba = (struct T_bind_ack *)mp->b_rptr;
- addrp = &mp->b_rptr[tba->ADDR_offset];
- switch (udp->udp_family) {
- case AF_INET:
- if (tba->ADDR_length == sizeof (ipa_conn_t)) {
- ac = (ipa_conn_t *)addrp;
- } else {
- ASSERT(tba->ADDR_length ==
- sizeof (ipa_conn_x_t));
- ac = &((ipa_conn_x_t *)addrp)->acx_conn;
- }
- mutex_enter(&udpf->uf_lock);
- IN6_IPADDR_TO_V4MAPPED(ac->ac_laddr,
- &udp->udp_v6src);
- mutex_exit(&udpf->uf_lock);
- break;
- case AF_INET6:
- if (tba->ADDR_length == sizeof (ipa6_conn_t)) {
- ac6 = (ipa6_conn_t *)addrp;
- } else {
- ASSERT(tba->ADDR_length ==
- sizeof (ipa6_conn_x_t));
- ac6 = &((ipa6_conn_x_t *)
- addrp)->ac6x_conn;
- }
- mutex_enter(&udpf->uf_lock);
- udp->udp_v6src = ac6->ac6_laddr;
- mutex_exit(&udpf->uf_lock);
- (void) udp_build_hdrs(udp);
- break;
- }
- }
- mp1 = mp1->b_cont;
- }
- udp->udp_pending_op = -1;
- rw_exit(&udp->udp_rwlock);
- /*
- * Look for one or more appended ACK message added by
- * udp_connect or udp_disconnect.
- * If none found just send up the T_BIND_ACK.
- * udp_connect has appended a T_OK_ACK and a T_CONN_CON.
- * udp_disconnect has appended a T_OK_ACK.
- */
- if (mp1 != NULL) {
- if (mp->b_cont == mp1)
- mp->b_cont = NULL;
- else {
- ASSERT(mp->b_cont->b_cont == mp1);
- mp->b_cont->b_cont = NULL;
- }
- freemsg(mp);
- mp = mp1;
- while (mp != NULL) {
- mp1 = mp->b_cont;
- mp->b_cont = NULL;
- putnext(connp->conn_rq, mp);
- mp = mp1;
- }
- return;
- }
- freemsg(mp->b_cont);
- mp->b_cont = NULL;
- putnext(connp->conn_rq, mp);
-}
-
-static void
-udp_bind_error(conn_t *connp, mblk_t *mp)
-{
- udp_t *udp = connp->conn_udp;
- struct T_error_ack *tea;
- udp_fanout_t *udpf;
- udp_stack_t *us = udp->udp_us;
-
- tea = (struct T_error_ack *)mp->b_rptr;
-
- /*
- * If our O_T_BIND_REQ/T_BIND_REQ fails,
- * clear out the associated port and source
- * address before passing the message
- * upstream. If this was caused by a T_CONN_REQ
- * revert back to bound state.
- */
-
- rw_enter(&udp->udp_rwlock, RW_WRITER);
- ASSERT(udp->udp_pending_op != -1);
- tea->ERROR_prim = udp->udp_pending_op;
- udp->udp_pending_op = -1;
- udpf = &us->us_bind_fanout[
- UDP_BIND_HASH(udp->udp_port,
- us->us_bind_fanout_size)];
- mutex_enter(&udpf->uf_lock);
-
- switch (tea->ERROR_prim) {
- case T_CONN_REQ:
- ASSERT(udp->udp_state == TS_DATA_XFER);
- /* Connect failed */
- /* Revert back to the bound source */
- udp->udp_v6src = udp->udp_bound_v6src;
- udp->udp_state = TS_IDLE;
- mutex_exit(&udpf->uf_lock);
- if (udp->udp_family == AF_INET6)
- (void) udp_build_hdrs(udp);
- rw_exit(&udp->udp_rwlock);
- break;
-
- case T_DISCON_REQ:
- case T_BIND_REQ:
- case O_T_BIND_REQ:
- V6_SET_ZERO(udp->udp_v6src);
- V6_SET_ZERO(udp->udp_bound_v6src);
- udp->udp_state = TS_UNBND;
- udp_bind_hash_remove(udp, B_TRUE);
- udp->udp_port = 0;
- mutex_exit(&udpf->uf_lock);
- if (udp->udp_family == AF_INET6)
- (void) udp_build_hdrs(udp);
- rw_exit(&udp->udp_rwlock);
- break;
-
- default:
- mutex_exit(&udpf->uf_lock);
- rw_exit(&udp->udp_rwlock);
- (void) mi_strlog(connp->conn_rq, 1,
- SL_ERROR|SL_TRACE,
- "udp_input_other: bad ERROR_prim, "
- "len %d", tea->ERROR_prim);
- }
- putnext(connp->conn_rq, mp);
-}
-
-/*
* return SNMP stuff in buffer in mpdata. We don't hold any lock and report
* information that can be changing beneath us.
*/
@@ -5589,64 +4826,23 @@ done:
* is called by udp_wput to handle T_UNBIND_REQ messages.
*/
 static void
-udp_unbind(queue_t *q, mblk_t *mp)
+udp_tpi_unbind(queue_t *q, mblk_t *mp)
 {
- udp_t *udp = Q_TO_UDP(q);
- udp_fanout_t *udpf;
- udp_stack_t *us = udp->udp_us;
-
- if (cl_inet_unbind != NULL) {
- /*
- * Running in cluster mode - register unbind information
- */
- if (udp->udp_ipversion == IPV4_VERSION) {
- (*cl_inet_unbind)(IPPROTO_UDP, AF_INET,
- (uint8_t *)(&V4_PART_OF_V6(udp->udp_v6src)),
- (in_port_t)udp->udp_port);
- } else {
- (*cl_inet_unbind)(IPPROTO_UDP, AF_INET6,
- (uint8_t *)&(udp->udp_v6src),
- (in_port_t)udp->udp_port);
- }
- }
+ conn_t *connp = Q_TO_CONN(q);
+ int error;
- rw_enter(&udp->udp_rwlock, RW_WRITER);
- if (udp->udp_state == TS_UNBND || udp->udp_pending_op != -1) {
- rw_exit(&udp->udp_rwlock);
- udp_err_ack(q, mp, TOUTSTATE, 0);
+ /*
+ * The unbind itself is delegated to udp_do_unbind(); this wrapper
+ * only turns its result into an acknowledgement sent back on q.
+ *
+ * A negative return is negated and passed as udp_err_ack()'s third
+ * argument (presumably a TPI error code -- confirm against
+ * udp_do_unbind()). A positive return is reported as TSYSERR with
+ * the value passed as the fourth argument.
+ */
+ error = udp_do_unbind(connp);
+ if (error) {
+ if (error < 0)
+ udp_err_ack(q, mp, -error, 0);
+ else
+ udp_err_ack(q, mp, TSYSERR, error);
 return;
 }
- udp->udp_pending_op = T_UNBIND_REQ;
- rw_exit(&udp->udp_rwlock);
- /*
- * Pass the unbind to IP; T_UNBIND_REQ is larger than T_OK_ACK
- * and therefore ip_unbind must never return NULL.
- */
- mp = ip_unbind(q, mp);
+ /*
+ * Success: obtain a T_OK_ACK from the request mblk and reply with it.
+ * NOTE(review): only the ASSERTs below guard against a NULL return;
+ * confirm mi_tpi_ok_ack_alloc() cannot fail for a T_UNBIND_REQ mblk.
+ */
+ mp = mi_tpi_ok_ack_alloc(mp);
 ASSERT(mp != NULL);
 ASSERT(((struct T_ok_ack *)mp->b_rptr)->PRIM_type == T_OK_ACK);
-
- /*
- * Once we're unbound from IP, the pending operation may be cleared
- * here.
- */
- rw_enter(&udp->udp_rwlock, RW_WRITER);
- udpf = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port,
- us->us_bind_fanout_size)];
- mutex_enter(&udpf->uf_lock);
- udp_bind_hash_remove(udp, B_TRUE);
- V6_SET_ZERO(udp->udp_v6src);
- V6_SET_ZERO(udp->udp_bound_v6src);
- udp->udp_port = 0;
- mutex_exit(&udpf->uf_lock);
-
- udp->udp_pending_op = -1;
- udp->udp_state = TS_UNBND;
- if (udp->udp_family == AF_INET6)
- (void) udp_build_hdrs(udp);
- rw_exit(&udp->udp_rwlock);
-
 qreply(q, mp);
 }
@@ -5748,27 +4944,29 @@ udp_update_label(queue_t *wq, mblk_t *mp, ipaddr_t dst)
static mblk_t *
udp_output_v4(conn_t *connp, mblk_t *mp, ipaddr_t v4dst, uint16_t port,
- uint_t srcid, int *error, boolean_t insert_spi)
+ uint_t srcid, int *error, boolean_t insert_spi, struct nmsghdr *msg,
+ cred_t *cr, pid_t pid)
{
- udp_t *udp = connp->conn_udp;
- queue_t *q = connp->conn_wq;
- mblk_t *mp1 = mp;
- mblk_t *mp2;
- ipha_t *ipha;
- int ip_hdr_length;
- uint32_t ip_len;
- udpha_t *udpha;
- boolean_t lock_held = B_FALSE;
+ udp_t *udp = connp->conn_udp;
+ mblk_t *mp1 = mp;
+ mblk_t *mp2;
+ ipha_t *ipha;
+ int ip_hdr_length;
+ uint32_t ip_len;
+ udpha_t *udpha;
+ boolean_t lock_held = B_FALSE;
in_port_t uha_src_port;
udpattrs_t attrs;
- uchar_t ip_snd_opt[IP_MAX_OPT_LENGTH];
+ uchar_t ip_snd_opt[IP_MAX_OPT_LENGTH];
uint32_t ip_snd_opt_len = 0;
- ip4_pkt_t pktinfo;
- ip4_pkt_t *pktinfop = &pktinfo;
- ip_opt_info_t optinfo;
+ ip4_pkt_t pktinfo;
+ ip4_pkt_t *pktinfop = &pktinfo;
+ ip_opt_info_t optinfo;
ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
udp_stack_t *us = udp->udp_us;
ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec;
+ queue_t *q = connp->conn_wq;
+ ire_t *ire;
*error = 0;
@@ -5784,26 +4982,55 @@ udp_output_v4(conn_t *connp, mblk_t *mp, ipaddr_t v4dst, uint16_t port,
* If options passed in, feed it for verification and handling
*/
attrs.udpattr_credset = B_FALSE;
- if (DB_TYPE(mp) != M_DATA) {
- mp1 = mp->b_cont;
- if (((struct T_unitdata_req *)mp->b_rptr)->OPT_length != 0) {
+ if (IPCL_IS_NONSTR(connp)) {
+ if (msg->msg_controllen != 0) {
attrs.udpattr_ipp4 = pktinfop;
attrs.udpattr_mb = mp;
- if (udp_unitdata_opt_process(q, mp, error, &attrs) < 0)
+
+ rw_enter(&udp->udp_rwlock, RW_WRITER);
+ *error = process_auxiliary_options(connp,
+ msg->msg_control, msg->msg_controllen,
+ &attrs, &udp_opt_obj, udp_opt_set);
+ rw_exit(&udp->udp_rwlock);
+ if (*error)
goto done;
- /*
- * Note: success in processing options.
- * mp option buffer represented by
- * OPT_length/offset now potentially modified
- * and contain option setting results
- */
- ASSERT(*error == 0);
+ }
+ } else {
+ if (DB_TYPE(mp) != M_DATA) {
+ mp1 = mp->b_cont;
+ if (((struct T_unitdata_req *)
+ mp->b_rptr)->OPT_length != 0) {
+ attrs.udpattr_ipp4 = pktinfop;
+ attrs.udpattr_mb = mp;
+ if (udp_unitdata_opt_process(q, mp, error,
+ &attrs) < 0)
+ goto done;
+ /*
+ * Note: success in processing options.
+ * mp option buffer represented by
+ * OPT_length/offset now potentially modified
+ * and contain option setting results
+ */
+ ASSERT(*error == 0);
+ }
}
}
/* mp1 points to the M_DATA mblk carrying the packet */
ASSERT(mp1 != NULL && DB_TYPE(mp1) == M_DATA);
+ /*
+ * Determine whether we need to mark the mblk with the user's
+ * credentials.
+ */
+ ire = connp->conn_ire_cache;
+ if (is_system_labeled() || CLASSD(v4dst) || (ire == NULL) ||
+ (ire->ire_addr != v4dst) ||
+ (ire->ire_type & (IRE_BROADCAST | IRE_LOCAL | IRE_LOOPBACK))) {
+ if (cr != NULL && DB_CRED(mp) == NULL)
+ msg_setcredpid(mp, cr, pid);
+ }
+
rw_enter(&udp->udp_rwlock, RW_READER);
lock_held = B_TRUE;
/*
@@ -6235,7 +5462,7 @@ udp_xmit(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, zoneid_t zoneid)
ipha_t *ipha = (ipha_t *)mp->b_rptr;
udp_stack_t *us = udp->udp_us;
ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
- boolean_t ll_multicast = B_FALSE;
+ boolean_t ll_multicast = B_FALSE;
dev_q = ire->ire_stq->q_next;
ASSERT(dev_q != NULL);
@@ -6248,6 +5475,7 @@ udp_xmit(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, zoneid_t zoneid)
DEV_Q_FLOW_BLOCKED(dev_q)) {
BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
+
if (ipst->ips_ip_output_queue)
(void) putq(connp->conn_wq, mp);
else
@@ -6397,11 +5625,11 @@ udp_update_label_v6(queue_t *wq, mblk_t *mp, in6_addr_t *dst)
return (err);
}
-void
-udp_output_connected(void *arg, mblk_t *mp)
+static int
+udp_send_connected(conn_t *connp, mblk_t *mp, struct nmsghdr *msg, cred_t *cr,
+ pid_t pid)
{
- conn_t *connp = (conn_t *)arg;
- udp_t *udp = connp->conn_udp;
+ udp_t *udp = connp->conn_udp;
udp_stack_t *us = udp->udp_us;
ipaddr_t v4dst;
in_port_t dstport;
@@ -6416,7 +5644,7 @@ udp_output_connected(void *arg, mblk_t *mp)
/* M_DATA for connected socket */
- ASSERT(udp->udp_issocket);
+ ASSERT(udp->udp_issocket || IPCL_IS_NONSTR(connp));
UDP_DBGSTAT(us, udp_data_conn);
mutex_enter(&connp->conn_lock);
@@ -6428,7 +5656,7 @@ udp_output_connected(void *arg, mblk_t *mp)
TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
"udp_wput_end: connp %p (%S)", connp,
"not-connected; address required");
- return;
+ return (EDESTADDRREQ);
}
mapped_addr = IN6_IS_ADDR_V4MAPPED(&udp->udp_v6dst);
@@ -6466,20 +5694,100 @@ udp_output_connected(void *arg, mblk_t *mp)
* family of the socket.
*/
mp = udp_output_v4(connp, mp, v4dst, dstport, 0, &error,
- insert_spi);
+ insert_spi, msg, cr, pid);
} else {
- mp = udp_output_v6(connp, mp, sin6, &error);
+ mp = udp_output_v6(connp, mp, sin6, &error, msg, cr, pid);
}
if (error == 0) {
ASSERT(mp == NULL);
- return;
+ return (0);
}
UDP_STAT(us, udp_out_err_output);
ASSERT(mp != NULL);
- /* mp is freed by the following routine */
- udp_ud_err(connp->conn_wq, mp, (uchar_t *)addr, (t_scalar_t)addrlen,
- (t_scalar_t)error);
+ if (IPCL_IS_NONSTR(connp)) {
+ freemsg(mp);
+ return (error);
+ } else {
+ /* mp is freed by the following routine */
+ udp_ud_err(connp->conn_wq, mp, (uchar_t *)addr,
+ (t_scalar_t)addrlen, (t_scalar_t)error);
+ return (0);
+ }
+}
+
+/*
+ * Send mp to the explicit destination in addr.
+ *
+ * Dispatches on udp_family: an AF_INET6 endpoint with a destination that is
+ * not IPv4-mapped goes to udp_output_v6(); an IPv4-mapped destination or an
+ * AF_INET endpoint has its port, IPv4 address and source id extracted and
+ * goes to udp_output_v4(). msg, cr and pid are passed through unchanged.
+ *
+ * Returns 0 on success. On failure returns a nonzero errno value and does
+ * not free mp (the error path asserts mp != NULL), so the caller must
+ * dispose of it.
+ *
+ * addrlen is not examined here, and neither the address family stored in
+ * addr nor its alignment is checked; presumably the caller has validated
+ * the address -- TODO confirm.
+ *
+ * NOTE(review): the switch has no default case, so for any udp_family other
+ * than AF_INET or AF_INET6, port, v4dst and srcid would reach
+ * udp_output_v4() uninitialized.
+ */
+/* ARGSUSED */
+static int
+udp_send_not_connected(conn_t *connp, mblk_t *mp, struct sockaddr *addr,
+ socklen_t addrlen, struct nmsghdr *msg, cred_t *cr, pid_t pid)
+{
+
+ udp_t *udp = connp->conn_udp;
+ boolean_t insert_spi = udp->udp_nat_t_endpoint;
+ int error = 0;
+ sin6_t *sin6;
+ sin_t *sin;
+ uint_t srcid;
+ uint16_t port;
+ ipaddr_t v4dst;
+
+
+ ASSERT(addr != NULL);
+
+ switch (udp->udp_family) {
+ case AF_INET6:
+ sin6 = (sin6_t *)addr;
+ if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
+ /*
+ * Destination is a non-IPv4-compatible IPv6 address.
+ * Send out an IPv6 format packet.
+ */
+ mp = udp_output_v6(connp, mp, sin6, &error, msg, cr,
+ pid);
+ if (error != 0)
+ goto ud_error;
+
+ return (0);
+ }
+ /*
+ * If the local address is not zero or a mapped address
+ * return an error. It would be possible to send an IPv4
+ * packet but the response would never make it back to the
+ * application since it is bound to a non-mapped address.
+ */
+ if (!IN6_IS_ADDR_V4MAPPED(&udp->udp_v6src) &&
+ !IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)) {
+ error = EADDRNOTAVAIL;
+ goto ud_error;
+ }
+ /* Send IPv4 packet without modifying udp_ipversion */
+ /* Extract port and ipaddr */
+ port = sin6->sin6_port;
+ IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, v4dst);
+ srcid = sin6->__sin6_src_id;
+ break;
+
+ case AF_INET:
+ sin = (sin_t *)addr;
+ /* Extract port and ipaddr */
+ port = sin->sin_port;
+ v4dst = sin->sin_addr.s_addr;
+ srcid = 0;
+ break;
+ }
+
+ mp = udp_output_v4(connp, mp, v4dst, port, srcid, &error, insert_spi,
+ msg, cr, pid);
+
+ if (error == 0) {
+ ASSERT(mp == NULL);
+ return (0);
+ }
+
+ud_error:
+ ASSERT(mp != NULL);
+
+ return (error);
 }
/*
@@ -6496,18 +5804,12 @@ udp_output_connected(void *arg, mblk_t *mp)
void
udp_wput(queue_t *q, mblk_t *mp)
{
- sin6_t *sin6;
- sin_t *sin;
- ipaddr_t v4dst;
- uint16_t port;
- uint_t srcid;
conn_t *connp = Q_TO_CONN(q);
udp_t *udp = connp->conn_udp;
int error = 0;
struct sockaddr *addr;
socklen_t addrlen;
- udp_stack_t *us = udp->udp_us;
- boolean_t insert_spi = udp->udp_nat_t_endpoint;
+ udp_stack_t *us = udp->udp_us;
TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_START,
"udp_wput_start: queue %p mp %p", q, mp);
@@ -6533,7 +5835,7 @@ udp_wput(queue_t *q, mblk_t *mp)
"not-connected; address required");
return;
}
- udp_output_connected(connp, mp);
+ (void) udp_send_connected(connp, mp, NULL, NULL, -1);
return;
case M_PROTO:
@@ -6587,67 +5889,8 @@ udp_wput(queue_t *q, mblk_t *mp)
}
ASSERT(addr != NULL);
- switch (udp->udp_family) {
- case AF_INET6:
- sin6 = (sin6_t *)addr;
- if (!OK_32PTR((char *)sin6) || (addrlen != sizeof (sin6_t)) ||
- (sin6->sin6_family != AF_INET6)) {
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
- "udp_wput_end: q %p (%S)", q, "badaddr");
- error = EADDRNOTAVAIL;
- goto ud_error;
- }
-
- if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
- /*
- * Destination is a non-IPv4-compatible IPv6 address.
- * Send out an IPv6 format packet.
- */
- mp = udp_output_v6(connp, mp, sin6, &error);
- if (error != 0)
- goto ud_error;
-
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
- "udp_wput_end: q %p (%S)", q, "udp_output_v6");
- return;
- }
- /*
- * If the local address is not zero or a mapped address
- * return an error. It would be possible to send an IPv4
- * packet but the response would never make it back to the
- * application since it is bound to a non-mapped address.
- */
- if (!IN6_IS_ADDR_V4MAPPED(&udp->udp_v6src) &&
- !IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)) {
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
- "udp_wput_end: q %p (%S)", q, "badaddr");
- error = EADDRNOTAVAIL;
- goto ud_error;
- }
- /* Send IPv4 packet without modifying udp_ipversion */
- /* Extract port and ipaddr */
- port = sin6->sin6_port;
- IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, v4dst);
- srcid = sin6->__sin6_src_id;
- break;
-
- case AF_INET:
- sin = (sin_t *)addr;
- if ((!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t)) ||
- (sin->sin_family != AF_INET)) {
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
- "udp_wput_end: q %p (%S)", q, "badaddr");
- error = EADDRNOTAVAIL;
- goto ud_error;
- }
- /* Extract port and ipaddr */
- port = sin->sin_port;
- v4dst = sin->sin_addr.s_addr;
- srcid = 0;
- break;
- }
-
- mp = udp_output_v4(connp, mp, v4dst, port, srcid, &error, insert_spi);
+ error = udp_send_not_connected(connp, mp, addr, addrlen, NULL, NULL,
+ -1);
if (error != 0) {
ud_error:
UDP_STAT(us, udp_out_err_output);
@@ -6658,13 +5901,25 @@ ud_error:
}
}
+/*
+ * Write-side handler that discards whatever it is given: on DEBUG builds it
+ * logs a notice with cmn_err(), then it frees mp. wq is not used.
+ * Presumably installed as the write put routine while the endpoint is in
+ * the middle of a fallback -- confirm where this function is referenced.
+ */
+/* ARGSUSED */
+static void
+udp_wput_fallback(queue_t *wq, mblk_t *mp)
+{
+#ifdef DEBUG
+ cmn_err(CE_CONT, "udp_wput_fallback: Message in fallback \n");
+#endif
+ freemsg(mp);
+}
+
+
/*
* udp_output_v6():
* Assumes that udp_wput did some sanity checking on the destination
* address.
*/
static mblk_t *
-udp_output_v6(conn_t *connp, mblk_t *mp, sin6_t *sin6, int *error)
+udp_output_v6(conn_t *connp, mblk_t *mp, sin6_t *sin6, int *error,
+ struct nmsghdr *msg, cred_t *cr, pid_t pid)
{
ip6_t *ip6h;
ip6i_t *ip6i; /* mp1->b_rptr even if no ip6i_t */
@@ -6674,6 +5929,7 @@ udp_output_v6(conn_t *connp, mblk_t *mp, sin6_t *sin6, int *error)
size_t ip_len;
udpha_t *udph;
udp_t *udp = connp->conn_udp;
+ udp_stack_t *us = udp->udp_us;
queue_t *q = connp->conn_wq;
ip6_pkt_t ipp_s; /* For ancillary data options */
ip6_pkt_t *ipp = &ipp_s;
@@ -6689,8 +5945,8 @@ udp_output_v6(conn_t *connp, mblk_t *mp, sin6_t *sin6, int *error)
ip6_hbh_t *hopoptsptr = NULL;
uint_t hopoptslen = 0;
boolean_t is_ancillary = B_FALSE;
- udp_stack_t *us = udp->udp_us;
size_t sth_wroff = 0;
+ ire_t *ire;
*error = 0;
@@ -6714,19 +5970,51 @@ udp_output_v6(conn_t *connp, mblk_t *mp, sin6_t *sin6, int *error)
*/
attrs.udpattr_credset = B_FALSE;
opt_present = B_FALSE;
- if (DB_TYPE(mp) != M_DATA) {
- mp1 = mp->b_cont;
- if (((struct T_unitdata_req *)mp->b_rptr)->OPT_length != 0) {
+ if (IPCL_IS_NONSTR(connp)) {
+ if (msg->msg_controllen != 0) {
attrs.udpattr_ipp6 = ipp;
attrs.udpattr_mb = mp;
- if (udp_unitdata_opt_process(q, mp, error,
- &attrs) < 0) {
+
+ rw_enter(&udp->udp_rwlock, RW_WRITER);
+ *error = process_auxiliary_options(connp,
+ msg->msg_control, msg->msg_controllen,
+ &attrs, &udp_opt_obj, udp_opt_set);
+ rw_exit(&udp->udp_rwlock);
+ if (*error)
goto done;
- }
ASSERT(*error == 0);
opt_present = B_TRUE;
}
+ } else {
+ if (DB_TYPE(mp) != M_DATA) {
+ mp1 = mp->b_cont;
+ if (((struct T_unitdata_req *)
+ mp->b_rptr)->OPT_length != 0) {
+ attrs.udpattr_ipp6 = ipp;
+ attrs.udpattr_mb = mp;
+ if (udp_unitdata_opt_process(q, mp, error,
+ &attrs) < 0) {
+ goto done;
+ }
+ ASSERT(*error == 0);
+ opt_present = B_TRUE;
+ }
+ }
}
+
+ /*
+ * Determine whether we need to mark the mblk with the user's
+ * credentials.
+ */
+ ire = connp->conn_ire_cache;
+ if (is_system_labeled() || IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr) ||
+ (ire == NULL) ||
+ (!IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, &sin6->sin6_addr)) ||
+ (ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK))) {
+ if (cr != NULL && DB_CRED(mp) == NULL)
+ msg_setcredpid(mp, cr, pid);
+ }
+
rw_enter(&udp->udp_rwlock, RW_READER);
ignore = ipp->ipp_sticky_ignored;
@@ -7268,7 +6556,7 @@ no_options:
done:
if (sth_wroff != 0) {
- (void) mi_set_sth_wroff(RD(q),
+ (void) proto_set_tx_wroff(RD(q), connp,
udp->udp_max_hdr_len + us->us_wroff_extra);
}
if (hopoptsptr != NULL && !is_ancillary) {
@@ -7284,7 +6572,7 @@ done:
static int
-udp_getpeername(udp_t *udp, struct sockaddr *sa, uint_t *salenp)
+i_udp_getpeername(udp_t *udp, struct sockaddr *sa, uint_t *salenp)
{
sin_t *sin = (sin_t *)sa;
sin6_t *sin6 = (sin6_t *)sa;
@@ -7404,7 +6692,7 @@ udp_wput_cmdblk(queue_t *q, mblk_t *mp)
rw_enter(&udp->udp_rwlock, RW_READER);
switch (cmdp->cb_cmd) {
case TI_GETPEERNAME:
- cmdp->cb_error = udp_getpeername(udp, data, &cmdp->cb_len);
+ cmdp->cb_error = i_udp_getpeername(udp, data, &cmdp->cb_len);
break;
case TI_GETMYNAME:
cmdp->cb_error = udp_getmyname(udp, data, &cmdp->cb_len);
@@ -7419,6 +6707,21 @@ udp_wput_cmdblk(queue_t *q, mblk_t *mp)
}
 static void
+udp_disable_direct_sockfs(udp_t *udp)
+{
+ /*
+ * Mark the endpoint as no longer a socket. If the direct-sockfs
+ * receive path is active, drain it through udp_rcv_drain() on the
+ * conn's read queue; that call is expected to clear
+ * udp_direct_sockfs (asserted below), after which the
+ * udp_sock_fallback statistic is bumped.
+ */
+ udp->udp_issocket = B_FALSE;
+ if (udp->udp_direct_sockfs) {
+ /*
+ * Disable read-side synchronous stream interface and
+ * drain any queued data.
+ */
+ udp_rcv_drain(udp->udp_connp->conn_rq, udp, B_FALSE);
+ ASSERT(!udp->udp_direct_sockfs);
+ UDP_STAT(udp->udp_us, udp_sock_fallback);
+ }
+}
+
+static void
udp_wput_other(queue_t *q, mblk_t *mp)
{
uchar_t *rptr = mp->b_rptr;
@@ -7458,12 +6761,12 @@ udp_wput_other(queue_t *q, mblk_t *mp)
return;
case O_T_BIND_REQ:
case T_BIND_REQ:
- udp_bind(q, mp);
+ udp_tpi_bind(q, mp);
TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
"udp_wput_other_end: q %p (%S)", q, "bindreq");
return;
case T_CONN_REQ:
- udp_connect(q, mp);
+ udp_tpi_connect(q, mp);
TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
"udp_wput_other_end: q %p (%S)", q, "connreq");
return;
@@ -7488,7 +6791,7 @@ udp_wput_other(queue_t *q, mblk_t *mp)
"udp_wput_other_end: q %p (%S)", q, "unitdatareq");
return;
case T_UNBIND_REQ:
- udp_unbind(q, mp);
+ udp_tpi_unbind(q, mp);
TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
"udp_wput_other_end: q %p (%S)", q, "unbindreq");
return;
@@ -7509,7 +6812,7 @@ udp_wput_other(queue_t *q, mblk_t *mp)
return;
case T_DISCON_REQ:
- udp_disconnect(q, mp);
+ udp_tpi_disconnect(q, mp);
TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
"udp_wput_other_end: q %p (%S)", q, "disconreq");
return;
@@ -7596,18 +6899,8 @@ udp_wput_other(queue_t *q, mblk_t *mp)
DB_TYPE(mp) = M_IOCNAK;
iocp->ioc_error = EINVAL;
} else {
- udp->udp_issocket = B_FALSE;
- if (udp->udp_direct_sockfs) {
- /*
- * Disable read-side synchronous
- * stream interface and drain any
- * queued data.
- */
- udp_rcv_drain(RD(q), udp,
- B_FALSE);
- ASSERT(!udp->udp_direct_sockfs);
- UDP_STAT(us, udp_sock_fallback);
- }
+ udp_disable_direct_sockfs(udp);
+
DB_TYPE(mp) = M_IOCACK;
iocp->ioc_error = 0;
}
@@ -7640,12 +6933,12 @@ udp_wput_other(queue_t *q, mblk_t *mp)
static void
udp_wput_iocdata(queue_t *q, mblk_t *mp)
{
- mblk_t *mp1;
- struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
+ mblk_t *mp1;
+ struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
STRUCT_HANDLE(strbuf, sb);
- udp_t *udp = Q_TO_UDP(q);
- int error;
- uint_t addrlen;
+ udp_t *udp = Q_TO_UDP(q);
+ int error;
+ uint_t addrlen;
/* Make sure it is one of ours. */
switch (iocp->ioc_cmd) {
@@ -7699,16 +6992,17 @@ udp_wput_iocdata(queue_t *q, mblk_t *mp)
}
mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE);
+
if (mp1 == NULL)
return;
rw_enter(&udp->udp_rwlock, RW_READER);
switch (iocp->ioc_cmd) {
case TI_GETMYNAME:
- error = udp_getmyname(udp, (void *)mp1->b_rptr, &addrlen);
+ error = udp_do_getsockname(udp, (void *)mp1->b_rptr, &addrlen);
break;
case TI_GETPEERNAME:
- error = udp_getpeername(udp, (void *)mp1->b_rptr, &addrlen);
+ error = udp_do_getpeername(udp, (void *)mp1->b_rptr, &addrlen);
break;
}
rw_exit(&udp->udp_rwlock);
@@ -7755,7 +7049,7 @@ udp_unitdata_opt_process(queue_t *q, mblk_t *mp, int *errorp,
}
void
-udp_ddi_init(void)
+udp_ddi_g_init(void)
{
udp_max_optsize = optcom_max_optsize(udp_opt_obj.odb_opt_des_arr,
udp_opt_obj.odb_opt_arr_cnt);
@@ -7769,11 +7063,13 @@ udp_ddi_init(void)
}
+/*
+ * Drop the NS_UDP registration by calling netstack_unregister().
+ */
void
-udp_ddi_destroy(void)
+udp_ddi_g_destroy(void)
{
netstack_unregister(NS_UDP);
}
+#define INET_NAME "ip"
+
/*
* Initialize the UDP stack instance.
*/
@@ -7783,6 +7079,8 @@ udp_stack_init(netstackid_t stackid, netstack_t *ns)
udp_stack_t *us;
udpparam_t *pa;
int i;
+ int error = 0;
+ major_t major;
us = (udp_stack_t *)kmem_zalloc(sizeof (*us), KM_SLEEP);
us->us_netstack = ns;
@@ -7825,6 +7123,10 @@ udp_stack_init(netstackid_t stackid, netstack_t *ns)
us->us_kstat = udp_kstat2_init(stackid, &us->us_statistics);
us->us_mibkp = udp_kstat_init(stackid);
+
+ major = mod_name_to_major(INET_NAME);
+ error = ldi_ident_from_major(major, &us->us_ldi_ident);
+ ASSERT(error == 0);
return (us);
}
@@ -7856,6 +7158,8 @@ udp_stack_fini(netstackid_t stackid, void *arg)
udp_kstat2_fini(stackid, us->us_kstat);
us->us_kstat = NULL;
bzero(&us->us_statistics, sizeof (us->us_statistics));
+
+ ldi_ident_release(us->us_ldi_ident);
kmem_free(us, sizeof (*us));
}
@@ -8192,8 +7496,6 @@ udp_rcv_drain(queue_t *q, udp_t *udp, boolean_t closing)
mblk_t *mp;
udp_stack_t *us = udp->udp_us;
- ASSERT(q == RD(q));
-
mutex_enter(&udp->udp_drain_lock);
/*
* There is no race with a concurrent udp_input() sending
@@ -8222,6 +7524,7 @@ udp_rcv_drain(queue_t *q, udp_t *udp, boolean_t closing)
if (closing) {
freemsg(mp);
} else {
+ ASSERT(q == RD(q));
putnext(q, mp);
}
}
@@ -8282,3 +7585,1802 @@ udp_lwput(queue_t *q, mblk_t *mp)
{
freemsg(mp);
}
+
+/*
+ * Below routines for UDP socket module.
+ */
+
+/*
+ * Allocate a UDP conn_t for the netstack selected by credp and fill in
+ * its initial (unbound) state.
+ *
+ * credp - credentials of the opener; a hold is taken and stored in
+ *	   conn_cred.
+ * isv6	 - B_TRUE for an AF_INET6 endpoint, B_FALSE for AF_INET.
+ * flags - KM_SLEEP or KM_NOSLEEP, handed to ipcl_conn_create().
+ *
+ * Returns the new conn_t, or NULL if ipcl_conn_create() fails.
+ */
+static conn_t *
+udp_do_open(cred_t *credp, boolean_t isv6, int flags)
+{
+	netstack_t	*ns;
+	udp_stack_t	*us;
+	conn_t		*connp;
+	udp_t		*udp;
+	zoneid_t	zoneid;
+
+	ns = netstack_find_by_cred(credp);
+	ASSERT(ns != NULL);
+	us = ns->netstack_udp;
+	ASSERT(us != NULL);
+
+	/*
+	 * An exclusive stack gets zoneid zero so that UDP behaves as it
+	 * would in the global zone; otherwise use the caller's zone.
+	 */
+	zoneid = (ns->netstack_stackid != GLOBAL_NETSTACKID) ?
+	    GLOBAL_ZONEID : crgetzoneid(credp);
+
+	ASSERT(flags == KM_SLEEP || flags == KM_NOSLEEP);
+
+	connp = ipcl_conn_create(IPCL_UDPCONN, flags, ns);
+
+	/*
+	 * The hold from netstack_find_by_cred() is dropped either way:
+	 * on success ipcl_conn_create() took its own netstack_hold, and
+	 * on failure there is nothing left to reference the stack.
+	 */
+	netstack_rele(ns);
+	if (connp == NULL)
+		return (NULL);
+
+	udp = connp->conn_udp;
+
+	rw_enter(&udp->udp_rwlock, RW_WRITER);
+	ASSERT(connp->conn_ulp == IPPROTO_UDP);
+	ASSERT(connp->conn_udp == udp);
+	ASSERT(udp->udp_connp == connp);
+
+	/* Initial stream state plus the address-family specific defaults. */
+	udp->udp_state = TS_UNBND;
+	if (!isv6) {
+		udp->udp_family = AF_INET;
+		udp->udp_ipversion = IPV4_VERSION;
+		udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + UDPH_SIZE;
+		udp->udp_ttl = us->us_ipv4_ttl;
+		connp->conn_af_isv6 = B_FALSE;
+		connp->conn_flags &= ~IPCL_ISV6;
+	} else {
+		udp->udp_family = AF_INET6;
+		udp->udp_ipversion = IPV6_VERSION;
+		udp->udp_max_hdr_len = IPV6_HDR_LEN + UDPH_SIZE;
+		udp->udp_ttl = us->us_ipv6_hoplimit;
+		connp->conn_af_isv6 = B_TRUE;
+		connp->conn_flags |= IPCL_ISV6;
+	}
+
+	udp->udp_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
+	udp->udp_pending_op = -1;	/* no TPI-style operation in flight */
+	connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
+	connp->conn_zoneid = zoneid;
+
+	udp->udp_open_time = lbolt64;
+	udp->udp_open_pid = curproc->p_pid;
+
+	/*
+	 * A caller with the process-wide NET_MAC_AWARE flag defaults to
+	 * MAC exempt mode, which permits read-down to unlabeled hosts.
+	 */
+	if (getpflags(NET_MAC_AWARE, credp) != 0)
+		connp->conn_mac_exempt = B_TRUE;
+
+	connp->conn_ulp_labeled = is_system_labeled();
+
+	udp->udp_us = us;
+
+	connp->conn_recv = udp_input;
+	crhold(credp);
+	connp->conn_cred = credp;
+
+	*((sin6_t *)&udp->udp_delayed_addr) = sin6_null;
+
+	rw_exit(&udp->udp_rwlock);
+
+	return (connp);
+}
+
+/*
+ * Socket-module create entry point for UDP.
+ *
+ * Accepts only SOCK_DGRAM over AF_INET/AF_INET6 with protocol 0 or
+ * IPPROTO_UDP.  On success returns the new conn_t as the lower handle,
+ * sets *errorp to 0, *smodep to SM_ATOMIC and *sock_downcalls to the UDP
+ * downcall vector.  On failure returns NULL with *errorp set
+ * (EPROTONOSUPPORT, ENOMEM, or the error from udp_build_hdrs() /
+ * ip_create_helper_stream()); *smodep and *sock_downcalls are untouched.
+ */
+/* ARGSUSED */
+sock_lower_handle_t
+udp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
+    uint_t *smodep, int *errorp, int flags, cred_t *credp)
+{
+	conn_t		*connp;
+	udp_t		*udp = NULL;
+	udp_stack_t	*us;
+	boolean_t	isv6 = (family == AF_INET6) ? B_TRUE : B_FALSE;
+
+	/* Reject anything that is not a UDP datagram socket. */
+	if (type != SOCK_DGRAM ||
+	    !(family == AF_INET || family == AF_INET6) ||
+	    !(proto == 0 || proto == IPPROTO_UDP)) {
+		*errorp = EPROTONOSUPPORT;
+		return (NULL);
+	}
+
+	connp = udp_do_open(credp, isv6, flags);
+	if (connp == NULL) {
+		*errorp = ENOMEM;
+		return (NULL);
+	}
+
+	udp = connp->conn_udp;
+	ASSERT(udp != NULL);
+	us = udp->udp_us;
+	ASSERT(us != NULL);
+
+	/* This conn is a non-STREAMS socket endpoint. */
+	connp->conn_flags |= IPCL_NONSTR | IPCL_SOCKET;
+
+	/* Receive/transmit flow-control defaults come from the stack. */
+	rw_enter(&udp->udp_rwlock, RW_WRITER);
+	(void) udp_set_rcv_hiwat(udp, us->us_recv_hiwat);
+	udp->udp_rcv_disply_hiwat = us->us_recv_hiwat;
+	udp->udp_rcv_lowat = udp_mod_info.mi_lowat;
+	udp->udp_xmit_hiwat = us->us_xmit_hiwat;
+	udp->udp_xmit_lowat = us->us_xmit_lowat;
+
+	if (udp->udp_family == AF_INET6) {
+		/* IPv6 needs its transmit header template built up front. */
+		*errorp = udp_build_hdrs(udp);
+		if (*errorp != 0) {
+			rw_exit(&udp->udp_rwlock);
+			ipcl_conn_destroy(connp);
+			return (NULL);
+		}
+	}
+	rw_exit(&udp->udp_rwlock);
+
+	connp->conn_flow_cntrld = B_FALSE;
+
+	ASSERT(us->us_ldi_ident != NULL);
+
+	*errorp = ip_create_helper_stream(connp, us->us_ldi_ident);
+	if (*errorp != 0) {
+		ip1dbg(("create of IP helper stream failed\n"));
+		udp_do_close(connp);
+		return (NULL);
+	}
+
+	/* Send-side flow control lives on the write queue. */
+	connp->conn_wq->q_hiwat = us->us_xmit_hiwat;
+	connp->conn_wq->q_lowat = us->us_xmit_lowat;
+
+	/* Setup is complete: the conn is no longer incipient. */
+	mutex_enter(&connp->conn_lock);
+	connp->conn_state_flags &= ~CONN_INCIPIENT;
+	mutex_exit(&connp->conn_lock);
+
+	*errorp = 0;
+	*smodep = SM_ATOMIC;
+	*sock_downcalls = &sock_udp_downcalls;
+	return ((sock_lower_handle_t)connp);
+}
+
+/*
+ * Socket-module activate entry point: remember the upper-layer handle and
+ * upcall vector on the conn, then report UDP's protocol properties
+ * (write offset, receive high-water mark, block and packet size limits)
+ * to the socket layer through su_set_proto_props.
+ */
+/* ARGSUSED */
+void
+udp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle,
+    sock_upcalls_t *sock_upcalls, int flags, cred_t *cr)
+{
+	conn_t			*connp = (conn_t *)proto_handle;
+	udp_t			*udp = connp->conn_udp;
+	udp_stack_t		*us = udp->udp_us;
+	struct sock_proto_props	props;
+
+	connp->conn_upper_handle = sock_handle;
+	connp->conn_upcalls = sock_upcalls;
+
+	props.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT |
+	    SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ;
+	props.sopp_wroff = udp->udp_max_hdr_len + us->us_wroff_extra;
+	props.sopp_maxblk = INFPSZ;
+	props.sopp_rxhiwat = udp->udp_rcv_hiwat;
+	props.sopp_maxaddrlen = sizeof (sin6_t);
+
+	/* Largest datagram depends on the address family. */
+	if (udp->udp_family == AF_INET)
+		props.sopp_maxpsz = UDP_MAXPACKET_IPV4;
+	else
+		props.sopp_maxpsz = UDP_MAXPACKET_IPV6;
+
+	/* A module minimum of 1 is reported to the socket layer as 0. */
+	if (udp_mod_info.mi_minpsz == 1)
+		props.sopp_minpsz = 0;
+	else
+		props.sopp_minpsz = udp_mod_info.mi_minpsz;
+
+	(*connp->conn_upcalls->su_set_proto_props)(connp->conn_upper_handle,
+	    &props);
+}
+
+/*
+ * Common close path for a UDP conn, used for both STREAMS and
+ * non-STREAMS (socket) endpoints.
+ *
+ * Quiesces the conn in UDP and IP, discards any data still queued on the
+ * direct sockfs receive list (STREAMS case only), frees per-endpoint
+ * state via udp_close_free(), releases either the minor number or the IP
+ * helper stream, and finally destroys the conn.  The caller must hold the
+ * last reference (conn_ref == 1).
+ */
+static void
+udp_do_close(conn_t *connp)
+{
+	udp_t	*udp;
+
+	ASSERT(connp != NULL && IPCL_IS_UDP(connp));
+	udp = connp->conn_udp;
+
+	udp_quiesce_conn(connp);
+	ip_quiesce_conn(connp);
+
+	if (!IPCL_IS_NONSTR(connp)) {
+		ASSERT(connp->conn_wq != NULL);
+		ASSERT(connp->conn_rq != NULL);
+
+		/*
+		 * Disable read-side synchronous stream
+		 * interface and drain any queued data.
+		 *
+		 * udp_rcv_drain() operates on the read queue (its
+		 * non-closing path asserts q == RD(q) before putnext),
+		 * so hand it conn_rq rather than the write queue, in
+		 * line with the other caller.  With closing == B_TRUE the
+		 * queued messages are freed rather than sent upstream.
+		 */
+		udp_rcv_drain(connp->conn_rq, udp, B_TRUE);
+		ASSERT(!udp->udp_direct_sockfs);
+
+		qprocsoff(connp->conn_rq);
+	}
+
+	/* Nothing may remain on the receive list at this point. */
+	ASSERT(udp->udp_rcv_cnt == 0);
+	ASSERT(udp->udp_rcv_msgcnt == 0);
+	ASSERT(udp->udp_rcv_list_head == NULL);
+	ASSERT(udp->udp_rcv_list_tail == NULL);
+
+	udp_close_free(connp);
+
+	/*
+	 * Now we are truly single threaded on this stream, and can
+	 * delete the things hanging off the connp, and finally the connp.
+	 * We removed this connp from the fanout list, it cannot be
+	 * accessed thru the fanouts, and we already waited for the
+	 * conn_ref to drop to 0. We are already in close, so
+	 * there cannot be any other thread from the top. qprocsoff
+	 * has completed, and service has completed or won't run in
+	 * future.
+	 */
+	ASSERT(connp->conn_ref == 1);
+	if (!IPCL_IS_NONSTR(connp)) {
+		/* STREAMS endpoint: give the device minor number back. */
+		inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
+	} else {
+		/* Socket endpoint: tear down the IP helper stream. */
+		ip_close_helper_stream(connp);
+	}
+
+	/* Drop the final reference by hand, then destroy the conn. */
+	connp->conn_ref--;
+	ipcl_conn_destroy(connp);
+}
+
+/*
+ * Socket-module close entry point.  The lower handle is the conn_t;
+ * all of the work is done by udp_do_close().  Always returns 0.
+ */
+/* ARGSUSED */
+int
+udp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
+{
+	udp_do_close((conn_t *)proto_handle);
+
+	return (0);
+}
+
+/*
+ * Bind a UDP endpoint to a local address and port.
+ *
+ * connp - the endpoint's conn_t.
+ * sa - address to bind to; not dereferenced when len is 0.
+ * len - 0 (wildcard address, anonymous port), sizeof (sin_t) or
+ * sizeof (sin6_t); any other value fails with -TBADADDR.
+ * cr - credentials used for the privileged-port check.
+ * bind_to_req_port_only - if B_TRUE and a non-zero port was requested,
+ * fail with -TADDRBUSY when that port is taken instead of searching
+ * for another one.
+ *
+ * Returns 0 on success, a positive errno value, or a negated TLI error
+ * (-TOUTSTATE, -TBADADDR, -TACCES, -TNOADDR, -TADDRBUSY). Callers in
+ * this file convert negative values with proto_tlitosyserr().
+ *
+ * For a non-zero len the chosen port is written back into the caller's
+ * sockaddr (sin_port / sin6_port).
+ */
+static int
+udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
+ boolean_t bind_to_req_port_only)
+{
+ sin_t *sin;
+ sin6_t *sin6;
+ sin6_t sin6addr;
+ in_port_t port; /* Host byte order */
+ in_port_t requested_port; /* Host byte order */
+ int count;
+ in6_addr_t v6src;
+ int loopmax;
+ udp_fanout_t *udpf;
+ in_port_t lport; /* Network byte order */
+ zoneid_t zoneid;
+ udp_t *udp;
+ boolean_t is_inaddr_any;
+ mlp_type_t addrtype, mlptype;
+ udp_stack_t *us;
+ int error = 0;
+ mblk_t *mp = NULL;
+
+ udp = connp->conn_udp;
+ us = udp->udp_us;
+
+ /* Cheap unlocked pre-check; repeated under udp_rwlock below. */
+ if (udp->udp_state != TS_UNBND) {
+ (void) strlog(UDP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
+ "udp_bind: bad state, %u", udp->udp_state);
+ return (-TOUTSTATE);
+ }
+
+ switch (len) {
+ case 0:
+ /* No address supplied: build a wildcard one on the stack. */
+ if (udp->udp_family == AF_INET) {
+ sin = (sin_t *)&sin6addr;
+ *sin = sin_null;
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = INADDR_ANY;
+ udp->udp_ipversion = IPV4_VERSION;
+ } else {
+ ASSERT(udp->udp_family == AF_INET6);
+ sin6 = (sin6_t *)&sin6addr;
+ *sin6 = sin6_null;
+ sin6->sin6_family = AF_INET6;
+ V6_SET_ZERO(sin6->sin6_addr);
+ udp->udp_ipversion = IPV6_VERSION;
+ }
+ port = 0;
+ break;
+
+ case sizeof (sin_t): /* Complete IPv4 address */
+ sin = (sin_t *)sa;
+
+ if (sin == NULL || !OK_32PTR((char *)sin))
+ return (EINVAL);
+
+ if (udp->udp_family != AF_INET ||
+ sin->sin_family != AF_INET) {
+ return (EAFNOSUPPORT);
+ }
+ port = ntohs(sin->sin_port);
+ break;
+
+ case sizeof (sin6_t): /* complete IPv6 address */
+ sin6 = (sin6_t *)sa;
+
+ if (sin6 == NULL || !OK_32PTR((char *)sin6))
+ return (EINVAL);
+
+ if (udp->udp_family != AF_INET6 ||
+ sin6->sin6_family != AF_INET6) {
+ return (EAFNOSUPPORT);
+ }
+ port = ntohs(sin6->sin6_port);
+ break;
+
+ default: /* Invalid request */
+ (void) strlog(UDP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
+ "udp_bind: bad ADDR_length length %u", len);
+ return (-TBADADDR);
+ }
+
+ requested_port = port;
+
+ /* Strict port matching only applies when a port was actually named. */
+ if (requested_port == 0 || !bind_to_req_port_only)
+ bind_to_req_port_only = B_FALSE;
+ else /* T_BIND_REQ and requested_port != 0 */
+ bind_to_req_port_only = B_TRUE;
+
+ if (requested_port == 0) {
+ /*
+ * If the application passed in zero for the port number, it
+ * doesn't care which port number we bind to. Get one in the
+ * valid range.
+ */
+ if (udp->udp_anon_priv_bind) {
+ port = udp_get_next_priv_port(udp);
+ } else {
+ port = udp_update_next_port(udp,
+ us->us_next_port_to_try, B_TRUE);
+ }
+ } else {
+ /*
+ * If the port is in the well-known privileged range,
+ * make sure the caller was privileged.
+ */
+ int i;
+ boolean_t priv = B_FALSE;
+
+ if (port < us->us_smallest_nonpriv_port) {
+ priv = B_TRUE;
+ } else {
+ for (i = 0; i < us->us_num_epriv_ports; i++) {
+ if (port == us->us_epriv_ports[i]) {
+ priv = B_TRUE;
+ break;
+ }
+ }
+ }
+
+ if (priv) {
+ if (secpolicy_net_privaddr(cr, port, IPPROTO_UDP) != 0)
+ return (-TACCES);
+ }
+ }
+
+ if (port == 0)
+ return (-TNOADDR);
+
+ /*
+ * The state must be TS_UNBND. TPI mandates that users must send
+ * TPI primitives only 1 at a time and wait for the response before
+ * sending the next primitive.
+ */
+ rw_enter(&udp->udp_rwlock, RW_WRITER);
+ if (udp->udp_state != TS_UNBND || udp->udp_pending_op != -1) {
+ rw_exit(&udp->udp_rwlock);
+ (void) strlog(UDP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
+ "udp_bind: bad state, %u", udp->udp_state);
+ return (-TOUTSTATE);
+ }
+ /* XXX how to remove the T_BIND_REQ? Should set it before calling */
+ udp->udp_pending_op = T_BIND_REQ;
+ /*
+ * Copy the source address into our udp structure. This address
+ * may still be zero; if so, IP will fill in the correct address
+ * each time an outbound packet is passed to it. Since the udp is
+ * not yet in the bind hash list, we don't grab the uf_lock to
+ * change udp_ipversion
+ */
+ if (udp->udp_family == AF_INET) {
+ ASSERT(sin != NULL);
+ ASSERT(udp->udp_ipversion == IPV4_VERSION);
+ udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + UDPH_SIZE +
+ udp->udp_ip_snd_options_len;
+ IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6src);
+ } else {
+ ASSERT(sin6 != NULL);
+ v6src = sin6->sin6_addr;
+ if (IN6_IS_ADDR_V4MAPPED(&v6src)) {
+ /*
+ * no need to hold the uf_lock to set the udp_ipversion
+ * since we are not yet in the fanout list
+ */
+ udp->udp_ipversion = IPV4_VERSION;
+ udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH +
+ UDPH_SIZE + udp->udp_ip_snd_options_len;
+ } else {
+ udp->udp_ipversion = IPV6_VERSION;
+ udp->udp_max_hdr_len = udp->udp_sticky_hdrs_len;
+ }
+ }
+
+ /*
+ * If udp_reuseaddr is not set, then we have to make sure that
+ * the IP address and port number the application requested
+ * (or we selected for the application) is not being used by
+ * another stream. If another stream is already using the
+ * requested IP address and port, the behavior depends on
+ * "bind_to_req_port_only". If set the bind fails; otherwise we
+ * search for an unused port to bind to the stream.
+ *
+ * As per the BSD semantics, as modified by the Deering multicast
+ * changes, if udp_reuseaddr is set, then we allow multiple binds
+ * to the same port independent of the local IP address.
+ *
+ * This is slightly different than in SunOS 4.X which did not
+ * support IP multicast. Note that the change implemented by the
+ * Deering multicast code affects all binds - not only binding
+ * to IP multicast addresses.
+ *
+ * Note that when binding to port zero we ignore SO_REUSEADDR in
+ * order to guarantee a unique port.
+ */
+
+ count = 0;
+ if (udp->udp_anon_priv_bind) {
+ /*
+ * loopmax = (IPPORT_RESERVED-1) -
+ * us->us_min_anonpriv_port + 1
+ */
+ loopmax = IPPORT_RESERVED - us->us_min_anonpriv_port;
+ } else {
+ loopmax = us->us_largest_anon_port -
+ us->us_smallest_anon_port + 1;
+ }
+
+ is_inaddr_any = V6_OR_V4_INADDR_ANY(v6src);
+ zoneid = connp->conn_zoneid;
+
+ /*
+ * Port search loop. Every "break" out of this loop leaves
+ * udpf->uf_lock held for the hash insertion that follows; the
+ * failure returns inside the loop drop it first.
+ */
+ for (;;) {
+ udp_t *udp1;
+ boolean_t found_exclbind = B_FALSE;
+
+ /*
+ * Walk through the list of udp streams bound to
+ * requested port with the same IP address.
+ */
+ lport = htons(port);
+ udpf = &us->us_bind_fanout[UDP_BIND_HASH(lport,
+ us->us_bind_fanout_size)];
+ mutex_enter(&udpf->uf_lock);
+ for (udp1 = udpf->uf_udp; udp1 != NULL;
+ udp1 = udp1->udp_bind_hash) {
+ if (lport != udp1->udp_port)
+ continue;
+
+ /*
+ * On a labeled system, we must treat bindings to ports
+ * on shared IP addresses by sockets with MAC exemption
+ * privilege as being in all zones, as there's
+ * otherwise no way to identify the right receiver.
+ */
+ if (!(IPCL_ZONE_MATCH(udp1->udp_connp, zoneid) ||
+ IPCL_ZONE_MATCH(connp,
+ udp1->udp_connp->conn_zoneid)) &&
+ !connp->conn_mac_exempt &&
+ !udp1->udp_connp->conn_mac_exempt)
+ continue;
+
+ /*
+ * If UDP_EXCLBIND is set for either the bound or
+ * binding endpoint, the semantics of bind
+ * is changed according to the following chart.
+ *
+ * spec = specified address (v4 or v6)
+ * unspec = unspecified address (v4 or v6)
+ * A = specified addresses are different for endpoints
+ *
+ * bound bind to allowed?
+ * -------------------------------------
+ * unspec unspec no
+ * unspec spec no
+ * spec unspec no
+ * spec spec yes if A
+ *
+ * For labeled systems, SO_MAC_EXEMPT behaves the same
+ * as UDP_EXCLBIND, except that zoneid is ignored.
+ */
+ if (udp1->udp_exclbind || udp->udp_exclbind ||
+ udp1->udp_connp->conn_mac_exempt ||
+ connp->conn_mac_exempt) {
+ if (V6_OR_V4_INADDR_ANY(
+ udp1->udp_bound_v6src) ||
+ is_inaddr_any ||
+ IN6_ARE_ADDR_EQUAL(&udp1->udp_bound_v6src,
+ &v6src)) {
+ found_exclbind = B_TRUE;
+ break;
+ }
+ continue;
+ }
+
+ /*
+ * Check ipversion to allow IPv4 and IPv6 sockets to
+ * have disjoint port number spaces.
+ */
+ if (udp->udp_ipversion != udp1->udp_ipversion) {
+
+ /*
+ * On the first time through the loop, if the
+ * user intentionally specified a
+ * particular port number, then ignore any
+ * bindings of the other protocol that may
+ * conflict. This allows the user to bind IPv6
+ * alone and get both v4 and v6, or bind
+ * both and get each separately. On subsequent
+ * times through the loop, we're checking a
+ * port that we chose (not the user) and thus
+ * we do not allow casual duplicate bindings.
+ */
+ if (count == 0 && requested_port != 0)
+ continue;
+ }
+
+ /*
+ * No difference depending on SO_REUSEADDR.
+ *
+ * If existing port is bound to a
+ * non-wildcard IP address and
+ * the requesting stream is bound to
+ * a distinct different IP addresses
+ * (non-wildcard, also), keep going.
+ */
+ if (!is_inaddr_any &&
+ !V6_OR_V4_INADDR_ANY(udp1->udp_bound_v6src) &&
+ !IN6_ARE_ADDR_EQUAL(&udp1->udp_bound_v6src,
+ &v6src)) {
+ continue;
+ }
+ break;
+ }
+
+ /* SO_REUSEADDR on an explicit port wins unless exclbind hit. */
+ if (!found_exclbind &&
+ (udp->udp_reuseaddr && requested_port != 0)) {
+ break;
+ }
+
+ if (udp1 == NULL) {
+ /*
+ * No other stream has this IP address
+ * and port number. We can use it.
+ */
+ break;
+ }
+ mutex_exit(&udpf->uf_lock);
+ if (bind_to_req_port_only) {
+ /*
+ * We get here only when requested port
+ * is bound (and only first of the for()
+ * loop iteration).
+ *
+ * The semantics of this bind request
+ * require it to fail so we return from
+ * the routine (and exit the loop).
+ *
+ */
+ udp->udp_pending_op = -1;
+ rw_exit(&udp->udp_rwlock);
+ return (-TADDRBUSY);
+ }
+
+ if (udp->udp_anon_priv_bind) {
+ port = udp_get_next_priv_port(udp);
+ } else {
+ if ((count == 0) && (requested_port != 0)) {
+ /*
+ * If the application wants us to find
+ * a port, get one to start with. Set
+ * requested_port to 0, so that we will
+ * update us->us_next_port_to_try below.
+ */
+ port = udp_update_next_port(udp,
+ us->us_next_port_to_try, B_TRUE);
+ requested_port = 0;
+ } else {
+ port = udp_update_next_port(udp, port + 1,
+ B_FALSE);
+ }
+ }
+
+ if (port == 0 || ++count >= loopmax) {
+ /*
+ * We've tried every possible port number and
+ * there are none available, so send an error
+ * to the user.
+ */
+ udp->udp_pending_op = -1;
+ rw_exit(&udp->udp_rwlock);
+ return (-TNOADDR);
+ }
+ }
+
+ /*
+ * Copy the source address into our udp structure. This address
+ * may still be zero; if so, ip will fill in the correct address
+ * each time an outbound packet is passed to it.
+ * If we are binding to a broadcast or multicast address then
+ * udp_post_ip_bind_connect will clear the source address
+ * once the IP bind below succeeds.
+ */
+ udp->udp_v6src = udp->udp_bound_v6src = v6src;
+ udp->udp_port = lport;
+ /*
+ * Now reset the next anonymous port if the application requested
+ * an anonymous port, or we handed out the next anonymous port.
+ */
+ if ((requested_port == 0) && (!udp->udp_anon_priv_bind)) {
+ us->us_next_port_to_try = port + 1;
+ }
+
+ /* Initialize the O_T_BIND_REQ/T_BIND_REQ for ip. */
+ if (udp->udp_family == AF_INET) {
+ sin->sin_port = udp->udp_port;
+ } else {
+ sin6->sin6_port = udp->udp_port;
+ /* Rebuild the header template */
+ error = udp_build_hdrs(udp);
+ if (error != 0) {
+ /*
+ * NOTE(review): udp_v6src, udp_bound_v6src and
+ * udp_port were already updated above and are not
+ * reset on this path - confirm that is intended.
+ */
+ udp->udp_pending_op = -1;
+ rw_exit(&udp->udp_rwlock);
+ mutex_exit(&udpf->uf_lock);
+ return (error);
+ }
+ }
+ udp->udp_state = TS_IDLE;
+ udp_bind_hash_insert(udpf, udp);
+ mutex_exit(&udpf->uf_lock);
+ rw_exit(&udp->udp_rwlock);
+
+ if (cl_inet_bind) {
+ /*
+ * Running in cluster mode - register bind information
+ */
+ if (udp->udp_ipversion == IPV4_VERSION) {
+ (*cl_inet_bind)(IPPROTO_UDP, AF_INET,
+ (uint8_t *)(&V4_PART_OF_V6(udp->udp_v6src)),
+ (in_port_t)udp->udp_port);
+ } else {
+ (*cl_inet_bind)(IPPROTO_UDP, AF_INET6,
+ (uint8_t *)&(udp->udp_v6src),
+ (in_port_t)udp->udp_port);
+ }
+
+ }
+
+ /*
+ * Labeled-system (multilevel port) checks.
+ *
+ * NOTE(review): the failure returns in this section clear
+ * udp_pending_op but leave the endpoint TS_IDLE and in the bind
+ * hash - confirm whether the bind should be undone here as well.
+ */
+ connp->conn_anon_port = (is_system_labeled() && requested_port == 0);
+ if (is_system_labeled() && (!connp->conn_anon_port ||
+ connp->conn_anon_mlp)) {
+ uint16_t mlpport;
+ cred_t *cr = connp->conn_cred;
+ zone_t *zone;
+
+ zone = crgetzone(cr);
+ connp->conn_mlp_type = udp->udp_recvucred ? mlptBoth :
+ mlptSingle;
+ addrtype = tsol_mlp_addr_type(zone->zone_id, IPV6_VERSION,
+ &v6src, us->us_netstack->netstack_ip);
+ if (addrtype == mlptSingle) {
+ rw_enter(&udp->udp_rwlock, RW_WRITER);
+ udp->udp_pending_op = -1;
+ rw_exit(&udp->udp_rwlock);
+ connp->conn_anon_port = B_FALSE;
+ connp->conn_mlp_type = mlptSingle;
+ return (-TNOADDR);
+ }
+ mlpport = connp->conn_anon_port ? PMAPPORT : port;
+ mlptype = tsol_mlp_port_type(zone, IPPROTO_UDP, mlpport,
+ addrtype);
+ if (mlptype != mlptSingle &&
+ (connp->conn_mlp_type == mlptSingle ||
+ secpolicy_net_bindmlp(cr) != 0)) {
+ if (udp->udp_debug) {
+ (void) strlog(UDP_MOD_ID, 0, 1,
+ SL_ERROR|SL_TRACE,
+ "udp_bind: no priv for multilevel port %d",
+ mlpport);
+ }
+ rw_enter(&udp->udp_rwlock, RW_WRITER);
+ udp->udp_pending_op = -1;
+ rw_exit(&udp->udp_rwlock);
+ connp->conn_anon_port = B_FALSE;
+ connp->conn_mlp_type = mlptSingle;
+ return (-TACCES);
+ }
+
+ /*
+ * If we're specifically binding a shared IP address and the
+ * port is MLP on shared addresses, then check to see if this
+ * zone actually owns the MLP. Reject if not.
+ */
+ if (mlptype == mlptShared && addrtype == mlptShared) {
+ /*
+ * No need to handle exclusive-stack zones since
+ * ALL_ZONES only applies to the shared stack.
+ */
+ zoneid_t mlpzone;
+
+ mlpzone = tsol_mlp_findzone(IPPROTO_UDP,
+ htons(mlpport));
+ if (connp->conn_zoneid != mlpzone) {
+ if (udp->udp_debug) {
+ (void) strlog(UDP_MOD_ID, 0, 1,
+ SL_ERROR|SL_TRACE,
+ "udp_bind: attempt to bind port "
+ "%d on shared addr in zone %d "
+ "(should be %d)",
+ mlpport, connp->conn_zoneid,
+ mlpzone);
+ }
+ rw_enter(&udp->udp_rwlock, RW_WRITER);
+ udp->udp_pending_op = -1;
+ rw_exit(&udp->udp_rwlock);
+ connp->conn_anon_port = B_FALSE;
+ connp->conn_mlp_type = mlptSingle;
+ return (-TACCES);
+ }
+ }
+ if (connp->conn_anon_port) {
+ error = tsol_mlp_anon(zone, mlptype, connp->conn_ulp,
+ port, B_TRUE);
+ if (error != 0) {
+ if (udp->udp_debug) {
+ (void) strlog(UDP_MOD_ID, 0, 1,
+ SL_ERROR|SL_TRACE,
+ "udp_bind: cannot establish anon "
+ "MLP for port %d", port);
+ }
+ rw_enter(&udp->udp_rwlock, RW_WRITER);
+ udp->udp_pending_op = -1;
+ rw_exit(&udp->udp_rwlock);
+ connp->conn_anon_port = B_FALSE;
+ connp->conn_mlp_type = mlptSingle;
+ return (-TACCES);
+ }
+ }
+ connp->conn_mlp_type = mlptype;
+ }
+
+ if (!V6_OR_V4_INADDR_ANY(udp->udp_v6src)) {
+ /*
+ * Append a request for an IRE if udp_v6src not
+ * zero (IPv4 - INADDR_ANY, or IPv6 - all-zeroes address).
+ */
+ mp = allocb(sizeof (ire_t), BPRI_HI);
+ if (!mp) {
+ /*
+ * The endpoint is already TS_IDLE and linked into the
+ * bind hash, so simply returning would leave it
+ * half-bound while the caller sees a failure. Undo
+ * the bind through udp_post_ip_bind_connect(), the
+ * same way a failed IP-level bind is undone below;
+ * that also clears udp_pending_op.
+ */
+ (void) udp_post_ip_bind_connect(udp, NULL, ENOMEM);
+ return (ENOMEM);
+ }
+ mp->b_wptr += sizeof (ire_t);
+ mp->b_datap->db_type = IRE_DB_REQ_TYPE;
+ }
+ /* Hand the local address/port to IP for the chosen family. */
+ if (udp->udp_family == AF_INET6) {
+ ASSERT(udp->udp_connp->conn_af_isv6);
+ error = ip_proto_bind_laddr_v6(connp, &mp, IPPROTO_UDP,
+ &udp->udp_bound_v6src, udp->udp_port, B_TRUE);
+ } else {
+ ASSERT(!udp->udp_connp->conn_af_isv6);
+ error = ip_proto_bind_laddr_v4(connp, &mp, IPPROTO_UDP,
+ V4_PART_OF_V6(udp->udp_bound_v6src), udp->udp_port,
+ B_TRUE);
+ }
+
+ /* Finalizes (or, on error, undoes) the bind and frees mp. */
+ (void) udp_post_ip_bind_connect(udp, mp, error);
+ return (error);
+}
+
+/*
+ * Socket-module bind entry point.
+ *
+ * A NULL sa requests an unbind; otherwise the endpoint is bound to sa
+ * and only the requested port is acceptable.  Negative (TLI) results
+ * from the helpers are converted to errno values: -TOUTSTATE becomes
+ * EINVAL, everything else goes through proto_tlitosyserr().
+ */
+int
+udp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
+    socklen_t len, cred_t *cr)
+{
+	conn_t	*connp = (conn_t *)proto_handle;
+	int	rv;
+
+	rv = (sa != NULL) ? udp_do_bind(connp, sa, len, cr, B_TRUE) :
+	    udp_do_unbind(connp);
+
+	/* Zero and positive errno values are passed through as they are. */
+	if (rv >= 0)
+		return (rv);
+
+	if (rv == -TOUTSTATE)
+		return (EINVAL);
+
+	return (proto_tlitosyserr(-rv));
+}
+
+/*
+ * Bind the endpoint without a caller-supplied address: udp_do_bind() is
+ * invoked with a zero-length address and without the requested-port-only
+ * restriction.  A negative (TLI) result is converted to an errno value.
+ */
+static int
+udp_implicit_bind(conn_t *connp, cred_t *cr)
+{
+	int	rv = udp_do_bind(connp, NULL, 0, cr, B_FALSE);
+
+	if (rv < 0)
+		rv = proto_tlitosyserr(-rv);
+
+	return (rv);
+}
+
+/*
+ * This routine removes a port number association from a stream. It
+ * is called by udp_unbind and udp_tpi_unbind; udp_bind() also calls it
+ * when handed a NULL address.
+ *
+ * Returns 0 on success, or -TOUTSTATE if the endpoint is not bound or
+ * another operation is still pending on it.
+ */
+static int
+udp_do_unbind(conn_t *connp)
+{
+ udp_t *udp = connp->conn_udp;
+ udp_fanout_t *udpf;
+ udp_stack_t *us = udp->udp_us;
+
+ /*
+ * NOTE(review): the cluster hook runs before the state check below,
+ * so it is invoked even when the call then fails with -TOUTSTATE -
+ * confirm that ordering is intended.
+ */
+ if (cl_inet_unbind != NULL) {
+ /*
+ * Running in cluster mode - register unbind information
+ */
+ if (udp->udp_ipversion == IPV4_VERSION) {
+ (*cl_inet_unbind)(IPPROTO_UDP, AF_INET,
+ (uint8_t *)(&V4_PART_OF_V6(udp->udp_v6src)),
+ (in_port_t)udp->udp_port);
+ } else {
+ (*cl_inet_unbind)(IPPROTO_UDP, AF_INET6,
+ (uint8_t *)&(udp->udp_v6src),
+ (in_port_t)udp->udp_port);
+ }
+ }
+
+ /* Must be bound, with no other operation in flight. */
+ rw_enter(&udp->udp_rwlock, RW_WRITER);
+ if (udp->udp_state == TS_UNBND || udp->udp_pending_op != -1) {
+ rw_exit(&udp->udp_rwlock);
+ return (-TOUTSTATE);
+ }
+ udp->udp_pending_op = T_UNBIND_REQ;
+ rw_exit(&udp->udp_rwlock);
+
+ /*
+ * Pass the unbind to IP. The lock is not held across this call.
+ * NOTE(review): the earlier wording about ip_unbind never returning
+ * NULL looks like a leftover from a message-based interface; no
+ * return value is used here.
+ */
+ ip_unbind(connp);
+
+ /*
+ * Once we're unbound from IP, the pending operation may be cleared
+ * here.
+ */
+ rw_enter(&udp->udp_rwlock, RW_WRITER);
+ udpf = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port,
+ us->us_bind_fanout_size)];
+
+ /* Leave the bind hash and forget the local address and port. */
+ mutex_enter(&udpf->uf_lock);
+ udp_bind_hash_remove(udp, B_TRUE);
+ V6_SET_ZERO(udp->udp_v6src);
+ V6_SET_ZERO(udp->udp_bound_v6src);
+ udp->udp_port = 0;
+ mutex_exit(&udpf->uf_lock);
+
+ udp->udp_pending_op = -1;
+ udp->udp_state = TS_UNBND;
+ /* IPv6 header template is rebuilt; its result is ignored here. */
+ if (udp->udp_family == AF_INET6)
+ (void) udp_build_hdrs(udp);
+ rw_exit(&udp->udp_rwlock);
+
+ return (0);
+}
+
+/*
+ * Complete a bind or connect after IP has processed the request.
+ *
+ * udp - the endpoint; must have an operation pending.
+ * ire_mp - optional mblk exchanged with IP; may be NULL. It is freed
+ * here with freeb() when non-NULL.
+ * error - result of the IP-level operation; 0 means success.
+ *
+ * On success, a local bind to a broadcast address (as reported through
+ * an IRE_DB_TYPE mblk) has its source address cleared. On failure, a
+ * connect is rolled back to the bound state and a bind is undone
+ * completely. In every case udp_pending_op is reset to -1.
+ *
+ * Returns the error argument unchanged.
+ */
+static int
+udp_post_ip_bind_connect(udp_t *udp, mblk_t *ire_mp, int error)
+{
+ ire_t *ire;
+ udp_fanout_t *udpf;
+ udp_stack_t *us = udp->udp_us;
+
+ /* Checked before udp_rwlock is taken. */
+ ASSERT(udp->udp_pending_op != -1);
+ rw_enter(&udp->udp_rwlock, RW_WRITER);
+ if (error == 0) {
+ /*
+ * Success path, shared by udp_do_connect() and udp_do_bind().
+ * Nothing is changed unless IP handed back an IRE.
+ */
+ /*
+ * If a broadcast/multicast address was bound, set
+ * the source address to 0.
+ * This ensures no datagrams with broadcast address
+ * as source address are emitted (which would violate
+ * RFC1122 - Hosts requirements)
+ *
+ * Note that when connecting the returned IRE is
+ * for the destination address and we only perform
+ * the broadcast check for the source address (it
+ * is OK to connect to a broadcast/multicast address.)
+ */
+ if (ire_mp != NULL && ire_mp->b_datap->db_type == IRE_DB_TYPE) {
+ ire = (ire_t *)ire_mp->b_rptr;
+
+ /*
+ * Note: we get IRE_BROADCAST for IPv6 to "mark" a
+ * multicast local address.
+ */
+ udpf = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port,
+ us->us_bind_fanout_size)];
+ if (ire->ire_type == IRE_BROADCAST &&
+ udp->udp_state != TS_DATA_XFER) {
+ ASSERT(udp->udp_pending_op == T_BIND_REQ ||
+ udp->udp_pending_op == O_T_BIND_REQ);
+ /*
+ * This was just a local bind to a broadcast
+ * addr.
+ */
+ mutex_enter(&udpf->uf_lock);
+ V6_SET_ZERO(udp->udp_v6src);
+ mutex_exit(&udpf->uf_lock);
+ if (udp->udp_family == AF_INET6)
+ (void) udp_build_hdrs(udp);
+ } else if (V6_OR_V4_INADDR_ANY(udp->udp_v6src)) {
+ /* Unspecified source: rebuild the v6 template. */
+ if (udp->udp_family == AF_INET6)
+ (void) udp_build_hdrs(udp);
+ }
+ }
+ } else {
+ /* Failure: roll back under the bind hash bucket lock. */
+ udpf = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port,
+ us->us_bind_fanout_size)];
+ mutex_enter(&udpf->uf_lock);
+
+ if (udp->udp_state == TS_DATA_XFER) {
+ /* Connect failed */
+ /* Revert back to the bound source */
+ udp->udp_v6src = udp->udp_bound_v6src;
+ udp->udp_state = TS_IDLE;
+ } else {
+ /*
+ * Bind failed: clear the addresses, go back to
+ * TS_UNBND and leave the bind hash. udp_port is
+ * zeroed last, after the hash removal.
+ */
+ V6_SET_ZERO(udp->udp_v6src);
+ V6_SET_ZERO(udp->udp_bound_v6src);
+ udp->udp_state = TS_UNBND;
+ udp_bind_hash_remove(udp, B_TRUE);
+ udp->udp_port = 0;
+ }
+ mutex_exit(&udpf->uf_lock);
+ if (udp->udp_family == AF_INET6)
+ (void) udp_build_hdrs(udp);
+ }
+ /* The operation is over either way. */
+ udp->udp_pending_op = -1;
+ rw_exit(&udp->udp_rwlock);
+ if (ire_mp != NULL)
+ freeb(ire_mp);
+ return (error);
+}
+
+/*
+ * It associates a default destination address with the stream.
+ */
+static int
+udp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len)
+{
+ sin6_t *sin6;
+ sin_t *sin;
+ in6_addr_t v6dst;
+ ipaddr_t v4dst;
+ uint16_t dstport;
+ uint32_t flowinfo;
+ mblk_t *ire_mp;
+ udp_fanout_t *udpf;
+ udp_t *udp, *udp1;
+ ushort_t ipversion;
+ udp_stack_t *us;
+ int error;
+
+ udp = connp->conn_udp;
+ us = udp->udp_us;
+
+ /*
+ * Address has been verified by the caller
+ */
+ switch (len) {
+ default:
+ /*
+ * Should never happen
+ */
+ return (EINVAL);
+
+ case sizeof (sin_t):
+ sin = (sin_t *)sa;
+ v4dst = sin->sin_addr.s_addr;
+ dstport = sin->sin_port;
+ IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
+ ASSERT(udp->udp_ipversion == IPV4_VERSION);
+ ipversion = IPV4_VERSION;
+ break;
+
+ case sizeof (sin6_t):
+ sin6 = (sin6_t *)sa;
+ v6dst = sin6->sin6_addr;
+ dstport = sin6->sin6_port;
+ if (IN6_IS_ADDR_V4MAPPED(&v6dst)) {
+ IN6_V4MAPPED_TO_IPADDR(&v6dst, v4dst);
+ ipversion = IPV4_VERSION;
+ flowinfo = 0;
+ } else {
+ ipversion = IPV6_VERSION;
+ flowinfo = sin6->sin6_flowinfo;
+ }
+ break;
+ }
+
+ if (dstport == 0)
+ return (-TBADADDR);
+
+ rw_enter(&udp->udp_rwlock, RW_WRITER);
+
+ /*
+ * This UDP must have bound to a port already before doing a connect.
+ * TPI mandates that users must send TPI primitives only 1 at a time
+ * and wait for the response before sending the next primitive.
+ */
+ if (udp->udp_state == TS_UNBND || udp->udp_pending_op != -1) {
+ rw_exit(&udp->udp_rwlock);
+ (void) strlog(UDP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
+ "udp_connect: bad state, %u", udp->udp_state);
+ return (-TOUTSTATE);
+ }
+ udp->udp_pending_op = T_CONN_REQ;
+ ASSERT(udp->udp_port != 0 && udp->udp_ptpbhn != NULL);
+
+ if (ipversion == IPV4_VERSION) {
+ udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + UDPH_SIZE +
+ udp->udp_ip_snd_options_len;
+ } else {
+ udp->udp_max_hdr_len = udp->udp_sticky_hdrs_len;
+ }
+
+ udpf = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port,
+ us->us_bind_fanout_size)];
+
+ mutex_enter(&udpf->uf_lock);
+ if (udp->udp_state == TS_DATA_XFER) {
+ /* Already connected - clear out state */
+ udp->udp_v6src = udp->udp_bound_v6src;
+ udp->udp_state = TS_IDLE;
+ }
+
+ /*
+ * Create a default IP header with no IP options.
+ */
+ udp->udp_dstport = dstport;
+ udp->udp_ipversion = ipversion;
+ if (ipversion == IPV4_VERSION) {
+ /*
+ * Interpret a zero destination to mean loopback.
+ * Update the T_CONN_REQ (sin/sin6) since it is used to
+ * generate the T_CONN_CON.
+ */
+ if (v4dst == INADDR_ANY) {
+ v4dst = htonl(INADDR_LOOPBACK);
+ IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst);
+ if (udp->udp_family == AF_INET) {
+ sin->sin_addr.s_addr = v4dst;
+ } else {
+ sin6->sin6_addr = v6dst;
+ }
+ }
+ udp->udp_v6dst = v6dst;
+ udp->udp_flowinfo = 0;
+
+ /*
+ * If the destination address is multicast and
+ * an outgoing multicast interface has been set,
+ * use the address of that interface as our
+ * source address if no source address has been set.
+ */
+ if (V4_PART_OF_V6(udp->udp_v6src) == INADDR_ANY &&
+ CLASSD(v4dst) &&
+ udp->udp_multicast_if_addr != INADDR_ANY) {
+ IN6_IPADDR_TO_V4MAPPED(udp->udp_multicast_if_addr,
+ &udp->udp_v6src);
+ }
+ } else {
+ ASSERT(udp->udp_ipversion == IPV6_VERSION);
+ /*
+ * Interpret a zero destination to mean loopback.
+ * Update the T_CONN_REQ (sin/sin6) since it is used to
+ * generate the T_CONN_CON.
+ */
+ if (IN6_IS_ADDR_UNSPECIFIED(&v6dst)) {
+ v6dst = ipv6_loopback;
+ sin6->sin6_addr = v6dst;
+ }
+ udp->udp_v6dst = v6dst;
+ udp->udp_flowinfo = flowinfo;
+ /*
+ * If the destination address is multicast and
+ * an outgoing multicast interface has been set,
+ * then the ip bind logic will pick the correct source
+ * address (i.e. matching the outgoing multicast interface).
+ */
+ }
+
+ /*
+ * Verify that the src/port/dst/port is unique for all
+ * connections in TS_DATA_XFER
+ */
+ for (udp1 = udpf->uf_udp; udp1 != NULL; udp1 = udp1->udp_bind_hash) {
+ if (udp1->udp_state != TS_DATA_XFER)
+ continue;
+ if (udp->udp_port != udp1->udp_port ||
+ udp->udp_ipversion != udp1->udp_ipversion ||
+ dstport != udp1->udp_dstport ||
+ !IN6_ARE_ADDR_EQUAL(&udp->udp_v6src, &udp1->udp_v6src) ||
+ !IN6_ARE_ADDR_EQUAL(&v6dst, &udp1->udp_v6dst) ||
+ !(IPCL_ZONE_MATCH(udp->udp_connp,
+ udp1->udp_connp->conn_zoneid) ||
+ IPCL_ZONE_MATCH(udp1->udp_connp,
+ udp->udp_connp->conn_zoneid)))
+ continue;
+ mutex_exit(&udpf->uf_lock);
+ udp->udp_pending_op = -1;
+ rw_exit(&udp->udp_rwlock);
+ return (-TBADADDR);
+ }
+ udp->udp_state = TS_DATA_XFER;
+ mutex_exit(&udpf->uf_lock);
+
+ ire_mp = allocb(sizeof (ire_t), BPRI_HI);
+ if (ire_mp == NULL) {
+ mutex_enter(&udpf->uf_lock);
+ udp->udp_state = TS_IDLE;
+ udp->udp_pending_op = -1;
+ mutex_exit(&udpf->uf_lock);
+ rw_exit(&udp->udp_rwlock);
+ return (ENOMEM);
+ }
+
+ rw_exit(&udp->udp_rwlock);
+
+ ire_mp->b_wptr += sizeof (ire_t);
+ ire_mp->b_datap->db_type = IRE_DB_REQ_TYPE;
+
+ if (udp->udp_family == AF_INET) {
+ error = ip_proto_bind_connected_v4(connp, &ire_mp, IPPROTO_UDP,
+ &V4_PART_OF_V6(udp->udp_v6src), udp->udp_port,
+ V4_PART_OF_V6(udp->udp_v6dst), udp->udp_dstport,
+ B_TRUE, B_TRUE);
+ } else {
+ error = ip_proto_bind_connected_v6(connp, &ire_mp, IPPROTO_UDP,
+ &udp->udp_v6src, udp->udp_port, &udp->udp_v6dst,
+ &udp->udp_sticky_ipp, udp->udp_dstport, B_TRUE, B_TRUE);
+ }
+
+ return (udp_post_ip_bind_connect(udp, ire_mp, error));
+}
+
+/*
+ * Socket downcall for connect(2) on a UDP endpoint.
+ *
+ * A NULL sa is a disconnect request: it is rejected with EINVAL unless the
+ * endpoint is in TS_DATA_XFER, otherwise udp_disconnect() is called.
+ * For a non-NULL sa the address is checked with proto_verify_ip_addr(),
+ * an implicit bind is attempted if the endpoint is still TS_UNBND,
+ * udp_dgram_errind is switched on and udp_do_connect() is called.
+ *
+ * Returns 0 on success (after storing 0 in *id and making the su_connected
+ * upcall) or an errno value. Negative results from udp_do_connect() are
+ * converted with proto_tlitosyserr().
+ */
+/* ARGSUSED */
+static int
+udp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa,
+ socklen_t len, sock_connid_t *id, cred_t *cr)
+{
+ conn_t *connp = (conn_t *)proto_handle;
+ udp_t *udp = connp->conn_udp;
+ int error;
+ boolean_t did_bind = B_FALSE;
+
+ if (sa == NULL) {
+ /*
+ * Disconnect
+ * Make sure we are connected
+ */
+ if (udp->udp_state != TS_DATA_XFER)
+ return (EINVAL);
+
+ error = udp_disconnect(connp);
+ return (error);
+ }
+
+ error = proto_verify_ip_addr(udp->udp_family, sa, len);
+ if (error != 0)
+ goto done;
+
+ /* do an implicit bind if necessary */
+ if (udp->udp_state == TS_UNBND) {
+ error = udp_implicit_bind(connp, cr);
+ /*
+ * We could be racing with an actual bind, in which case
+ * we would see EPROTO. We cross our fingers and try
+ * to connect.
+ */
+ if (!(error == 0 || error == EPROTO))
+ goto done;
+ /*
+ * NOTE(review): did_bind is also set when udp_implicit_bind()
+ * returned EPROTO, so a failed connect unbinds the endpoint
+ * even if this call did not perform the bind itself --
+ * confirm that this is intended.
+ */
+ did_bind = B_TRUE;
+ }
+ /*
+ * set SO_DGRAM_ERRIND
+ */
+ udp->udp_dgram_errind = B_TRUE;
+
+ error = udp_do_connect(connp, sa, len);
+
+ /* Undo the implicit bind attempted above if the connect failed. */
+ if (error != 0 && did_bind) {
+ int unbind_err;
+
+ unbind_err = udp_do_unbind(connp);
+ ASSERT(unbind_err == 0);
+ }
+
+ if (error == 0) {
+ *id = 0;
+ (*connp->conn_upcalls->su_connected)
+ (connp->conn_upper_handle, 0, NULL, -1);
+ } else if (error < 0) {
+ /* Negative values are TLI errors; map them to an errno. */
+ error = proto_tlitosyserr(-error);
+ }
+
+done:
+ if (error != 0 && udp->udp_state == TS_DATA_XFER) {
+ /*
+ * No need to hold locks to set state
+ * after connect failure socket state is undefined
+ * We set the state only to imitate old sockfs behavior
+ */
+ udp->udp_state = TS_IDLE;
+ }
+ return (error);
+}
+
+/*
+ * Socket downcall that sends the M_DATA message mp on a UDP endpoint.
+ *
+ * When msg carries no destination (msg_namelen == 0) the message is passed
+ * to udp_send_connected(). Otherwise the endpoint is implicitly bound if
+ * it is still TS_UNBND, the call fails with EISCONN if the endpoint is
+ * connected, a pending udp_delayed_error recorded against the same
+ * destination is returned instead of sending, the address is checked with
+ * proto_verify_ip_addr() and the message is passed to
+ * udp_send_not_connected().
+ *
+ * Errors from the two send routines are reported to the caller only when
+ * udp_dgram_errind is set (EDESTADDRREQ from udp_send_connected() is
+ * always reported); all other errors are returned unconditionally. On
+ * every failure detected on the not-connected path mp is freed here.
+ */
+/* ARGSUSED */
+int
+udp_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
+ cred_t *cr)
+{
+ conn_t *connp = (conn_t *)proto_handle;
+ udp_t *udp = connp->conn_udp;
+ udp_stack_t *us = udp->udp_us;
+ int error = 0;
+
+ ASSERT(DB_TYPE(mp) == M_DATA);
+
+ /*
+ * If the socket is connected and no change in destination
+ */
+ if (msg->msg_namelen == 0) {
+ error = udp_send_connected(connp, mp, msg, cr, curproc->p_pid);
+ if (error == EDESTADDRREQ)
+ return (error);
+ else
+ return (udp->udp_dgram_errind ? error : 0);
+ }
+
+ /*
+ * Do an implicit bind if necessary.
+ */
+ if (udp->udp_state == TS_UNBND) {
+ error = udp_implicit_bind(connp, cr);
+ /*
+ * We could be racing with an actual bind, in which case
+ * we would see EPROTO. We cross our fingers and try
+ * to send.
+ */
+ if (!(error == 0 || error == EPROTO)) {
+ freemsg(mp);
+ return (error);
+ }
+ }
+
+ rw_enter(&udp->udp_rwlock, RW_WRITER);
+
+ /* An explicit destination is not allowed on a connected endpoint. */
+ if (msg->msg_name != NULL && udp->udp_state == TS_DATA_XFER) {
+ rw_exit(&udp->udp_rwlock);
+ freemsg(mp);
+ return (EISCONN);
+ }
+
+ /*
+ * A delayed error is consumed by this send whether or not it matches;
+ * it is only returned if it was recorded for the same destination.
+ */
+ if (udp->udp_delayed_error != 0) {
+ boolean_t match;
+
+ error = udp->udp_delayed_error;
+ match = B_FALSE;
+ udp->udp_delayed_error = 0;
+ switch (udp->udp_family) {
+ case AF_INET: {
+ /* Compare just IP address and port */
+ sin_t *sin1 = (sin_t *)msg->msg_name;
+ sin_t *sin2 = (sin_t *)&udp->udp_delayed_addr;
+
+ /*
+ * msg_name may be NULL even though msg_namelen is
+ * non-zero (the EISCONN check above allows for it),
+ * so do not dereference it blindly.
+ */
+ if (sin1 != NULL &&
+ msg->msg_namelen == sizeof (sin_t) &&
+ sin1->sin_port == sin2->sin_port &&
+ sin1->sin_addr.s_addr == sin2->sin_addr.s_addr)
+ match = B_TRUE;
+
+ break;
+ }
+ case AF_INET6: {
+ sin6_t *sin1 = (sin6_t *)msg->msg_name;
+ sin6_t *sin2 = (sin6_t *)&udp->udp_delayed_addr;
+
+ /* Same NULL guard as the AF_INET case. */
+ if (sin1 != NULL &&
+ msg->msg_namelen == sizeof (sin6_t) &&
+ sin1->sin6_port == sin2->sin6_port &&
+ IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
+ &sin2->sin6_addr))
+ match = B_TRUE;
+ break;
+ }
+ default:
+ ASSERT(0);
+ }
+
+ *((sin6_t *)&udp->udp_delayed_addr) = sin6_null;
+
+ if (match) {
+ rw_exit(&udp->udp_rwlock);
+ freemsg(mp);
+ return (error);
+ }
+ }
+
+ /*
+ * NOTE(review): a NULL msg_name reaches proto_verify_ip_addr() here;
+ * presumably it rejects a NULL address -- confirm.
+ */
+ error = proto_verify_ip_addr(udp->udp_family,
+ (struct sockaddr *)msg->msg_name, msg->msg_namelen);
+ rw_exit(&udp->udp_rwlock);
+
+ if (error != 0) {
+ freemsg(mp);
+ return (error);
+ }
+
+ error = udp_send_not_connected(connp, mp,
+ (struct sockaddr *)msg->msg_name, msg->msg_namelen, msg, cr,
+ curproc->p_pid);
+ if (error != 0) {
+ UDP_STAT(us, udp_out_err_output);
+ freemsg(mp);
+ }
+ /* Send errors are only surfaced when SO_DGRAM_ERRIND is enabled. */
+ return (udp->udp_dgram_errind ? error : 0);
+}
+
+/*
+ * Switch a UDP socket endpoint over to the STREAMS queue pair q.
+ *
+ * The queue pair is attached to connp, the stream head is told the write
+ * offset and receive high-water mark with an M_SETOPTS message, the helper
+ * stream is closed, and the capability ack, local/peer addresses and socket
+ * options are collected and handed to quiesced_cb. Finally any packets
+ * sitting on udp_fallback_queue_head are sent up the new stream and
+ * IPCL_NONSTR is cleared from conn_flags.
+ */
+void
+udp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
+ boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb)
+{
+ conn_t *connp = (conn_t *)proto_handle;
+ udp_t *udp;
+ struct T_capability_ack tca;
+ struct sockaddr_in6 laddr, faddr;
+ socklen_t laddrlen, faddrlen;
+ short opts;
+ struct stroptions *stropt;
+ mblk_t *stropt_mp;
+ int error;
+
+ udp = connp->conn_udp;
+
+ stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, STR_NOSIG, NULL);
+
+ /*
+ * setup the fallback stream that was allocated
+ */
+ connp->conn_dev = (dev_t)RD(q)->q_ptr;
+ connp->conn_minor_arena = WR(q)->q_ptr;
+
+ RD(q)->q_ptr = WR(q)->q_ptr = connp;
+
+ WR(q)->q_qinfo = &udp_winit;
+
+ connp->conn_rq = RD(q);
+ connp->conn_wq = WR(q);
+
+ /* Notify stream head about options before sending up data */
+ stropt_mp->b_datap->db_type = M_SETOPTS;
+ stropt_mp->b_wptr += sizeof (*stropt);
+ stropt = (struct stroptions *)stropt_mp->b_rptr;
+ stropt->so_flags = SO_WROFF | SO_HIWAT;
+ stropt->so_wroff =
+ (ushort_t)(udp->udp_max_hdr_len + udp->udp_us->us_wroff_extra);
+ stropt->so_hiwat = udp->udp_rcv_disply_hiwat;
+ putnext(RD(q), stropt_mp);
+
+ /*
+ * Free the helper stream
+ */
+ ip_close_helper_stream(connp);
+
+ if (!direct_sockfs)
+ udp_disable_direct_sockfs(udp);
+
+ /*
+ * Collect the information needed to sync with the sonode
+ */
+ udp_do_capability_ack(udp, &tca, TC1_INFO);
+
+ laddrlen = faddrlen = sizeof (sin6_t);
+ (void) udp_getsockname((sock_lower_handle_t)connp,
+ (struct sockaddr *)&laddr, &laddrlen, NULL);
+ error = udp_getpeername((sock_lower_handle_t)connp,
+ (struct sockaddr *)&faddr, &faddrlen, NULL);
+ /* No peer address is reported if the lookup failed. */
+ if (error != 0)
+ faddrlen = 0;
+
+ opts = 0;
+ if (udp->udp_dgram_errind)
+ opts |= SO_DGRAM_ERRIND;
+ if (udp->udp_dontroute)
+ opts |= SO_DONTROUTE;
+
+ /*
+ * Once we grab the drain lock, no data will be sent up
+ * to the socket. So we notify the socket that the endpoint
+ * is quiescent and it's therefore safe to move data from
+ * the socket to the stream head.
+ */
+ (*quiesced_cb)(connp->conn_upper_handle, q, &tca,
+ (struct sockaddr *)&laddr, laddrlen,
+ (struct sockaddr *)&faddr, faddrlen, opts);
+
+ /*
+ * push up any packets that were queued in udp_t
+ */
+
+ mutex_enter(&udp->udp_recv_lock);
+ while (udp->udp_fallback_queue_head != NULL) {
+ mblk_t *mp;
+ mp = udp->udp_fallback_queue_head;
+ udp->udp_fallback_queue_head = mp->b_next;
+ /* udp_recv_lock is dropped around the putnext() call. */
+ mutex_exit(&udp->udp_recv_lock);
+ mp->b_next = NULL;
+ putnext(RD(q), mp);
+ mutex_enter(&udp->udp_recv_lock);
+ }
+ /* The head is NULL here, so this empties the tail pointer as well. */
+ udp->udp_fallback_queue_tail = udp->udp_fallback_queue_head;
+ /*
+ * No longer a STREAMS-less socket
+ */
+ connp->conn_flags &= ~IPCL_NONSTR;
+ mutex_exit(&udp->udp_recv_lock);
+
+ ASSERT(connp->conn_ref >= 1);
+}
+
+/*
+ * Fill in sa with the peer address of a connected UDP endpoint.
+ *
+ * The caller must hold udp_rwlock. Returns ENOTCONN if the endpoint is not
+ * in TS_DATA_XFER and EINVAL if *salenp is too small for the sockaddr of
+ * the endpoint's address family; otherwise *salenp is set to the size
+ * written and 0 is returned.
+ */
+static int
+udp_do_getpeername(udp_t *udp, struct sockaddr *sa, uint_t *salenp)
+{
+ sin_t *sin = (sin_t *)sa;
+ sin6_t *sin6 = (sin6_t *)sa;
+
+ /* Check the pointer before the lock assertion dereferences it. */
+ ASSERT(udp != NULL);
+ ASSERT(RW_LOCK_HELD(&udp->udp_rwlock));
+
+ if (udp->udp_state != TS_DATA_XFER)
+ return (ENOTCONN);
+
+ switch (udp->udp_family) {
+ case AF_INET:
+ ASSERT(udp->udp_ipversion == IPV4_VERSION);
+
+ if (*salenp < sizeof (sin_t))
+ return (EINVAL);
+
+ *salenp = sizeof (sin_t);
+ *sin = sin_null;
+ sin->sin_family = AF_INET;
+ sin->sin_port = udp->udp_dstport;
+ sin->sin_addr.s_addr = V4_PART_OF_V6(udp->udp_v6dst);
+ break;
+ case AF_INET6:
+ if (*salenp < sizeof (sin6_t))
+ return (EINVAL);
+
+ *salenp = sizeof (sin6_t);
+ *sin6 = sin6_null;
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_port = udp->udp_dstport;
+ sin6->sin6_addr = udp->udp_v6dst;
+ sin6->sin6_flowinfo = udp->udp_flowinfo;
+ break;
+ }
+
+ return (0);
+}
+
+/*
+ * Socket downcall for getpeername(3SOCKET): calls udp_do_getpeername()
+ * with udp_rwlock held as reader and returns its result.
+ */
+/* ARGSUSED */
+int
+udp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa,
+ socklen_t *salenp, cred_t *cr)
+{
+ conn_t *connp = (conn_t *)proto_handle;
+ udp_t *udp = connp->conn_udp;
+ int error;
+
+ ASSERT(udp != NULL);
+
+ rw_enter(&udp->udp_rwlock, RW_READER);
+
+ error = udp_do_getpeername(udp, sa, salenp);
+
+ rw_exit(&udp->udp_rwlock);
+
+ return (error);
+}
+
+/*
+ * Fill in sa with the local address of a UDP endpoint.
+ *
+ * The caller must hold udp_rwlock. Returns EINVAL if *salenp is too small
+ * for the sockaddr of the endpoint's address family; otherwise *salenp is
+ * set to the size written and 0 is returned. For an endpoint in TS_UNBND
+ * only the address family is filled in; the rest stays zeroed.
+ */
+static int
+udp_do_getsockname(udp_t *udp, struct sockaddr *sa, uint_t *salenp)
+{
+ sin_t *sin = (sin_t *)sa;
+ sin6_t *sin6 = (sin6_t *)sa;
+
+ ASSERT(udp != NULL);
+ ASSERT(RW_LOCK_HELD(&udp->udp_rwlock));
+
+ switch (udp->udp_family) {
+ case AF_INET:
+ ASSERT(udp->udp_ipversion == IPV4_VERSION);
+
+ if (*salenp < sizeof (sin_t))
+ return (EINVAL);
+
+ *salenp = sizeof (sin_t);
+ *sin = sin_null;
+ sin->sin_family = AF_INET;
+ if (udp->udp_state == TS_UNBND) {
+ break;
+ }
+ sin->sin_port = udp->udp_port;
+
+ if (!IN6_IS_ADDR_V4MAPPED_ANY(&udp->udp_v6src) &&
+ !IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)) {
+ sin->sin_addr.s_addr = V4_PART_OF_V6(udp->udp_v6src);
+ } else {
+ /*
+ * INADDR_ANY
+ * udp_v6src is not set, we might be bound to
+ * broadcast/multicast. Use udp_bound_v6src as
+ * local address instead (that could
+ * also still be INADDR_ANY)
+ */
+ sin->sin_addr.s_addr =
+ V4_PART_OF_V6(udp->udp_bound_v6src);
+ }
+ break;
+
+ case AF_INET6:
+ if (*salenp < sizeof (sin6_t))
+ return (EINVAL);
+
+ *salenp = sizeof (sin6_t);
+ *sin6 = sin6_null;
+ sin6->sin6_family = AF_INET6;
+ if (udp->udp_state == TS_UNBND) {
+ break;
+ }
+ sin6->sin6_port = udp->udp_port;
+
+ if (!IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)) {
+ sin6->sin6_addr = udp->udp_v6src;
+ } else {
+ /*
+ * UNSPECIFIED
+ * udp_v6src is not set, we might be bound to
+ * broadcast/multicast. Use udp_bound_v6src as
+ * local address instead (that could
+ * also still be UNSPECIFIED)
+ */
+ sin6->sin6_addr = udp->udp_bound_v6src;
+ }
+ }
+ return (0);
+}
+
+/*
+ * Socket downcall for getsockname(3SOCKET): calls udp_do_getsockname()
+ * with udp_rwlock held as reader and returns its result.
+ */
+/* ARGSUSED */
+int
+udp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa,
+ socklen_t *salenp, cred_t *cr)
+{
+ conn_t *connp = (conn_t *)proto_handle;
+ udp_t *udp = connp->conn_udp;
+ int error;
+
+ ASSERT(udp != NULL);
+ rw_enter(&udp->udp_rwlock, RW_READER);
+
+ error = udp_do_getsockname(udp, sa, salenp);
+
+ rw_exit(&udp->udp_rwlock);
+
+ return (error);
+}
+
+/*
+ * Socket downcall for getsockopt(3SOCKET).
+ *
+ * The request is validated against the UDP option table with
+ * proto_opt_check(); a negative result is converted with
+ * proto_tlitosyserr(). The value is then fetched with udp_opt_get() into
+ * a temporary buffer. A negative length from udp_opt_get() hands the
+ * request to ip_get_options(); otherwise MIN(len, *optlen) bytes are
+ * copied to optvalp, that size is stored in *optlen and 0 is returned.
+ */
+int
+udp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
+ void *optvalp, socklen_t *optlen, cred_t *cr)
+{
+ conn_t *connp = (conn_t *)proto_handle;
+ udp_t *udp = connp->conn_udp;
+ int error;
+ t_uscalar_t max_optbuf_len;
+ void *optvalp_buf;
+ int len;
+
+ error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len,
+ udp_opt_obj.odb_opt_des_arr,
+ udp_opt_obj.odb_opt_arr_cnt,
+ udp_opt_obj.odb_topmost_tpiprovider,
+ B_FALSE, B_TRUE, cr);
+ if (error != 0) {
+ if (error < 0)
+ error = proto_tlitosyserr(-error);
+ return (error);
+ }
+
+ /* Scratch buffer sized by the max_optbuf_len that the check set. */
+ optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP);
+ rw_enter(&udp->udp_rwlock, RW_READER);
+ len = udp_opt_get(connp, level, option_name, optvalp_buf);
+ rw_exit(&udp->udp_rwlock);
+
+ if (len < 0) {
+ /*
+ * Pass on to IP
+ */
+ kmem_free(optvalp_buf, max_optbuf_len);
+ return (ip_get_options(connp, level, option_name,
+ optvalp, optlen, cr));
+ } else {
+ /*
+ * update optlen and copy option value
+ */
+ t_uscalar_t size = MIN(len, *optlen);
+ bcopy(optvalp_buf, optvalp, size);
+ bcopy(&size, optlen, sizeof (size));
+
+ kmem_free(optvalp_buf, max_optbuf_len);
+ return (0);
+ }
+}
+
+/*
+ * Socket downcall for setsockopt(3SOCKET).
+ *
+ * The request is validated against the UDP option table with
+ * proto_opt_check(); a negative result is converted with
+ * proto_tlitosyserr(). The option is then applied with udp_opt_set()
+ * under udp_rwlock held as writer. A negative result from udp_opt_set()
+ * hands the request to ip_set_options(), whose result is returned.
+ */
+int
+udp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
+ const void *optvalp, socklen_t optlen, cred_t *cr)
+{
+ conn_t *connp = (conn_t *)proto_handle;
+ udp_t *udp = connp->conn_udp;
+ int error;
+
+ error = proto_opt_check(level, option_name, optlen, NULL,
+ udp_opt_obj.odb_opt_des_arr,
+ udp_opt_obj.odb_opt_arr_cnt,
+ udp_opt_obj.odb_topmost_tpiprovider,
+ B_TRUE, B_FALSE, cr);
+
+ if (error != 0) {
+ if (error < 0)
+ error = proto_tlitosyserr(-error);
+ return (error);
+ }
+
+ rw_enter(&udp->udp_rwlock, RW_WRITER);
+ /* optvalp and optlen serve as both the input and output arguments. */
+ error = udp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name,
+ optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp,
+ NULL, cr);
+ rw_exit(&udp->udp_rwlock);
+
+ if (error < 0) {
+ /*
+ * Pass on to ip
+ */
+ error = ip_set_options(connp, level, option_name, optvalp,
+ optlen, cr);
+ }
+
+ return (error);
+}
+
+/*
+ * Socket downcall that clears conn_flow_cntrld on the endpoint. The flag
+ * is updated with udp_recv_lock held.
+ */
+void
+udp_clr_flowctrl(sock_lower_handle_t proto_handle)
+{
+ conn_t *connp = (conn_t *)proto_handle;
+ udp_t *udp = connp->conn_udp;
+
+ mutex_enter(&udp->udp_recv_lock);
+ connp->conn_flow_cntrld = B_FALSE;
+ mutex_exit(&udp->udp_recv_lock);
+}
+
+/*
+ * Socket downcall for shutdown(3SOCKET). No UDP state is changed here;
+ * the upper layer is told through su_opctl upcalls which direction(s) to
+ * shut: SOCK_OPCTL_SHUT_SEND unless how is SHUT_RD, and
+ * SOCK_OPCTL_SHUT_RECV unless how is SHUT_WR. Always returns 0.
+ */
+/* ARGSUSED */
+int
+udp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr)
+{
+ conn_t *connp = (conn_t *)proto_handle;
+
+ /* shut down the send side */
+ if (how != SHUT_RD)
+ (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
+ SOCK_OPCTL_SHUT_SEND, 0);
+ /* shut down the recv side */
+ if (how != SHUT_WR)
+ (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle,
+ SOCK_OPCTL_SHUT_RECV, 0);
+ return (0);
+}
+
+/*
+ * Socket downcall for ioctl(2) on a UDP socket.
+ *
+ * ND_SET, ND_GET, _SIOCSOCKFALLBACK, TI_GETPEERNAME and TI_GETMYNAME are
+ * rejected with EINVAL; every other command is forwarded with ldi_ioctl()
+ * on the endpoint's IP helper stream and that result is returned.
+ */
+int
+udp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
+ int mode, int32_t *rvalp, cred_t *cr)
+{
+ conn_t *connp = (conn_t *)proto_handle;
+ int error;
+
+ switch (cmd) {
+ case ND_SET:
+ case ND_GET:
+ case _SIOCSOCKFALLBACK:
+ case TI_GETPEERNAME:
+ case TI_GETMYNAME:
+ ip1dbg(("udp_ioctl: cmd 0x%x on non streams socket",
+ cmd));
+ error = EINVAL;
+ break;
+ default:
+ /*
+ * Pass on to IP using helper stream
+ */
+ error = ldi_ioctl(
+ connp->conn_helper_info->ip_helper_stream_handle,
+ cmd, arg, mode, cr, rvalp);
+ break;
+ }
+ return (error);
+}
+
+/*
+ * Socket downcall for accept(3SOCKET); always fails with EOPNOTSUPP.
+ */
+/* ARGSUSED */
+int
+udp_accept(sock_lower_handle_t lproto_handle,
+ sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle,
+ cred_t *cr)
+{
+ return (EOPNOTSUPP);
+}
+
+/*
+ * Socket downcall for listen(3SOCKET); always fails with EOPNOTSUPP.
+ */
+/* ARGSUSED */
+int
+udp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr)
+{
+ return (EOPNOTSUPP);
+}
+
+/*
+ * Downcall vector for UDP sockets, also declared extern in udp_impl.h.
+ * The sd_send_uio, sd_recv_uio and sd_poll slots are left NULL.
+ */
+sock_downcalls_t sock_udp_downcalls = {
+ udp_activate, /* sd_activate */
+ udp_accept, /* sd_accept */
+ udp_bind, /* sd_bind */
+ udp_listen, /* sd_listen */
+ udp_connect, /* sd_connect */
+ udp_getpeername, /* sd_getpeername */
+ udp_getsockname, /* sd_getsockname */
+ udp_getsockopt, /* sd_getsockopt */
+ udp_setsockopt, /* sd_setsockopt */
+ udp_send, /* sd_send */
+ NULL, /* sd_send_uio */
+ NULL, /* sd_recv_uio */
+ NULL, /* sd_poll */
+ udp_shutdown, /* sd_shutdown */
+ udp_clr_flowctrl, /* sd_setflowctrl */
+ udp_ioctl, /* sd_ioctl */
+ udp_close /* sd_close */
+};
diff --git a/usr/src/uts/common/inet/udp/udp_opt_data.c b/usr/src/uts/common/inet/udp/udp_opt_data.c
index f900d0f3e1..0ec5a2c45e 100644
--- a/usr/src/uts/common/inet/udp/udp_opt_data.c
+++ b/usr/src/uts/common/inet/udp/udp_opt_data.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/stream.h>
#define _SUN_TPI_VERSION 2
@@ -85,9 +83,11 @@ opdes_t udp_opt_arr[] = {
{ SO_PROTOTYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
{ IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), 40, -1 /* not initialized */ },
+ (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT),
+ IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
{ T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
- (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), 40, -1 /* not initialized */ },
+ (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT),
+ IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
{ IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
{ T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 },
@@ -318,8 +318,8 @@ uint_t udp_max_optsize; /* initialized when UDP driver is loaded */
optdb_obj_t udp_opt_obj = {
udp_opt_default, /* UDP default value function pointer */
- udp_opt_get, /* UDP get function pointer */
- udp_opt_set, /* UDP set function pointer */
+ udp_tpi_opt_get, /* UDP get function pointer */
+ udp_tpi_opt_set, /* UDP set function pointer */
B_TRUE, /* UDP is tpi provider */
UDP_OPT_ARR_CNT, /* UDP option database count of entries */
udp_opt_arr, /* UDP option database */
diff --git a/usr/src/uts/common/inet/udp/udpddi.c b/usr/src/uts/common/inet/udp/udpddi.c
index 0b80531ab8..63248365cd 100644
--- a/usr/src/uts/common/inet/udp/udpddi.c
+++ b/usr/src/uts/common/inet/udp/udpddi.c
@@ -30,6 +30,8 @@
#include <inet/common.h>
#include <inet/ip.h>
#include <inet/udp_impl.h>
+#include <sys/strsubr.h>
+#include <sys/socketvar.h>
#define INET_NAME "udp"
#define INET_MODDESC "UDP dummy STREAMS module"
@@ -38,6 +40,9 @@
#define INET_MODSTRTAB dummymodinfo
#define INET_DEVSTRTAB udpinfov4
#define INET_MODMTFLAGS D_MP
+#define INET_SOCKDESC "UDP socket module"
+#define INET_SOCK_PROTO_CREATE_FUNC (*udp_create)
+#define INET_SOCK_PROTO_FB_FUNC (*udp_fallback)
/*
* We define both synchronous STREAMS and sockfs direct-access
* mode for UDP module instance, because it is autopushed on
diff --git a/usr/src/uts/common/inet/udp_impl.h b/usr/src/uts/common/inet/udp_impl.h
index 468fa553f4..38d255ac9d 100644
--- a/usr/src/uts/common/inet/udp_impl.h
+++ b/usr/src/uts/common/inet/udp_impl.h
@@ -252,7 +252,9 @@ struct udp_stack {
*/
in_port_t us_min_anonpriv_port;
+ ldi_ident_t us_ldi_ident;
};
+
typedef struct udp_stack udp_stack_t;
/* Internal udp control structure, one per open stream */
@@ -313,9 +315,14 @@ typedef struct udp_s {
/* Following protected by udp_rwlock */
mblk_t *udp_rcv_list_head; /* b_next chain of mblks */
mblk_t *udp_rcv_list_tail; /* last mblk in chain */
+ kmutex_t udp_recv_lock; /* recv lock */
uint_t udp_rcv_cnt; /* total data in rcv_list */
uint_t udp_rcv_msgcnt; /* total msgs in rcv_list */
+ size_t udp_rcv_disply_hiwat; /* user's view of rcvbuf */
size_t udp_rcv_hiwat; /* receive high watermark */
+ size_t udp_rcv_lowat; /* receive low watermark */
+ size_t udp_xmit_hiwat; /* Send buffer high watermark */
+ size_t udp_xmit_lowat; /* Send buffer low watermark */
uint_t udp_label_len; /* length of security label */
uint_t udp_label_len_v6; /* len of v6 security label */
in6_addr_t udp_v6lastdst; /* most recent destination */
@@ -323,6 +330,10 @@ typedef struct udp_s {
uint64_t udp_open_time; /* time when this was opened */
pid_t udp_open_pid; /* process id when this was opened */
udp_stack_t *udp_us; /* Stack instance for zone */
+ int udp_delayed_error;
+ mblk_t *udp_fallback_queue_head;
+ mblk_t *udp_fallback_queue_tail;
+ struct sockaddr_storage udp_delayed_addr;
} udp_t;
/* UDP Protocol header */
@@ -351,7 +362,6 @@ typedef struct udpahdr_s {
#define UDP_STAT(us, x) ((us)->us_statistics.x.value.ui64++)
#define UDP_STAT_UPDATE(us, x, n) \
((us)->us_statistics.x.value.ui64 += (n))
-
#ifdef DEBUG
#define UDP_DBGSTAT(us, x) UDP_STAT(us, x)
#else
@@ -359,25 +369,19 @@ typedef struct udpahdr_s {
#endif /* DEBUG */
extern int udp_opt_default(queue_t *, t_scalar_t, t_scalar_t, uchar_t *);
-extern int udp_opt_get(queue_t *, t_scalar_t, t_scalar_t, uchar_t *);
-extern int udp_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *,
+extern int udp_tpi_opt_get(queue_t *, t_scalar_t, t_scalar_t, uchar_t *);
+extern int udp_tpi_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *,
uint_t *, uchar_t *, void *, cred_t *, mblk_t *);
extern mblk_t *udp_snmp_get(queue_t *, mblk_t *);
extern int udp_snmp_set(queue_t *, t_scalar_t, t_scalar_t, uchar_t *, int);
extern void udp_close_free(conn_t *);
extern void udp_quiesce_conn(conn_t *);
-extern void udp_ddi_init(void);
-extern void udp_ddi_destroy(void);
-extern void udp_resume_bind(conn_t *, mblk_t *);
-extern void udp_wput(queue_t *, mblk_t *);
-
-extern int udp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name,
- uchar_t *ptr);
-extern int udp_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name,
- uchar_t *ptr);
-extern int udp_opt_set(queue_t *q, uint_t optset_context,
- int level, int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
- uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk);
+extern void udp_ddi_g_init(void);
+extern void udp_ddi_g_destroy(void);
+extern void udp_g_q_inactive(udp_stack_t *);
+extern void udp_output(conn_t *connp, mblk_t *mp, struct sockaddr *addr,
+ socklen_t addrlen);
+extern void udp_wput(queue_t *, mblk_t *);
/*
* Object to represent database of options to search passed to
@@ -387,6 +391,13 @@ extern int udp_opt_set(queue_t *q, uint_t optset_context,
extern optdb_obj_t udp_opt_obj;
extern uint_t udp_max_optsize;
+extern sock_lower_handle_t udp_create(int, int, int, sock_downcalls_t **,
+ uint_t *, int *, int, cred_t *);
+extern void udp_fallback(sock_lower_handle_t, queue_t *, boolean_t,
+ so_proto_quiesced_cb_t);
+
+extern sock_downcalls_t sock_udp_downcalls;
+
#endif /* _KERNEL */
#ifdef __cplusplus
diff --git a/usr/src/uts/common/io/comstar/port/iscsit/iscsit_isns.c b/usr/src/uts/common/io/comstar/port/iscsit/iscsit_isns.c
index 0f166f77b7..2708d10c5b 100644
--- a/usr/src/uts/common/io/comstar/port/iscsit/iscsit_isns.c
+++ b/usr/src/uts/common/io/comstar/port/iscsit/iscsit_isns.c
@@ -42,6 +42,7 @@
#include <sys/iscsit/isns_protocol.h>
#include <iscsit.h>
#include <iscsit_isns.h>
+#include <sys/ksocket.h>
/* local defines */
#define MAX_XID (2^16)
@@ -177,7 +178,7 @@ static void
isnst_esi_thread(void *arg);
static boolean_t
-isnst_handle_esi_req(struct sonode *so, isns_pdu_t *pdu, size_t pl_size);
+isnst_handle_esi_req(ksocket_t so, isns_pdu_t *pdu, size_t pl_size);
static void isnst_esi_start(isns_portal_list_t *portal);
static void isnst_esi_stop();
@@ -303,22 +304,22 @@ isnst_esi_stop_thread(isns_esi_tinfo_t *tinfop)
list_remove(&esi_list, tinfop);
/*
- * The only way to break a thread waiting in soaccept() is to signal
- * it with EINTR. See idm_so_tgt_svc_offline for more detail.
- */
- tinfop->esi_so->so_error = EINTR;
- cv_signal(&tinfop->esi_so->so_connind_cv);
-
- /*
- * Must also drop the global lock in case the esi thread is running
- * and trying to update the server timestamps.
+ * The only way to break a thread waiting in ksocket_accept() is to call
+ * ksocket_close.
*/
mutex_exit(&isns_esi_mutex);
ISNS_GLOBAL_UNLOCK();
+ idm_soshutdown(tinfop->esi_so);
+ idm_sodestroy(tinfop->esi_so);
thread_join(tinfop->esi_thread_did);
ISNS_GLOBAL_LOCK();
mutex_enter(&isns_esi_mutex);
+ tinfop->esi_thread_running = B_FALSE;
+ tinfop->esi_so = NULL;
+ tinfop->esi_port = 0;
+ tinfop->esi_registered = B_FALSE;
+ cv_signal(&isns_esi_cv);
tinfop->esi_portal->portal_esi = NULL;
kmem_free(tinfop, sizeof (isns_esi_tinfo_t));
}
@@ -630,18 +631,22 @@ isnst_stop()
*/
static void
-isnst_update_server_timestamp(struct sonode *so)
+isnst_update_server_timestamp(ksocket_t so)
{
iscsit_isns_svr_t *svr;
struct in_addr *sin = NULL, *svr_in;
struct in6_addr *sin6 = NULL, *svr_in6;
-
- if (so->so_faddr_sa->sa_family == AF_INET) {
- sin = &((struct sockaddr_in *)
- ((void *)so->so_faddr_sa))->sin_addr;
+ struct sockaddr_in6 t_addr;
+ socklen_t t_addrlen;
+
+ bzero(&t_addr, sizeof (struct sockaddr_in6));
+ t_addrlen = sizeof (struct sockaddr_in6);
+ (void) ksocket_getpeername(so, (struct sockaddr *)&t_addr, &t_addrlen,
+ CRED());
+ if (((struct sockaddr *)(&t_addr))->sa_family == AF_INET) {
+ sin = &((struct sockaddr_in *)((void *)(&t_addr)))->sin_addr;
} else {
- sin6 = &((struct sockaddr_in6 *)
- ((void *)so->so_faddr_sa))->sin6_addr;
+ sin6 = &(&t_addr)->sin6_addr;
}
/*
@@ -1982,7 +1987,7 @@ static void *
isnst_open_so(struct sockaddr_storage *sa)
{
int sa_sz;
- struct sonode *so;
+ ksocket_t so;
/* determin local IP address */
if (sa->ss_family == AF_INET) {
@@ -2000,7 +2005,8 @@ isnst_open_so(struct sockaddr_storage *sa)
}
if (so != NULL) {
- if (soconnect(so, (struct sockaddr *)sa, sa_sz, 0, 0) != 0) {
+ if (ksocket_connect(so, (struct sockaddr *)sa, sa_sz, CRED())
+ != 0) {
/* not calling isnst_close_so() to */
/* make dtrace output look clear */
idm_soshutdown(so);
@@ -2133,7 +2139,7 @@ static void
isnst_esi_thread(void *arg)
{
isns_esi_tinfo_t *tinfop;
- struct sonode *newso;
+ ksocket_t newso;
struct sockaddr_in sin;
struct sockaddr_in6 sin6;
uint32_t on;
@@ -2141,6 +2147,14 @@ isnst_esi_thread(void *arg)
isns_pdu_t *pdu;
size_t pl_size;
int family;
+ struct sockaddr_in t_addr;
+ struct sockaddr_in6 t_addr6;
+ socklen_t t_addrlen;
+ socklen_t t_addrlen6;
+
+ bzero(&t_addr, sizeof (t_addr)); /* not sizeof sockaddr_in6: overrun */
+ t_addrlen = sizeof (struct sockaddr_in);
+ t_addrlen6 = sizeof (struct sockaddr_in6);
tinfop = (isns_esi_tinfo_t *)arg;
tinfop->esi_thread_did = curthread->t_did;
@@ -2155,7 +2169,6 @@ isnst_esi_thread(void *arg)
family = AF_INET6;
}
-
if ((tinfop->esi_so =
idm_socreate(family, SOCK_STREAM, 0)) == NULL) {
cmn_err(CE_WARN,
@@ -2166,7 +2179,7 @@ isnst_esi_thread(void *arg)
mutex_exit(&isns_esi_mutex);
thread_exit();
}
-
+ ksocket_hold(tinfop->esi_so);
/*
* Set options, bind, and listen until we're told to stop
*/
@@ -2181,17 +2194,19 @@ isnst_esi_thread(void *arg)
&sin.sin_addr.s_addr, sizeof (in_addr_t));
on = 1;
- (void) sosetsockopt(tinfop->esi_so, SOL_SOCKET, SO_REUSEADDR,
- (char *)&on, sizeof (on));
+ (void) ksocket_setsockopt(tinfop->esi_so, SOL_SOCKET,
+ SO_REUSEADDR, (char *)&on, sizeof (on), CRED());
- if (sobind(tinfop->esi_so, (struct sockaddr *)&sin,
- sizeof (sin), 0, 0) != 0) {
+ if (ksocket_bind(tinfop->esi_so, (struct sockaddr *)&sin,
+ sizeof (sin), CRED()) != 0) {
idm_sodestroy(tinfop->esi_so);
tinfop->esi_so = NULL;
tinfop->esi_thread_failed = B_TRUE;
} else {
+ (void) ksocket_getsockname(tinfop->esi_so,
+ (struct sockaddr *)(&t_addr), &t_addrlen, CRED());
tinfop->esi_port = ntohs(((struct sockaddr_in *)
- ((void *)tinfop->esi_so->so_laddr_sa))->sin_port);
+ (&t_addr))->sin_port);
}
break;
@@ -2205,17 +2220,19 @@ isnst_esi_thread(void *arg)
&sin6.sin6_addr.s6_addr, sizeof (in6_addr_t));
on = 1;
- (void) sosetsockopt(tinfop->esi_so, SOL_SOCKET,
- SO_REUSEADDR, (char *)&on, sizeof (on));
+ (void) ksocket_setsockopt(tinfop->esi_so, SOL_SOCKET,
+ SO_REUSEADDR, (char *)&on, sizeof (on), CRED());
- if (sobind(tinfop->esi_so, (struct sockaddr *)&sin6,
- sizeof (sin6), 0, 0) != 0) {
+ if (ksocket_bind(tinfop->esi_so, (struct sockaddr *)&sin6,
+ sizeof (sin6), CRED()) != 0) {
idm_sodestroy(tinfop->esi_so);
tinfop->esi_so = NULL;
tinfop->esi_thread_failed = B_TRUE;
} else {
+ (void) ksocket_getsockname(tinfop->esi_so,
+ (struct sockaddr *)(&t_addr6), &t_addrlen6, CRED());
tinfop->esi_port = ntohs(((struct sockaddr_in6 *)
- ((void *)tinfop->esi_so->so_laddr_sa))->sin6_port);
+ (&t_addr6))->sin6_port);
}
break;
@@ -2226,7 +2243,7 @@ isnst_esi_thread(void *arg)
goto esi_thread_exit;
}
- if ((rc = solisten(tinfop->esi_so, 5)) != 0) {
+ if ((rc = ksocket_listen(tinfop->esi_so, 5, CRED())) != 0) {
cmn_err(CE_WARN, "isnst_esi_thread: listen failure 0x%x", rc);
goto esi_thread_exit;
}
@@ -2244,21 +2261,21 @@ isnst_esi_thread(void *arg)
DTRACE_PROBE2(iscsit__isns__esi__accept__wait,
boolean_t, tinfop->esi_thread_running,
boolean_t, tinfop->esi_thread_failed);
- if ((rc = soaccept(tinfop->esi_so, 0, &newso)) != 0) {
+ if ((rc = ksocket_accept(tinfop->esi_so, NULL, NULL,
+ &newso, CRED())) != 0) {
mutex_enter(&isns_esi_mutex);
DTRACE_PROBE2(iscsit__isns__esi__accept__fail,
boolean_t, tinfop->esi_thread_running,
boolean_t, tinfop->esi_thread_failed);
/*
- * If we were interrupted with EINTR, it's not
- * really a failure.
+ * If we were interrupted with EINTR
+ * it's not really a failure.
*/
if (rc != EINTR) {
cmn_err(CE_WARN, "isnst_esi_thread: "
"accept failure (0x%x)", rc);
tinfop->esi_thread_failed = B_TRUE;
}
-
tinfop->esi_thread_running = B_FALSE;
continue;
}
@@ -2281,7 +2298,7 @@ isnst_esi_thread(void *arg)
tinfop->esi_registered = B_TRUE;
}
- (void) soshutdown(newso, SHUT_RDWR);
+ (void) ksocket_close(newso, CRED());
/*
* Do not hold the esi mutex during server timestamp
@@ -2295,15 +2312,7 @@ isnst_esi_thread(void *arg)
}
mutex_exit(&isns_esi_mutex);
esi_thread_exit:
- idm_soshutdown(tinfop->esi_so);
- idm_sodestroy(tinfop->esi_so);
- mutex_enter(&isns_esi_mutex);
- tinfop->esi_thread_running = B_FALSE;
- tinfop->esi_so = NULL;
- tinfop->esi_port = 0;
- tinfop->esi_registered = B_FALSE;
- cv_signal(&isns_esi_cv);
- mutex_exit(&isns_esi_mutex);
+ ksocket_rele(tinfop->esi_so);
thread_exit();
}
@@ -2312,7 +2321,7 @@ esi_thread_exit:
*/
static boolean_t
-isnst_handle_esi_req(struct sonode *so, isns_pdu_t *pdu, size_t pl_size)
+isnst_handle_esi_req(ksocket_t ks, isns_pdu_t *pdu, size_t pl_size)
{
isns_pdu_t *rsp_pdu;
isns_resp_t *rsp;
@@ -2353,7 +2362,7 @@ isnst_handle_esi_req(struct sonode *so, isns_pdu_t *pdu, size_t pl_size)
bcopy(pdu->payload, rsp->data, pl_len - 4);
rsp_pdu->payload_len = htons(pl_len);
- if (isnst_send_pdu(so, rsp_pdu) != 0) {
+ if (isnst_send_pdu(ks, rsp_pdu) != 0) {
cmn_err(CE_WARN, "isnst_handle_esi_req: Send response failed");
esirv = B_FALSE;
}
diff --git a/usr/src/uts/common/io/comstar/port/iscsit/iscsit_isns.h b/usr/src/uts/common/io/comstar/port/iscsit/iscsit_isns.h
index 40c111f491..af0d8982bb 100644
--- a/usr/src/uts/common/io/comstar/port/iscsit/iscsit_isns.h
+++ b/usr/src/uts/common/io/comstar/port/iscsit/iscsit_isns.h
@@ -62,7 +62,7 @@ typedef struct {
struct isns_portal_list_s *esi_portal;
kthread_t *esi_thread;
kt_did_t esi_thread_did;
- struct sonode *esi_so;
+ ksocket_t esi_so;
uint16_t esi_port;
boolean_t esi_thread_running;
boolean_t esi_thread_failed;
diff --git a/usr/src/uts/common/io/comstar/port/iscsit/iscsit_radiuspacket.c b/usr/src/uts/common/io/comstar/port/iscsit/iscsit_radiuspacket.c
index 2441e3b65c..912158cb2d 100644
--- a/usr/src/uts/common/io/comstar/port/iscsit/iscsit_radiuspacket.c
+++ b/usr/src/uts/common/io/comstar/port/iscsit/iscsit_radiuspacket.c
@@ -32,18 +32,19 @@
#include <sys/idm/idm_so.h>
#include <sys/iscsit/radius_packet.h>
#include <sys/iscsit/radius_protocol.h>
+#include <sys/ksocket.h>
static void encode_chap_password(int identifier, int chap_passwd_len,
uint8_t *chap_passwd, uint8_t *result);
-static size_t iscsit_net_recvmsg(void *socket, struct msghdr *msg,
+static size_t iscsit_net_recvmsg(ksocket_t socket, struct msghdr *msg,
int timeout);
/*
* See radius_packet.h.
*/
int
-iscsit_snd_radius_request(void *socket, iscsi_ipaddr_t rsvr_ip_addr,
+iscsit_snd_radius_request(ksocket_t socket, iscsi_ipaddr_t rsvr_ip_addr,
uint32_t rsvr_port, radius_packet_data_t *req_data)
{
int i; /* Loop counter. */
@@ -164,7 +165,7 @@ iscsit_snd_radius_request(void *socket, iscsi_ipaddr_t rsvr_ip_addr,
* See radius_packet.h.
*/
int
-iscsit_rcv_radius_response(void *socket, uint8_t *shared_secret,
+iscsit_rcv_radius_response(ksocket_t socket, uint8_t *shared_secret,
uint32_t shared_secret_len, uint8_t *req_authenticator,
radius_packet_data_t *resp_data)
{
@@ -177,8 +178,6 @@ iscsit_rcv_radius_response(void *socket, uint8_t *shared_secret,
struct iovec iov[1];
struct nmsghdr msg;
- struct sonode *so = (struct sonode *)socket;
- int ret = 0;
tmp_data = kmem_zalloc(MAX_RAD_PACKET_LEN, KM_SLEEP);
iov[0].iov_base = (char *)tmp_data;
@@ -193,11 +192,6 @@ iscsit_rcv_radius_response(void *socket, uint8_t *shared_secret,
msg.msg_iov = iov;
msg.msg_iovlen = 1;
- (void) VOP_IOCTL(SOTOV(so), I_POP, 0, FKIOCTL, CRED(), &ret, NULL);
- if (ret != 0) {
- return (RAD_RSP_RCVD_NO_DATA);
- }
-
received_len = iscsit_net_recvmsg(socket, &msg, RAD_RCV_TIMEOUT);
if (received_len <= (size_t)0) {
@@ -313,36 +307,32 @@ encode_chap_password(int identifier, int chap_passwd_len,
*/
/* ARGSUSED */
static size_t
-iscsit_net_recvmsg(void *socket, struct msghdr *msg, int timeout)
+iscsit_net_recvmsg(ksocket_t socket, struct msghdr *msg, int timeout)
{
- int idx;
- int total_len = 0;
- struct uio uio;
- uchar_t pri = 0;
- int prflag = MSG_ANY;
- rval_t rval;
- struct sonode *sonode = (struct sonode *)socket;
-
- /* Initialization of the uio structure. */
- bzero(&uio, sizeof (uio));
- uio.uio_iov = msg->msg_iov;
- uio.uio_iovcnt = msg->msg_iovlen;
- uio.uio_segflg = UIO_SYSSPACE;
-
- for (idx = 0; idx < msg->msg_iovlen; idx++) {
- total_len += (msg->msg_iov)[idx].iov_len;
- }
- uio.uio_resid = total_len;
-
+ int prflag = msg->msg_flags;
+ size_t recv = 0;
+ struct sockaddr_in6 l_addr, f_addr;
+ socklen_t l_addrlen;
+ socklen_t f_addrlen;
+
+ bzero(&l_addr, sizeof (struct sockaddr_in6));
+ bzero(&f_addr, sizeof (struct sockaddr_in6));
+ l_addrlen = sizeof (struct sockaddr_in6);
+ f_addrlen = sizeof (struct sockaddr_in6);
/* If timeout requested on receive */
if (timeout > 0) {
boolean_t loopback = B_FALSE;
+ (void) ksocket_getsockname(socket, (struct sockaddr *)(&l_addr),
+ &l_addrlen, CRED());
+ (void) ksocket_getpeername(socket, (struct sockaddr *)(&f_addr),
+ &f_addrlen, CRED());
+
/* And this isn't a loopback connection */
- if (sonode->so_laddr.soa_sa->sa_family == AF_INET) {
+ if (((struct sockaddr *)(&l_addr))->sa_family == AF_INET) {
struct sockaddr_in *lin = (struct sockaddr_in *)
- ((void *)sonode->so_laddr.soa_sa);
+ ((void *)(&l_addr));
struct sockaddr_in *fin = (struct sockaddr_in *)
- ((void *)sonode->so_faddr.soa_sa);
+ ((void *)(&f_addr));
if ((lin->sin_family == fin->sin_family) &&
(bcmp(&lin->sin_addr, &fin->sin_addr,
@@ -351,9 +341,9 @@ iscsit_net_recvmsg(void *socket, struct msghdr *msg, int timeout)
}
} else {
struct sockaddr_in6 *lin6 = (struct sockaddr_in6 *)
- ((void *)sonode->so_laddr.soa_sa);
+ ((void *)(&l_addr));
struct sockaddr_in6 *fin6 = (struct sockaddr_in6 *)
- ((void *)sonode->so_faddr.soa_sa);
+ ((void *)(&f_addr));
if ((lin6->sin6_family == fin6->sin6_family) &&
(bcmp(&lin6->sin6_addr, &fin6->sin6_addr,
@@ -361,23 +351,20 @@ iscsit_net_recvmsg(void *socket, struct msghdr *msg, int timeout)
loopback = B_TRUE;
}
}
-
if (loopback == B_FALSE) {
- /*
- * Then poll device for up to the timeout
- * period or the requested data is received.
- */
- if (kstrgetmsg(SOTOV(sonode),
- NULL, NULL, &pri, &prflag, timeout * 1000,
- &rval) == ETIME) {
+ struct timeval tl;
+ tl.tv_sec = timeout;
+ tl.tv_usec = 0;
+ /* Set recv timeout */
+ if (ksocket_setsockopt(socket, SOL_SOCKET, SO_RCVTIMEO,
+ &tl, sizeof (struct timeval), CRED()))
return (0);
- }
}
}
/*
* Receive the requested data. Block until all
- * data is received.
+ * data is received or timeout.
*
* resid occurs only when the connection is
* disconnected. In that case it will return
@@ -385,6 +372,6 @@ iscsit_net_recvmsg(void *socket, struct msghdr *msg, int timeout)
* In general this is the total amount we
* requested.
*/
- (void) sorecvmsg((struct sonode *)socket, msg, &uio);
- return (total_len - uio.uio_resid);
+ (void) ksocket_recvmsg(socket, msg, prflag, &recv, CRED());
+ return (recv);
}
diff --git a/usr/src/uts/common/io/ib/clients/rds/rds_opt.c b/usr/src/uts/common/io/ib/clients/rds/rds_opt.c
index f0e863d0f3..902d838ff4 100644
--- a/usr/src/uts/common/io/ib/clients/rds/rds_opt.c
+++ b/usr/src/uts/common/io/ib/clients/rds/rds_opt.c
@@ -19,14 +19,12 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/ib/clients/rds/rds.h>
-#include <inet/mi.h>
+#include <inet/proto_set.h>
#define rds_max_buf 2097152
opdes_t rds_opt_arr[] = {
@@ -143,7 +141,7 @@ rds_opt_set(queue_t *q, uint_t optset_context, int level,
}
if (!checkonly) {
RD(q)->q_hiwat = *i1;
- (void) mi_set_sth_hiwat(RD(q), *i1);
+ (void) proto_set_rx_hiwat(RD(q), NULL, *i1);
}
break;
default:
diff --git a/usr/src/uts/common/io/ib/clients/rds/rdsddi.c b/usr/src/uts/common/io/ib/clients/rds/rdsddi.c
index 306a2a593e..877e56fe8a 100644
--- a/usr/src/uts/common/io/ib/clients/rds/rdsddi.c
+++ b/usr/src/uts/common/io/ib/clients/rds/rdsddi.c
@@ -23,7 +23,6 @@
* Use is subject to license terms.
*/
-
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/modctl.h>
@@ -43,6 +42,7 @@
#include <inet/common.h>
#include <inet/ip.h>
#include <inet/mi.h>
+#include <inet/proto_set.h>
#include <sys/ib/clients/rds/rds.h>
#include <sys/policy.h>
#include <inet/ipclassifier.h>
@@ -226,8 +226,8 @@ rds_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
WR(q)->q_lowat = rds_xmit_lowat;
/* Set the Stream head watermarks */
- (void) mi_set_sth_hiwat(q, rds_recv_hiwat);
- (void) mi_set_sth_lowat(q, rds_recv_lowat);
+ (void) proto_set_rx_hiwat(q, NULL, rds_recv_hiwat);
+ (void) proto_set_rx_lowat(q, NULL, rds_recv_lowat);
return (0);
}
@@ -337,7 +337,7 @@ rds_deliver_new_msg(mblk_t *mp, ipaddr_t local_addr, ipaddr_t rem_addr,
if (rds->rds_port_quota > current_port_quota) {
/* this may result in stalling the port */
rds->rds_port_quota = current_port_quota;
- (void) mi_set_sth_hiwat(rds->rds_ulpd,
+ (void) proto_set_rx_hiwat(rds->rds_ulpd, NULL,
rds->rds_port_quota * UserBufferSize);
RDS_INCR_PORT_QUOTA_ADJUSTED();
}
@@ -599,7 +599,8 @@ rds_bind(queue_t *q, mblk_t *mp)
RDS_INCR_NPORT();
rds->rds_port_quota = RDS_CURRENT_PORT_QUOTA();
RDS_SET_PORT_QUOTA(rds->rds_port_quota);
- (void) mi_set_sth_hiwat(RD(q), rds->rds_port_quota * UserBufferSize);
+ (void) proto_set_rx_hiwat(RD(q), NULL,
+ rds->rds_port_quota * UserBufferSize);
qreply(q, mp);
}
@@ -859,7 +860,7 @@ rds_rsrv(queue_t *q)
current_port_quota = RDS_GET_PORT_QUOTA();
if (rds->rds_port_quota != current_port_quota) {
rds->rds_port_quota = current_port_quota;
- (void) mi_set_sth_hiwat(q,
+ (void) proto_set_rx_hiwat(q, NULL,
rds->rds_port_quota * UserBufferSize);
}
diff --git a/usr/src/uts/common/io/ib/clients/sdp/sdpddi.c b/usr/src/uts/common/io/ib/clients/sdp/sdpddi.c
index 0973888811..d0c3bb8b4e 100644
--- a/usr/src/uts/common/io/ib/clients/sdp/sdpddi.c
+++ b/usr/src/uts/common/io/ib/clients/sdp/sdpddi.c
@@ -23,7 +23,6 @@
* Use is subject to license terms.
*/
-
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/modctl.h>
@@ -182,9 +181,12 @@ sdp_gen_ioctl(queue_t *q, mblk_t *mp)
/* LINTED */
iocp = (struct iocblk *)mp->b_rptr;
switch (iocp->ioc_cmd) {
+ uintptr_t send_enable;
case SIOCSENABLESDP:
bcopy(mp->b_cont->b_rptr, &enable, sizeof (int));
+ send_enable = enable;
+
/*
* Check for root privs.
* if not net config privs - return state of system SDP
@@ -202,7 +204,8 @@ sdp_gen_ioctl(queue_t *q, mblk_t *mp)
* action of enabling/disabling sdp is simply acked.
*/
rw_enter(&sdp_transport_lock, RW_READER);
- if ((enable == 1) && (sdp_transport_handle == NULL) &&
+ if ((send_enable == 1) &&
+ (sdp_transport_handle == NULL) &&
(priv == B_TRUE)) {
/* Initialize sdpib transport driver */
rw_exit(&sdp_transport_lock);
@@ -215,21 +218,20 @@ sdp_gen_ioctl(queue_t *q, mblk_t *mp)
enable = 0;
goto done;
}
- (void) sdp_ioctl(NULL, iocp->ioc_cmd, &enable,
- CRED());
- } else if ((enable == 0) &&
- (sdp_transport_handle != NULL) &&
- (priv == B_TRUE)) {
- (void) sdp_ioctl(NULL, iocp->ioc_cmd, &enable,
- CRED());
- (void) ldi_close(sdp_transport_handle,
- FNDELAY, kcred);
- sdp_transport_handle = NULL;
+ (void) ldi_ioctl(sdp_transport_handle,
+ iocp->ioc_cmd, (intptr_t)&send_enable,
+ FKIOCTL, CRED(), (int *)&enable);
+ } else if (sdp_transport_handle != NULL) {
+ (void) ldi_ioctl(sdp_transport_handle,
+ iocp->ioc_cmd, (intptr_t)&send_enable,
+ FKIOCTL, CRED(), (int *)&enable);
+ if (send_enable == 0 && priv == B_TRUE) {
+ (void) ldi_close(sdp_transport_handle,
+ FNDELAY, kcred);
+ sdp_transport_handle = NULL;
+ }
} else {
- ret = sdp_ioctl(NULL, iocp->ioc_cmd, &enable,
- CRED());
- if (ret == EINVAL)
- enable = 0;
+ enable = 0;
}
rw_exit(&sdp_transport_lock);
diff --git a/usr/src/uts/common/io/idm/idm_so.c b/usr/src/uts/common/io/idm/idm_so.c
index b8c236d749..c868c76ddd 100644
--- a/usr/src/uts/common/io/idm/idm_so.c
+++ b/usr/src/uts/common/io/idm/idm_so.c
@@ -45,7 +45,7 @@
#include <netinet/in.h>
#include <net/if.h>
#include <sys/sockio.h>
-
+#include <sys/ksocket.h>
#include <sys/idm/idm.h>
#include <sys/idm/idm_so.h>
#include <sys/idm/idm_text.h>
@@ -60,14 +60,13 @@ static void idm_sorx_cache_pdu_cb(idm_pdu_t *pdu, idm_status_t status);
static void idm_sorx_addl_pdu_cb(idm_pdu_t *pdu, idm_status_t status);
static void idm_sotx_cache_pdu_cb(idm_pdu_t *pdu, idm_status_t status);
-static idm_status_t idm_so_conn_create_common(idm_conn_t *ic,
- struct sonode *new_so);
+static idm_status_t idm_so_conn_create_common(idm_conn_t *ic, ksocket_t new_so);
static void idm_so_conn_destroy_common(idm_conn_t *ic);
static void idm_so_conn_connect_common(idm_conn_t *ic);
static void idm_set_ini_preconnect_options(idm_so_conn_t *sc);
static void idm_set_ini_postconnect_options(idm_so_conn_t *sc);
-static void idm_set_tgt_connect_options(struct sonode *sonode);
+static void idm_set_tgt_connect_options(ksocket_t so);
static idm_status_t idm_i_so_tx(idm_pdu_t *pdu);
static idm_status_t idm_sorecvdata(idm_conn_t *ic, idm_pdu_t *pdu);
@@ -180,58 +179,17 @@ idm_so_fini(void)
kmem_cache_destroy(idm.idm_sorx_pdu_cache);
}
-struct sonode *
+ksocket_t
idm_socreate(int domain, int type, int protocol)
{
- vnode_t *dvp;
- vnode_t *vp;
- struct snode *csp;
- int err;
- major_t maj;
-
- if ((vp = solookup(domain, type, protocol, NULL, &err)) == NULL) {
-
- /*
- * solookup calls sogetvp if the vp is not found in the cache.
- * Since the call to sogetvp is hardwired to use USERSPACE
- * and declared static we'll do the work here instead.
- */
- err = lookupname(type == SOCK_STREAM ? "/dev/tcp" : "/dev/udp",
- UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
- if (err != 0)
- return (NULL);
-
- /* Check that it is the correct vnode */
- if (vp->v_type != VCHR) {
- VN_RELE(vp);
- return (NULL);
- }
-
- csp = VTOS(VTOS(vp)->s_commonvp);
- if (!(csp->s_flag & SDIPSET)) {
- char *pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
-
- err = ddi_dev_pathname(vp->v_rdev, S_IFCHR,
- pathname);
- if (err == 0) {
- err = devfs_lookupname(pathname, NULLVPP,
- &dvp);
- }
- VN_RELE(vp);
- kmem_free(pathname, MAXPATHLEN);
- if (err != 0) {
- return (NULL);
- }
- vp = dvp;
- }
+ ksocket_t ks;
- maj = getmajor(vp->v_rdev);
- if (!STREAMSTAB(maj)) {
- VN_RELE(vp);
- return (NULL);
- }
+ if (!ksocket_socket(&ks, domain, type, protocol, KSOCKET_NOSLEEP,
+ CRED())) {
+ return (ks);
+ } else {
+ return (NULL);
}
- return (socreate(vp, domain, type, protocol, SOV_DEFAULT, NULL, &err));
}
/*
@@ -242,9 +200,9 @@ idm_socreate(int domain, int type, int protocol)
* regain control of a thread stuck in idm_sorecv.
*/
void
-idm_soshutdown(struct sonode *so)
+idm_soshutdown(ksocket_t so)
{
- (void) soshutdown(so, SHUT_RDWR);
+ (void) ksocket_shutdown(so, SHUT_RDWR, CRED());
}
/*
@@ -254,13 +212,9 @@ idm_soshutdown(struct sonode *so)
* otherwise undefined behavior will result.
*/
void
-idm_sodestroy(struct sonode *so)
+idm_sodestroy(ksocket_t ks)
{
- vnode_t *vp = SOTOV(so);
-
- (void) VOP_CLOSE(vp, 0, 1, 0, kcred, NULL);
-
- VN_RELE(vp);
+ (void) ksocket_close(ks, CRED());
}
/*
@@ -303,8 +257,7 @@ idm_v6_addr_okay(struct in6_addr *addr6)
int
idm_get_ipaddr(idm_addr_list_t **ipaddr_p)
{
- struct sonode *so4, *so6;
- vnode_t *vp, *vp4, *vp6;
+ ksocket_t so4, so6;
struct lifnum lifn;
struct lifconf lifc;
struct lifreq *lp;
@@ -332,19 +285,15 @@ idm_get_ipaddr(idm_addr_list_t **ipaddr_p)
return (0);
}
- /* setup the vp's for each socket type */
- vp6 = SOTOV(so6);
- vp4 = SOTOV(so4);
- /* use vp6 for ioctls with unspecified families by default */
- vp = vp6;
retry_count:
/* snapshot the current number of interfaces */
lifn.lifn_family = PF_UNSPEC;
lifn.lifn_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES;
lifn.lifn_count = 0;
- if (VOP_IOCTL(vp, SIOCGLIFNUM, (intptr_t)&lifn, FKIOCTL, kcred,
- &rval, NULL) != 0) {
+ /* use so6 for ioctls with unspecified families by default */
+ if (ksocket_ioctl(so6, SIOCGLIFNUM, (intptr_t)&lifn, &rval, CRED())
+ != 0) {
goto cleanup;
}
@@ -364,8 +313,7 @@ retry_count:
lifc.lifc_flags = LIFC_NOXMIT | LIFC_TEMPORARY | LIFC_ALLZONES;
lifc.lifc_len = bufsize;
lifc.lifc_buf = buf;
- rc = VOP_IOCTL(vp, SIOCGLIFCONF, (intptr_t)&lifc, FKIOCTL, kcred,
- &rval, NULL);
+ rc = ksocket_ioctl(so6, SIOCGLIFCONF, (intptr_t)&lifc, &rval, CRED());
if (rc != 0) {
goto cleanup;
}
@@ -401,16 +349,16 @@ retry_count:
*/
switch (ss.ss_family) {
case AF_INET:
- vp = vp4;
+ rc = ksocket_ioctl(so4, SIOCGLIFFLAGS, (intptr_t)lp,
+ &rval, CRED());
break;
case AF_INET6:
- vp = vp6;
+ rc = ksocket_ioctl(so6, SIOCGLIFFLAGS, (intptr_t)lp,
+ &rval, CRED());
break;
default:
continue;
}
- rc = VOP_IOCTL(vp, SIOCGLIFFLAGS, (intptr_t)lp, FKIOCTL, kcred,
- &rval, NULL);
if (rc == 0) {
/*
* If we got the flags, skip uninteresting
@@ -468,7 +416,7 @@ cleanup:
}
int
-idm_sorecv(struct sonode *so, void *msg, size_t len)
+idm_sorecv(ksocket_t so, void *msg, size_t len)
{
iovec_t iov;
@@ -495,13 +443,13 @@ idm_sorecv(struct sonode *so, void *msg, size_t len)
* -1 if sosendmsg returns success but uio_resid != 0
*/
int
-idm_sosendto(struct sonode *so, void *buff, size_t len,
+idm_sosendto(ksocket_t so, void *buff, size_t len,
struct sockaddr *name, socklen_t namelen)
{
struct msghdr msg;
- struct uio uio;
struct iovec iov[1];
int error;
+ size_t sent = 0;
iov[0].iov_base = buff;
iov[0].iov_len = len;
@@ -510,19 +458,12 @@ idm_sosendto(struct sonode *so, void *buff, size_t len,
bzero(&msg, sizeof (msg));
msg.msg_iov = iov;
msg.msg_iovlen = 1;
-
- /* Initialization of the uio structure. */
- uio.uio_iov = iov;
- uio.uio_iovcnt = 1;
- uio.uio_segflg = UIO_SYSSPACE;
- uio.uio_resid = len;
-
msg.msg_name = name;
msg.msg_namelen = namelen;
- if ((error = sosendmsg(so, &msg, &uio)) == 0) {
+ if ((error = ksocket_sendmsg(so, &msg, 0, &sent, CRED())) == 0) {
/* Data sent */
- if (uio.uio_resid == 0) {
+ if (sent == len) {
/* All data sent. Success. */
return (0);
} else {
@@ -546,11 +487,11 @@ idm_sosendto(struct sonode *so, void *buff, size_t len,
* -1 if sosendmsg returns success but uio_resid != 0
*/
int
-idm_iov_sosend(struct sonode *so, iovec_t *iop, int iovlen, size_t total_len)
+idm_iov_sosend(ksocket_t so, iovec_t *iop, int iovlen, size_t total_len)
{
struct msghdr msg;
- struct uio uio;
int error;
+ size_t sent = 0;
ASSERT(iop != NULL);
@@ -559,16 +500,10 @@ idm_iov_sosend(struct sonode *so, iovec_t *iop, int iovlen, size_t total_len)
msg.msg_iov = iop;
msg.msg_iovlen = iovlen;
- /* Initialization of the uio structure. */
- bzero(&uio, sizeof (uio));
- uio.uio_iov = iop;
- uio.uio_iovcnt = iovlen;
- uio.uio_segflg = UIO_SYSSPACE;
- uio.uio_resid = total_len;
-
- if ((error = sosendmsg(so, &msg, &uio)) == 0) {
+ if ((error = ksocket_sendmsg(so, &msg, 0, &sent, CRED()))
+ == 0) {
/* Data sent */
- if (uio.uio_resid == 0) {
+ if (sent == total_len) {
/* All data sent. Success. */
return (0);
} else {
@@ -592,30 +527,25 @@ idm_iov_sosend(struct sonode *so, iovec_t *iop, int iovlen, size_t total_len)
* -1 if sorecvmsg returns success but uio_resid != 0
*/
int
-idm_iov_sorecv(struct sonode *so, iovec_t *iop, int iovlen, size_t total_len)
+idm_iov_sorecv(ksocket_t so, iovec_t *iop, int iovlen, size_t total_len)
{
struct msghdr msg;
- struct uio uio;
int error;
+ size_t recv;
+ int flags;
ASSERT(iop != NULL);
/* Initialization of the message header. */
bzero(&msg, sizeof (msg));
msg.msg_iov = iop;
- msg.msg_flags = MSG_WAITALL;
msg.msg_iovlen = iovlen;
+ flags = MSG_WAITALL;
- /* Initialization of the uio structure. */
- bzero(&uio, sizeof (uio));
- uio.uio_iov = iop;
- uio.uio_iovcnt = iovlen;
- uio.uio_segflg = UIO_SYSSPACE;
- uio.uio_resid = total_len;
-
- if ((error = sorecvmsg(so, &msg, &uio)) == 0) {
+ if ((error = ksocket_recvmsg(so, &msg, flags, &recv, CRED()))
+ == 0) {
/* Received data */
- if (uio.uio_resid == 0) {
+ if (recv == total_len) {
/* All requested data received. Success */
return (0);
} else {
@@ -639,12 +569,14 @@ idm_set_ini_preconnect_options(idm_so_conn_t *sc)
int abort = 30000;
/* Pre-connect socket options */
- (void) sosetsockopt(sc->ic_so, IPPROTO_TCP, TCP_CONN_NOTIFY_THRESHOLD,
- (char *)&conn_notify, sizeof (int));
- (void) sosetsockopt(sc->ic_so, IPPROTO_TCP, TCP_CONN_ABORT_THRESHOLD,
- (char *)&conn_abort, sizeof (int));
- (void) sosetsockopt(sc->ic_so, IPPROTO_TCP, TCP_ABORT_THRESHOLD,
- (char *)&abort, sizeof (int));
+ (void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP,
+ TCP_CONN_NOTIFY_THRESHOLD, (char *)&conn_notify, sizeof (int),
+ CRED());
+ (void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP,
+ TCP_CONN_ABORT_THRESHOLD, (char *)&conn_abort, sizeof (int),
+ CRED());
+ (void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP, TCP_ABORT_THRESHOLD,
+ (char *)&abort, sizeof (int), CRED());
}
static void
@@ -655,28 +587,28 @@ idm_set_ini_postconnect_options(idm_so_conn_t *sc)
const int on = 1;
/* Set postconnect options */
- (void) sosetsockopt(sc->ic_so, IPPROTO_TCP, TCP_NODELAY,
- (char *)&on, sizeof (int));
- (void) sosetsockopt(sc->ic_so, SOL_SOCKET, SO_RCVBUF,
- (char *)&rcvbuf, sizeof (int));
- (void) sosetsockopt(sc->ic_so, SOL_SOCKET, SO_SNDBUF,
- (char *)&sndbuf, sizeof (int));
+ (void) ksocket_setsockopt(sc->ic_so, IPPROTO_TCP, TCP_NODELAY,
+ (char *)&on, sizeof (int), CRED());
+ (void) ksocket_setsockopt(sc->ic_so, SOL_SOCKET, SO_RCVBUF,
+ (char *)&rcvbuf, sizeof (int), CRED());
+ (void) ksocket_setsockopt(sc->ic_so, SOL_SOCKET, SO_SNDBUF,
+ (char *)&sndbuf, sizeof (int), CRED());
}
static void
-idm_set_tgt_connect_options(struct sonode *sonode)
+idm_set_tgt_connect_options(ksocket_t ks)
{
int32_t rcvbuf = IDM_RCVBUF_SIZE;
int32_t sndbuf = IDM_SNDBUF_SIZE;
const int on = 1;
/* Set connect options */
- (void) sosetsockopt(sonode, SOL_SOCKET, SO_RCVBUF,
- (char *)&rcvbuf, sizeof (int));
- (void) sosetsockopt(sonode, SOL_SOCKET, SO_SNDBUF,
- (char *)&sndbuf, sizeof (int));
- (void) sosetsockopt(sonode, IPPROTO_TCP, TCP_NODELAY,
- (char *)&on, sizeof (on));
+ (void) ksocket_setsockopt(ks, SOL_SOCKET, SO_RCVBUF,
+ (char *)&rcvbuf, sizeof (int), CRED());
+ (void) ksocket_setsockopt(ks, SOL_SOCKET, SO_SNDBUF,
+ (char *)&sndbuf, sizeof (int), CRED());
+ (void) ksocket_setsockopt(ks, IPPROTO_TCP, TCP_NODELAY,
+ (char *)&on, sizeof (on), CRED());
}
static uint32_t
@@ -777,7 +709,7 @@ idm_sorecvhdr(idm_conn_t *ic, idm_pdu_t *pdu)
static idm_status_t
idm_so_ini_conn_create(idm_conn_req_t *cr, idm_conn_t *ic)
{
- struct sonode *so;
+ ksocket_t so;
idm_so_conn_t *so_conn;
idm_status_t idmrc;
@@ -789,8 +721,8 @@ idm_so_ini_conn_create(idm_conn_req_t *cr, idm_conn_t *ic)
/* Bind the socket if configured to do so */
if (cr->cr_bound) {
- if (sobind(so, &cr->cr_bound_addr.sin,
- SIZEOF_SOCKADDR(&cr->cr_bound_addr.sin), 0, 0) != 0) {
+ if (ksocket_bind(so, &cr->cr_bound_addr.sin,
+ SIZEOF_SOCKADDR(&cr->cr_bound_addr.sin), CRED()) != 0) {
idm_sodestroy(so);
return (IDM_STATUS_FAIL);
}
@@ -832,8 +764,8 @@ idm_so_ini_conn_connect(idm_conn_t *ic)
so_conn = ic->ic_transport_private;
- if (soconnect(so_conn->ic_so, &ic->ic_ini_dst_addr.sin,
- (SIZEOF_SOCKADDR(&ic->ic_ini_dst_addr.sin)), 0, 0) != 0) {
+ if (ksocket_connect(so_conn->ic_so, &ic->ic_ini_dst_addr.sin,
+ (SIZEOF_SOCKADDR(&ic->ic_ini_dst_addr.sin)), CRED()) != 0) {
idm_soshutdown(so_conn->ic_so);
return (IDM_STATUS_FAIL);
}
@@ -846,7 +778,7 @@ idm_so_ini_conn_connect(idm_conn_t *ic)
}
idm_status_t
-idm_so_tgt_conn_create(idm_conn_t *ic, struct sonode *new_so)
+idm_so_tgt_conn_create(idm_conn_t *ic, ksocket_t new_so)
{
idm_status_t idmrc;
@@ -875,7 +807,7 @@ idm_so_tgt_conn_connect(idm_conn_t *ic)
}
static idm_status_t
-idm_so_conn_create_common(idm_conn_t *ic, struct sonode *new_so)
+idm_so_conn_create_common(idm_conn_t *ic, ksocket_t new_so)
{
idm_so_conn_t *so_conn;
@@ -917,18 +849,20 @@ static void
idm_so_conn_connect_common(idm_conn_t *ic)
{
idm_so_conn_t *so_conn;
+ struct sockaddr_in6 t_addr;
+ socklen_t t_addrlen = 0;
so_conn = ic->ic_transport_private;
-
- SOP_GETSOCKNAME(so_conn->ic_so);
+ bzero(&t_addr, sizeof (struct sockaddr_in6));
+ t_addrlen = sizeof (struct sockaddr_in6);
/* Set the local and remote addresses in the idm conn handle */
- mutex_enter(&so_conn->ic_so->so_lock);
- bcopy(so_conn->ic_so->so_laddr_sa, &ic->ic_laddr,
- so_conn->ic_so->so_laddr_len);
- bcopy(so_conn->ic_so->so_faddr_sa, &ic->ic_raddr,
- so_conn->ic_so->so_faddr_len);
- mutex_exit(&so_conn->ic_so->so_lock);
+ ksocket_getsockname(so_conn->ic_so, (struct sockaddr *)&t_addr,
+ &t_addrlen, CRED());
+ bcopy(&t_addr, &ic->ic_laddr, t_addrlen);
+ ksocket_getpeername(so_conn->ic_so, (struct sockaddr *)&t_addr,
+ &t_addrlen, CRED());
+ bcopy(&t_addr, &ic->ic_raddr, t_addrlen);
mutex_enter(&ic->ic_mutex);
so_conn->ic_tx_thread = thread_create(NULL, 0, idm_sotx_thread, ic, 0,
@@ -1027,16 +961,16 @@ idm_so_tgt_svc_online(idm_svc_t *is)
sin6_ip.sin6_port = htons(sr->sr_port);
sin6_ip.sin6_addr = in6addr_any;
- (void) sosetsockopt(so_svc->is_so, SOL_SOCKET, SO_REUSEADDR,
- (char *)&on, sizeof (on));
+ (void) ksocket_setsockopt(so_svc->is_so, SOL_SOCKET,
+ SO_REUSEADDR, (char *)&on, sizeof (on), CRED());
/*
* Turn off SO_MAC_EXEMPT so future sobinds succeed
*/
- (void) sosetsockopt(so_svc->is_so, SOL_SOCKET, SO_MAC_EXEMPT,
- (char *)&off, sizeof (off));
+ (void) ksocket_setsockopt(so_svc->is_so, SOL_SOCKET,
+ SO_MAC_EXEMPT, (char *)&off, sizeof (off), CRED());
- if (sobind(so_svc->is_so, (struct sockaddr *)&sin6_ip,
- sizeof (sin6_ip), 0, 0) != 0) {
+ if (ksocket_bind(so_svc->is_so, (struct sockaddr *)&sin6_ip,
+ sizeof (sin6_ip), CRED()) != 0) {
mutex_exit(&is->is_mutex);
idm_sodestroy(so_svc->is_so);
return (IDM_STATUS_FAIL);
@@ -1045,7 +979,7 @@ idm_so_tgt_svc_online(idm_svc_t *is)
idm_set_tgt_connect_options(so_svc->is_so);
- if (solisten(so_svc->is_so, 5) != 0) {
+ if (ksocket_listen(so_svc->is_so, 5, CRED()) != 0) {
mutex_exit(&is->is_mutex);
idm_soshutdown(so_svc->is_so);
idm_sodestroy(so_svc->is_so);
@@ -1063,7 +997,7 @@ idm_so_tgt_svc_online(idm_svc_t *is)
idm_sodestroy(so_svc->is_so);
return (IDM_STATUS_FAIL);
}
-
+ ksocket_hold(so_svc->is_so);
/* Wait for the port watcher thread to start */
while (!so_svc->is_thread_running)
cv_wait(&is->is_cv, &is->is_mutex);
@@ -1081,33 +1015,20 @@ static void
idm_so_tgt_svc_offline(idm_svc_t *is)
{
idm_so_svc_t *so_svc;
-
mutex_enter(&is->is_mutex);
so_svc = (idm_so_svc_t *)is->is_so_svc;
so_svc->is_thread_running = B_FALSE;
mutex_exit(&is->is_mutex);
/*
- * When called from the kernel, soaccept blocks and cannot be woken
- * up via the sockfs API. soclose does not work like you would
- * hope. When the Volo project is available we can switch to that
- * API which should address this issue. For now, we will poke at
- * the socket to wake it up.
+ * Teardown socket
*/
- mutex_enter(&so_svc->is_so->so_lock);
- so_svc->is_so->so_error = EINTR;
- cv_signal(&so_svc->is_so->so_connind_cv);
- mutex_exit(&so_svc->is_so->so_lock);
+ idm_sodestroy(so_svc->is_so);
/*
* Now we expect the port watcher thread to terminate
*/
thread_join(so_svc->is_thread_did);
-
- /*
- * Teardown socket
- */
- idm_sodestroy(so_svc->is_so);
}
/*
@@ -1117,13 +1038,17 @@ void
idm_so_svc_port_watcher(void *arg)
{
idm_svc_t *svc = arg;
- struct sonode *new_so;
+ ksocket_t new_so;
idm_conn_t *ic;
idm_status_t idmrc;
idm_so_svc_t *so_svc;
int rc;
const uint32_t off = 0;
+ struct sockaddr_in6 t_addr;
+ socklen_t t_addrlen;
+ bzero(&t_addr, sizeof (struct sockaddr_in6));
+ t_addrlen = sizeof (struct sockaddr_in6);
mutex_enter(&svc->is_mutex);
so_svc = svc->is_so_svc;
@@ -1138,7 +1063,9 @@ idm_so_svc_port_watcher(void *arg)
while (so_svc->is_thread_running) {
mutex_exit(&svc->is_mutex);
- if ((rc = soaccept(so_svc->is_so, 0, &new_so)) != 0) {
+ if ((rc = ksocket_accept(so_svc->is_so,
+ (struct sockaddr *)&t_addr, &t_addrlen,
+ &new_so, CRED())) != 0) {
mutex_enter(&svc->is_mutex);
if (rc == ECONNABORTED)
continue;
@@ -1148,8 +1075,8 @@ idm_so_svc_port_watcher(void *arg)
/*
* Turn off SO_MAC_EXEMPT so future sobinds succeed
*/
- (void) sosetsockopt(new_so, SOL_SOCKET, SO_MAC_EXEMPT,
- (char *)&off, sizeof (off));
+ (void) ksocket_setsockopt(new_so, SOL_SOCKET, SO_MAC_EXEMPT,
+ (char *)&off, sizeof (off), CRED());
idmrc = idm_svc_conn_create(svc, IDM_TRANSPORT_TYPE_SOCKETS,
&ic);
@@ -1178,7 +1105,7 @@ idm_so_svc_port_watcher(void *arg)
mutex_enter(&svc->is_mutex);
}
-
+ ksocket_rele(so_svc->is_so);
so_svc->is_thread_running = B_FALSE;
mutex_exit(&svc->is_mutex);
diff --git a/usr/src/uts/common/io/ksocket/ksocket.c b/usr/src/uts/common/io/ksocket/ksocket.c
new file mode 100644
index 0000000000..512cab56c0
--- /dev/null
+++ b/usr/src/uts/common/io/ksocket/ksocket.c
@@ -0,0 +1,733 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/file.h>
+#include <sys/stropts.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysmacros.h>
+#include <sys/filio.h> /* FIO* ioctls */
+#include <sys/sockio.h> /* SIOC* ioctls */
+#include <sys/cmn_err.h>
+#include <sys/ksocket.h>
+#include <io/ksocket/ksocket_impl.h>
+#include <fs/sockfs/sockcommon.h>
+
+#define SOCKETMOD_TCP "tcp"
+#define SOCKETMOD_UDP "udp"
+/*
+ * Kernel Sockets
+ *
+ * Mostly a wrapper around the private socket_* functions.
+ */
+/*
+ * Create a kernel socket and return it through *ksp.
+ *
+ * *ksp is cleared up front, so it is NULL on every failure path.  AF_NCA
+ * and AF_UNIX are refused with EAFNOSUPPORT.  If the first socket_create()
+ * fails with EAFNOSUPPORT, an AF_INET/AF_INET6 TCP or UDP request is tried
+ * once more with the "tcp" or "udp" module name passed in.  The new sonode
+ * is flagged SM_KERNEL.  Returns 0 on success, otherwise an errno value.
+ */
+int
+ksocket_socket(ksocket_t *ksp, int domain, int type, int protocol, int flags,
+    struct cred *cr)
+{
+	struct sonode *so;
+	char *modname;
+	int err = 0;
+
+	*ksp = NULL;
+
+	if (domain == AF_NCA || domain == AF_UNIX)
+		return (EAFNOSUPPORT);
+
+	ASSERT(flags == KSOCKET_SLEEP || flags == KSOCKET_NOSLEEP);
+	so = socket_create(domain, type, protocol, NULL, NULL, SOV_DEFAULT,
+	    flags, cr, &err);
+	if (so == NULL) {
+		/* Only an unsupported-family failure is worth a retry. */
+		if (err != EAFNOSUPPORT)
+			return (err);
+
+		/*
+		 * Could be that root file system is not loaded or
+		 * soconfig has not run yet.  Fall back to naming the
+		 * module directly for the TCP and UDP cases.
+		 */
+		if (domain != AF_INET && domain != AF_INET6)
+			return (EAFNOSUPPORT);
+
+		if (type == SOCK_STREAM &&
+		    (protocol == 0 || protocol == IPPROTO_TCP))
+			modname = SOCKETMOD_TCP;
+		else if (type == SOCK_DGRAM &&
+		    (protocol == 0 || protocol == IPPROTO_UDP))
+			modname = SOCKETMOD_UDP;
+		else
+			return (EAFNOSUPPORT);
+
+		so = socket_create(domain, type, protocol, NULL, modname,
+		    SOV_DEFAULT, flags, cr, &err);
+		if (so == NULL)
+			return (err);
+	}
+
+	so->so_mode |= SM_KERNEL;
+	*ksp = SOTOKS(so);
+
+	return (0);
+}
+
+/*
+ * Bind a kernel socket to the given address.
+ *
+ * Returns ENOTSOCK when ks fails KSOCKET_VALID(); otherwise returns
+ * whatever socket_bind() returns.
+ */
+int
+ksocket_bind(ksocket_t ks, struct sockaddr *addr, socklen_t addrlen,
+    struct cred *cr)
+{
+	if (KSOCKET_VALID(ks)) {
+		return (socket_bind(KSTOSO(ks), addr, addrlen,
+		    _SOBIND_SOCKBSD, cr));
+	}
+
+	return (ENOTSOCK);
+}
+
+/*
+ * Put a kernel socket into the listening state with the given backlog.
+ *
+ * Returns ENOTSOCK when ks fails KSOCKET_VALID(); otherwise returns
+ * whatever socket_listen() returns.
+ */
+int
+ksocket_listen(ksocket_t ks, int backlog, struct cred *cr)
+{
+	return (KSOCKET_VALID(ks) ?
+	    socket_listen(KSTOSO(ks), backlog, cr) : ENOTSOCK);
+}
+
+/*
+ * Accept a connection on a listening kernel socket.
+ *
+ * On success the new socket is stored in *nks and 0 is returned.  *nks is
+ * cleared first, so it is NULL on every failure path.  When addr is
+ * non-NULL the peer address is fetched into addr/addrlenp; if that lookup
+ * fails the new socket is closed and destroyed, and ENOTCONN is reported
+ * to the caller as ECONNABORTED.
+ *
+ * Returns ENOTSOCK for an invalid ks and EFAULT when addr is supplied
+ * without addrlenp.
+ */
+int
+ksocket_accept(ksocket_t ks, struct sockaddr *addr,
+    socklen_t *addrlenp, ksocket_t *nks, struct cred *cr)
+{
+	struct sonode *newso = NULL;
+	int rc;
+
+	*nks = NULL;
+
+	if (!KSOCKET_VALID(ks))
+		return (ENOTSOCK);
+
+	/* An address buffer needs a length to go with it. */
+	if (addr != NULL && addrlenp == NULL)
+		return (EFAULT);
+
+	rc = socket_accept(KSTOSO(ks), KSOCKET_FMODE(ks), cr, &newso);
+	if (rc != 0)
+		return (rc);
+
+	ASSERT(newso != NULL);
+	newso->so_mode |= SM_KERNEL;
+
+	/* addrlenp cannot be NULL here: that case returned EFAULT above. */
+	if (addr != NULL) {
+		rc = socket_getpeername(newso, addr, addrlenp, B_TRUE, cr);
+		if (rc != 0) {
+			(void) socket_close(newso, 0, cr);
+			socket_destroy(newso);
+			if (rc == ENOTCONN)
+				rc = ECONNABORTED;
+			return (rc);
+		}
+	}
+
+	*nks = SOTOKS(newso);
+
+	return (0);
+}
+
+/*
+ * Connect a kernel socket to the given address.
+ *
+ * Returns ENOTSOCK when ks fails KSOCKET_VALID(); otherwise returns
+ * whatever socket_connect() returns.
+ */
+int
+ksocket_connect(ksocket_t ks, const struct sockaddr *addr, socklen_t addrlen,
+    struct cred *cr)
+{
+	if (KSOCKET_VALID(ks)) {
+		return (socket_connect(KSTOSO(ks), addr, addrlen,
+		    KSOCKET_FMODE(ks), 0, cr));
+	}
+
+	return (ENOTSOCK);
+}
+
+/*
+ * Send msglen bytes starting at msg on a kernel socket.
+ *
+ * The buffer is treated as a user-space address when MSG_USERSPACE is set
+ * in flags and as a kernel address otherwise.  MSG_EOR is always OR-ed
+ * into the flags handed to socket_sendmsg().
+ *
+ * If sent is non-NULL it receives the number of bytes consumed
+ * (msglen minus the residual count) on success and 0 on any failure.
+ *
+ * Returns 0 on success, ENOTSOCK for an invalid ks, or the error from
+ * socket_sendmsg().
+ */
+int
+ksocket_send(ksocket_t ks, void *msg, size_t msglen, int flags,
+    size_t *sent, struct cred *cr)
+{
+	int error;
+	struct nmsghdr msghdr;
+	struct uio auio;
+	struct iovec iov;
+
+	if (!KSOCKET_VALID(ks)) {
+		if (sent != NULL)
+			*sent = 0;
+		return (ENOTSOCK);
+	}
+
+	iov.iov_base = msg;
+	iov.iov_len = msglen;
+
+	bzero(&auio, sizeof (struct uio));
+	auio.uio_loffset = 0;
+	auio.uio_iov = &iov;
+	auio.uio_iovcnt = 1;
+	auio.uio_resid = msglen;
+	if (flags & MSG_USERSPACE)
+		auio.uio_segflg = UIO_USERSPACE;
+	else
+		auio.uio_segflg = UIO_SYSSPACE;
+	auio.uio_extflg = UIO_COPY_DEFAULT;
+	auio.uio_limit = 0;
+	auio.uio_fmode = KSOCKET_FMODE(ks);
+
+	/*
+	 * Describe the payload in the message header as well, the same
+	 * way ksocket_sendto() does, so that no field of msghdr reaches
+	 * socket_sendmsg() holding uninitialized stack contents.
+	 */
+	msghdr.msg_iov = &iov;
+	msghdr.msg_iovlen = 1;
+	msghdr.msg_name = NULL;
+	msghdr.msg_namelen = 0;
+	msghdr.msg_control = NULL;
+	msghdr.msg_controllen = 0;
+	msghdr.msg_flags = flags | MSG_EOR;
+
+	error = socket_sendmsg(KSTOSO(ks), &msghdr, &auio, cr);
+	if (error != 0) {
+		if (sent != NULL)
+			*sent = 0;
+		return (error);
+	}
+
+	if (sent != NULL)
+		*sent = msglen - auio.uio_resid;
+	return (0);
+}
+
+/*
+ * Send msglen bytes starting at msg to the destination name/namelen.
+ *
+ * The buffer is treated as a user-space address when MSG_USERSPACE is set
+ * in flags and as a kernel address otherwise.  MSG_EOR is always OR-ed
+ * into the flags handed to socket_sendmsg().
+ *
+ * If sent is non-NULL it receives the number of bytes consumed
+ * (msglen minus the residual count) on success and 0 on any failure.
+ *
+ * Returns 0 on success, ENOTSOCK for an invalid ks, or the error from
+ * socket_sendmsg().
+ */
+int
+ksocket_sendto(ksocket_t ks, void *msg, size_t msglen, int flags,
+    struct sockaddr *name, socklen_t namelen, size_t *sent, struct cred *cr)
+{
+	struct nmsghdr mh;
+	struct uio uio;
+	struct iovec vec;
+	size_t nsent = 0;
+	int rc = ENOTSOCK;
+
+	if (KSOCKET_VALID(ks)) {
+		vec.iov_base = msg;
+		vec.iov_len = msglen;
+
+		bzero(&uio, sizeof (uio));
+		uio.uio_iov = &vec;
+		uio.uio_iovcnt = 1;
+		uio.uio_loffset = 0;
+		uio.uio_resid = msglen;
+		uio.uio_segflg = (flags & MSG_USERSPACE) ?
+		    UIO_USERSPACE : UIO_SYSSPACE;
+		uio.uio_extflg = UIO_COPY_DEFAULT;
+		uio.uio_limit = 0;
+		uio.uio_fmode = KSOCKET_FMODE(ks);
+
+		mh.msg_name = (char *)name;
+		mh.msg_namelen = namelen;
+		mh.msg_iov = &vec;
+		mh.msg_iovlen = 1;
+		mh.msg_control = NULL;
+		mh.msg_controllen = 0;
+		mh.msg_flags = flags | MSG_EOR;
+
+		rc = socket_sendmsg(KSTOSO(ks), &mh, &uio, cr);
+		if (rc == 0)
+			nsent = msglen - uio.uio_resid;
+	}
+
+	/* Every exit reports a byte count: zero unless the send succeeded. */
+	if (sent != NULL)
+		*sent = nsent;
+	return (rc);
+}
+
+/*
+ * Send the data described by the iovec array in `msg' on `ks'.
+ *
+ * The total length is the sum of the iov_len fields; a negative element
+ * or a negative running total yields EINVAL.  MSG_USERSPACE in `flags'
+ * selects user-space buffers.  `msg->msg_flags' is overwritten with
+ * `flags | MSG_EOR' before the message is handed to socket_sendmsg().
+ *
+ * Returns 0, ENOTSOCK (invalid ksocket), EINVAL (bad iovec lengths) or
+ * the socket_sendmsg() error.  A non-NULL `sent' receives the number of
+ * bytes consumed on success and 0 on every failure, including EINVAL.
+ */
+int
+ksocket_sendmsg(ksocket_t ks, struct nmsghdr *msg, int flags,
+    size_t *sent, struct cred *cr)
+{
+	int error;
+	ssize_t len;
+	int i;
+	struct uio auio;
+
+	if (!KSOCKET_VALID(ks)) {
+		if (sent != NULL)
+			*sent = 0;
+		return (ENOTSOCK);
+	}
+
+	bzero(&auio, sizeof (struct uio));
+	auio.uio_loffset = 0;
+	auio.uio_iov = msg->msg_iov;
+	auio.uio_iovcnt = msg->msg_iovlen;
+	if (flags & MSG_USERSPACE)
+		auio.uio_segflg = UIO_USERSPACE;
+	else
+		auio.uio_segflg = UIO_SYSSPACE;
+	auio.uio_extflg = UIO_COPY_DEFAULT;
+	auio.uio_limit = 0;
+	auio.uio_fmode = KSOCKET_FMODE(ks);
+	len = 0;
+	for (i = 0; i < msg->msg_iovlen; i++) {
+		ssize_t iovlen;
+		iovlen = (msg->msg_iov)[i].iov_len;
+		len += iovlen;
+		if (len < 0 || iovlen < 0) {
+			/* Keep *sent defined, as on all other errors. */
+			if (sent != NULL)
+				*sent = 0;
+			return (EINVAL);
+		}
+	}
+	auio.uio_resid = len;
+
+	msg->msg_flags = flags | MSG_EOR;
+
+	error = socket_sendmsg(KSTOSO(ks), msg, &auio, cr);
+	if (error != 0) {
+		if (sent != NULL)
+			*sent = 0;
+		return (error);
+	}
+
+	if (sent != NULL)
+		*sent = len - auio.uio_resid;
+	return (0);
+}
+
+
+/*
+ * Receive up to `msglen' bytes from `ks' into the buffer at `msg'.
+ *
+ * MSG_USERSPACE in `flags' selects a user-space buffer.  Only MSG_OOB,
+ * MSG_PEEK, MSG_WAITALL, MSG_DONTWAIT and MSG_USERSPACE are passed on to
+ * socket_recvmsg(); any other bits in `flags' are dropped.
+ *
+ * Returns 0, ENOTSOCK (invalid ksocket) or the socket_recvmsg() error.
+ * A non-NULL `recv' receives the number of bytes transferred on success
+ * and 0 on failure.
+ */
+int
+ksocket_recv(ksocket_t ks, void *msg, size_t msglen, int flags,
+    size_t *recv, struct cred *cr)
+{
+	struct iovec vec;
+	struct uio io;
+	struct nmsghdr hdr;
+	size_t nrecv = 0;
+	int error = ENOTSOCK;
+
+	if (KSOCKET_VALID(ks)) {
+		vec.iov_base = msg;
+		vec.iov_len = msglen;
+
+		bzero(&io, sizeof (struct uio));
+		io.uio_iov = &vec;
+		io.uio_iovcnt = 1;
+		io.uio_loffset = 0;
+		io.uio_resid = msglen;
+		io.uio_segflg = (flags & MSG_USERSPACE) ?
+		    UIO_USERSPACE : UIO_SYSSPACE;
+		io.uio_extflg = UIO_COPY_DEFAULT;
+		io.uio_limit = 0;
+		io.uio_fmode = KSOCKET_FMODE(ks);
+
+		hdr.msg_name = NULL;
+		hdr.msg_namelen = 0;
+		hdr.msg_control = NULL;
+		hdr.msg_controllen = 0;
+		hdr.msg_flags = flags & (MSG_OOB | MSG_PEEK | MSG_WAITALL |
+		    MSG_DONTWAIT | MSG_USERSPACE);
+
+		error = socket_recvmsg(KSTOSO(ks), &hdr, &io, cr);
+		if (error == 0)
+			nrecv = msglen - io.uio_resid;
+	}
+
+	/* Every failure path reports zero bytes received. */
+	if (recv != NULL)
+		*recv = nrecv;
+	return (error);
+}
+
+/*
+ * Receive up to `msglen' bytes from `ks' into `msg' and report the
+ * sender's address through `name' / `namelen'.
+ *
+ * Returns 0, ENOTSOCK (invalid ksocket) or the socket_recvmsg() error.
+ * A non-NULL `recv' receives the byte count on success and 0 on failure.
+ *
+ * NOTE(review): `namelen' is dereferenced unconditionally below, so it
+ * must be non-NULL; unlike `recv' it is not checked.
+ */
+int
+ksocket_recvfrom(ksocket_t ks, void *msg, size_t msglen, int flags,
+ struct sockaddr *name, socklen_t *namelen, size_t *recv, struct cred *cr)
+{
+ int error;
+ struct nmsghdr msghdr;
+ struct uio auio;
+ struct iovec iov;
+
+ if (!KSOCKET_VALID(ks)) {
+ if (recv != NULL)
+ *recv = 0;
+ return (ENOTSOCK);
+ }
+
+ iov.iov_base = msg;
+ iov.iov_len = msglen;
+
+ bzero(&auio, sizeof (struct uio));
+ auio.uio_loffset = 0;
+ auio.uio_iov = &iov;
+ auio.uio_iovcnt = 1;
+ auio.uio_resid = msglen;
+ /* MSG_USERSPACE selects a user-space buffer; default is kernel space */
+ if (flags & MSG_USERSPACE)
+ auio.uio_segflg = UIO_USERSPACE;
+ else
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_extflg = UIO_COPY_DEFAULT;
+ auio.uio_limit = 0;
+ auio.uio_fmode = KSOCKET_FMODE(ks);
+
+ /* Hand the caller's address buffer and its size to the socket layer */
+ msghdr.msg_name = (char *)name;
+ msghdr.msg_namelen = *namelen;
+ msghdr.msg_control = NULL;
+ msghdr.msg_controllen = 0;
+ /* Only these receive flags are passed down; other bits are dropped */
+ msghdr.msg_flags = flags & (MSG_OOB | MSG_PEEK | MSG_WAITALL |
+ MSG_DONTWAIT | MSG_USERSPACE);
+
+ error = socket_recvmsg(KSTOSO(ks), &msghdr, &auio, cr);
+ if (error != 0) {
+ if (recv != NULL)
+ *recv = 0;
+ return (error);
+ }
+ if (recv != NULL)
+ *recv = msglen - auio.uio_resid;
+
+ /*
+ * Copy back the address and its length as left in msghdr.
+ * NOTE(review): msghdr.msg_name was set to `name' above, so unless
+ * socket_recvmsg() replaces the pointer this bcopy copies the buffer
+ * onto itself. If the pointer can be replaced, msg_namelen is not
+ * bounded here by the caller's original *namelen -- confirm.
+ */
+ bcopy(msghdr.msg_name, name, msghdr.msg_namelen);
+ bcopy(&msghdr.msg_namelen, namelen, sizeof (msghdr.msg_namelen));
+ return (0);
+}
+
+/*
+ * Receive data from `ks' into the iovec array described by `msg'.
+ *
+ * The total buffer length is the sum of the iov_len fields; a negative
+ * element or a negative running total yields EINVAL.  `msg->msg_flags'
+ * is overwritten with the subset of `flags' limited to MSG_OOB, MSG_PEEK,
+ * MSG_WAITALL, MSG_DONTWAIT and MSG_USERSPACE before socket_recvmsg() is
+ * called.
+ *
+ * Returns 0, ENOTSOCK (invalid ksocket), EINVAL (bad iovec lengths) or
+ * the socket_recvmsg() error.  A non-NULL `recv' receives the number of
+ * bytes transferred on success and 0 on every failure, including EINVAL.
+ */
+int
+ksocket_recvmsg(ksocket_t ks, struct nmsghdr *msg, int flags, size_t *recv,
+    struct cred *cr)
+{
+	int error;
+	ssize_t len;
+	int i;
+	struct uio auio;
+
+	if (!KSOCKET_VALID(ks)) {
+		if (recv != NULL)
+			*recv = 0;
+		return (ENOTSOCK);
+	}
+
+	bzero(&auio, sizeof (struct uio));
+	auio.uio_loffset = 0;
+	auio.uio_iov = msg->msg_iov;
+	auio.uio_iovcnt = msg->msg_iovlen;
+	/*
+	 * NOTE(review): the buffer space is chosen from the caller's
+	 * incoming msg->msg_flags, whereas the other ksocket send/receive
+	 * routines test the `flags' argument -- confirm this is intended.
+	 */
+	if (msg->msg_flags & MSG_USERSPACE)
+		auio.uio_segflg = UIO_USERSPACE;
+	else
+		auio.uio_segflg = UIO_SYSSPACE;
+	auio.uio_extflg = UIO_COPY_DEFAULT;
+	auio.uio_limit = 0;
+	auio.uio_fmode = KSOCKET_FMODE(ks);
+	len = 0;
+
+	for (i = 0; i < msg->msg_iovlen; i++) {
+		ssize_t iovlen;
+		iovlen = (msg->msg_iov)[i].iov_len;
+		len += iovlen;
+		if (len < 0 || iovlen < 0) {
+			/* Keep *recv defined, as on all other errors. */
+			if (recv != NULL)
+				*recv = 0;
+			return (EINVAL);
+		}
+	}
+	auio.uio_resid = len;
+
+	msg->msg_flags = flags & (MSG_OOB | MSG_PEEK | MSG_WAITALL |
+	    MSG_DONTWAIT | MSG_USERSPACE);
+
+	error = socket_recvmsg(KSTOSO(ks), msg, &auio, cr);
+	if (error != 0) {
+		if (recv != NULL)
+			*recv = 0;
+		return (error);
+	}
+	if (recv != NULL)
+		*recv = len - auio.uio_resid;
+	return (0);
+}
+
+/*
+ * Shut down part or all of the connection on `ks' as selected by `how'.
+ *
+ * Returns ENOTSOCK when `ks' fails KSOCKET_VALID(); otherwise returns the
+ * result of socket_shutdown().
+ */
+int
+ksocket_shutdown(ksocket_t ks, int how, struct cred *cr)
+{
+	if (KSOCKET_VALID(ks))
+		return (socket_shutdown(KSTOSO(ks), how, cr));
+
+	return (ENOTSOCK);
+}
+
+/*
+ * Close and destroy the kernel socket `ks'.
+ *
+ * The socket is marked SS_CLOSING under so_lock.  If other references
+ * exist (so_count > 1) all of the socket's condition variables are
+ * broadcast, and the caller then waits on so_closing_cv until so_count
+ * has dropped to 1.  Any registered callbacks are cleared before the
+ * socket is handed to socket_close() and socket_destroy().
+ *
+ * Returns 0 on success, or ENOTSOCK when `ks' is NULL or otherwise fails
+ * KSOCKET_VALID() (not a kernel socket, or already closing).
+ */
+int
+ksocket_close(ksocket_t ks, struct cred *cr)
+{
+	struct sonode *so;
+
+	/*
+	 * Reject NULL before touching the sonode; KSOCKET_VALID() below
+	 * also tests for NULL, but only after so_lock has been taken.
+	 */
+	if (ks == NULL)
+		return (ENOTSOCK);
+
+	so = KSTOSO(ks);
+
+	mutex_enter(&so->so_lock);
+
+	if (!KSOCKET_VALID(ks)) {
+		mutex_exit(&so->so_lock);
+		return (ENOTSOCK);
+	}
+
+	so->so_state |= SS_CLOSING;
+
+	if (so->so_count > 1) {
+		mutex_enter(&so->so_acceptq_lock);
+		cv_broadcast(&so->so_acceptq_cv);
+		mutex_exit(&so->so_acceptq_lock);
+		cv_broadcast(&so->so_rcv_cv);
+		cv_broadcast(&so->so_state_cv);
+		cv_broadcast(&so->so_want_cv);
+		cv_broadcast(&so->so_snd_cv);
+		cv_broadcast(&so->so_copy_cv);
+	}
+	while (so->so_count > 1)
+		cv_wait(&so->so_closing_cv, &so->so_lock);
+
+	/*
+	 * Remove callbacks, if any.  This is done directly rather than
+	 * through ksocket_setcallbacks(): that routine rejects the socket
+	 * with ENOTSOCK once SS_CLOSING is set, so calling it here would
+	 * leave the callbacks installed.
+	 */
+	bzero(&(so->so_ksock_callbacks), sizeof (ksocket_callbacks_t));
+	so->so_ksock_cb_arg = NULL;
+
+	mutex_exit(&so->so_lock);
+
+	(void) socket_close(so, 0, cr);
+	socket_destroy(so);
+
+	return (0);
+}
+
+/*
+ * Retrieve the local address bound to `ks'.
+ *
+ * Returns ENOTSOCK for an invalid ksocket, EFAULT when `addrlen' is NULL
+ * or when `addr' is NULL while `*addrlen' is non-zero, and otherwise the
+ * result of socket_getsockname().
+ */
+int
+ksocket_getsockname(ksocket_t ks, struct sockaddr *addr, socklen_t *addrlen,
+    struct cred *cr)
+{
+	if (!KSOCKET_VALID(ks))
+		return (ENOTSOCK);
+
+	if (addrlen == NULL)
+		return (EFAULT);
+
+	/* A NULL buffer is acceptable only with a zero length. */
+	if (addr == NULL && *addrlen != 0)
+		return (EFAULT);
+
+	return (socket_getsockname(KSTOSO(ks), addr, addrlen, cr));
+}
+
+/*
+ * Retrieve the address of the peer connected to `ks'.
+ *
+ * Returns ENOTSOCK for an invalid ksocket, EFAULT when `addrlen' is NULL
+ * or when `addr' is NULL while `*addrlen' is non-zero, and otherwise the
+ * result of socket_getpeername() (called with B_FALSE as its fourth
+ * argument).
+ */
+int
+ksocket_getpeername(ksocket_t ks, struct sockaddr *addr, socklen_t *addrlen,
+    struct cred *cr)
+{
+	if (!KSOCKET_VALID(ks))
+		return (ENOTSOCK);
+
+	if (addrlen == NULL)
+		return (EFAULT);
+
+	/* A NULL buffer is acceptable only with a zero length. */
+	if (addr == NULL && *addrlen != 0)
+		return (EFAULT);
+
+	return (socket_getpeername(KSTOSO(ks), addr, addrlen, B_FALSE, cr));
+}
+
+/*
+ * Fetch the value of socket option `optname' at `level' for `ks'.
+ *
+ * Returns ENOTSOCK for an invalid ksocket, EFAULT when `optlen' is NULL,
+ * EINVAL when `*optlen' exceeds SO_MAXARGSIZE, and otherwise the result
+ * of socket_getsockopt().
+ */
+int
+ksocket_getsockopt(ksocket_t ks, int level, int optname, void *optval,
+    int *optlen, struct cred *cr)
+{
+	int error;
+
+	if (!KSOCKET_VALID(ks)) {
+		error = ENOTSOCK;
+	} else if (optlen == NULL) {
+		error = EFAULT;
+	} else if (*optlen > SO_MAXARGSIZE) {
+		error = EINVAL;
+	} else {
+		error = socket_getsockopt(KSTOSO(ks), level, optname, optval,
+		    (socklen_t *)optlen, 0, cr);
+	}
+
+	return (error);
+}
+
+/*
+ * Set socket option `optname' at `level' on `ks'.
+ *
+ * A NULL `optval' is passed down with a length of zero regardless of the
+ * `optlen' supplied.  Returns ENOTSOCK for an invalid ksocket, otherwise
+ * the result of socket_setsockopt().
+ */
+int
+ksocket_setsockopt(ksocket_t ks, int level, int optname, const void *optval,
+    int optlen, struct cred *cr)
+{
+	t_uscalar_t len;
+
+	if (!KSOCKET_VALID(ks))
+		return (ENOTSOCK);
+
+	len = (optval == NULL) ? (t_uscalar_t)0 : (t_uscalar_t)optlen;
+
+	return (socket_setsockopt(KSTOSO(ks), level, optname, optval,
+	    len, cr));
+}
+
+/*
+ * Install, update or remove the upcall set for `ks'.
+ *
+ * With a NULL `cb' every callback and the callback argument are cleared
+ * (`arg' must then be NULL too, otherwise EFAULT is returned).  With a
+ * non-NULL `cb' each callback whose KSOCKET_CB_* bit is set in
+ * cb->ksock_cb_flags is copied into the socket via SETCALLBACK(), and
+ * `arg' becomes the callback argument.  All updates happen under so_lock.
+ *
+ * Returns 0, ENOTSOCK (invalid ksocket) or EFAULT.
+ */
+/* ARGSUSED */
+int
+ksocket_setcallbacks(ksocket_t ks, ksocket_callbacks_t *cb, void *arg,
+    struct cred *cr)
+{
+	struct sonode *so;
+
+	if (!KSOCKET_VALID(ks))
+		return (ENOTSOCK);
+
+	if (cb == NULL && arg != NULL)
+		return (EFAULT);
+
+	so = KSTOSO(ks);
+
+	mutex_enter(&so->so_lock);
+	if (cb != NULL) {
+		/* SETCALLBACK() expands to a complete if statement. */
+		SETCALLBACK(so, cb, connected, KSOCKET_CB_CONNECTED)
+		SETCALLBACK(so, cb, connectfailed, KSOCKET_CB_CONNECTFAILED)
+		SETCALLBACK(so, cb, disconnected, KSOCKET_CB_DISCONNECTED)
+		SETCALLBACK(so, cb, newdata, KSOCKET_CB_NEWDATA)
+		SETCALLBACK(so, cb, newconn, KSOCKET_CB_NEWCONN)
+		SETCALLBACK(so, cb, cansend, KSOCKET_CB_CANSEND)
+		SETCALLBACK(so, cb, oobdata, KSOCKET_CB_OOBDATA)
+		SETCALLBACK(so, cb, cantsendmore, KSOCKET_CB_CANTSENDMORE)
+		SETCALLBACK(so, cb, cantrecvmore, KSOCKET_CB_CANTRECVMORE)
+		so->so_ksock_cb_arg = arg;
+	} else {
+		bzero(&(so->so_ksock_callbacks), sizeof (ksocket_callbacks_t));
+		so->so_ksock_cb_arg = NULL;
+	}
+	mutex_exit(&so->so_lock);
+
+	return (0);
+}
+
+/*
+ * Issue ioctl `cmd' on the kernel socket `ks'.
+ *
+ * FIOASYNC, SIOCSPGRP, FIOSETOWN, SIOCGPGRP and FIOGETOWN, as well as any
+ * command whose upper 24 bits equal STR, are refused with EOPNOTSUPP.
+ * Everything else is forwarded to socket_ioctl() with FKIOCTL added to
+ * the socket's file mode.
+ *
+ * Returns ENOTSOCK for an invalid ksocket, EOPNOTSUPP as described above,
+ * or the result of socket_ioctl().
+ */
+int
+ksocket_ioctl(ksocket_t ks, int cmd, intptr_t arg, int *rvalp, struct cred *cr)
+{
+	if (!KSOCKET_VALID(ks))
+		return (ENOTSOCK);
+
+	switch (cmd) {
+	case FIOASYNC:
+	case SIOCSPGRP:
+	case FIOSETOWN:
+	case SIOCGPGRP:
+	case FIOGETOWN:
+		return (EOPNOTSUPP);
+	default:
+		break;
+	}
+
+	/* STREAMS ioctls are not supported */
+	if ((cmd & 0xffffff00U) == STR)
+		return (EOPNOTSUPP);
+
+	return (socket_ioctl(KSTOSO(ks), cmd, arg,
+	    KSOCKET_FMODE(ks) | FKIOCTL, cr, rvalp));
+}
+
+/*
+ * Send the mblk chain `*mpp' on the kernel socket `ks'.
+ *
+ * When MSG_MBLK_QUICKRELE is set in `flags' the socket's
+ * SO_SND_COPYAVOID option is queried first: a getsockopt failure is
+ * returned as is, a value of zero yields ECANCELED, and otherwise every
+ * mblk in the b_cont chain has STRUIO_ZC set in db_struioflag before the
+ * chain is passed to socket_sendmblk().
+ *
+ * Returns ENOTSOCK for an invalid ksocket, the errors described above,
+ * or the result of socket_sendmblk().
+ */
+int
+ksocket_sendmblk(ksocket_t ks, struct nmsghdr *msg, int flags,
+    mblk_t **mpp, cred_t *cr)
+{
+	struct sonode *so;
+	int i_val;
+	socklen_t val_len;
+	mblk_t *mp = *mpp;
+	int error;
+
+	if (!KSOCKET_VALID(ks))
+		return (ENOTSOCK);
+
+	so = KSTOSO(ks);
+
+	if (flags & MSG_MBLK_QUICKRELE) {
+		/*
+		 * The option length is an in/out argument; tell the socket
+		 * layer how large the result buffer is instead of passing
+		 * uninitialized stack contents.
+		 *
+		 * NOTE(review): this query uses CRED() while the send below
+		 * uses the caller-supplied `cr' -- confirm that is intended.
+		 */
+		val_len = sizeof (i_val);
+		error = socket_getsockopt(so, SOL_SOCKET, SO_SND_COPYAVOID,
+		    &i_val, &val_len, 0, CRED());
+		if (error != 0)
+			return (error);
+
+		/* Zero copy is not enabled */
+		if (i_val == 0)
+			return (ECANCELED);
+
+		for (; mp != NULL; mp = mp->b_cont)
+			mp->b_datap->db_struioflag |= STRUIO_ZC;
+	}
+
+	error = socket_sendmblk(so, msg, flags, cr, mpp);
+
+	return (error);
+}
+
+
+/*
+ * Take an additional reference on `ks' by incrementing so_count.
+ *
+ * so_lock is acquired around the increment unless the calling thread
+ * already owns it.
+ */
+void
+ksocket_hold(ksocket_t ks)
+{
+	struct sonode *so = KSTOSO(ks);
+	int need_lock = !mutex_owned(&so->so_lock);
+
+	if (need_lock)
+		mutex_enter(&so->so_lock);
+
+	so->so_count++;
+
+	if (need_lock)
+		mutex_exit(&so->so_lock);
+}
+
+/*
+ * Drop a reference on `ks' taken with ksocket_hold().
+ *
+ * Panics if so_count is below 2 on entry.  so_lock is acquired around
+ * the decrement unless the calling thread already owns it.  When the
+ * count falls to 1, so_closing_cv is signalled.
+ */
+void
+ksocket_rele(ksocket_t ks)
+{
+	struct sonode *so = KSTOSO(ks);
+	int need_lock;
+
+	/*
+	 * When so_count equals 1 means no thread working on this ksocket
+	 */
+	if (so->so_count < 2)
+		cmn_err(CE_PANIC, "ksocket_rele: sonode ref count 0 or 1");
+
+	need_lock = !mutex_owned(&so->so_lock);
+	if (need_lock)
+		mutex_enter(&so->so_lock);
+
+	so->so_count--;
+	if (so->so_count == 1)
+		cv_signal(&so->so_closing_cv);
+
+	if (need_lock)
+		mutex_exit(&so->so_lock);
+}
diff --git a/usr/src/uts/common/io/ksocket/ksocket_impl.h b/usr/src/uts/common/io/ksocket/ksocket_impl.h
new file mode 100644
index 0000000000..ac5251540f
--- /dev/null
+++ b/usr/src/uts/common/io/ksocket/ksocket_impl.h
@@ -0,0 +1,74 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_INET_KSOCKET_KSOCKET_IMPL_H
+#define	_INET_KSOCKET_KSOCKET_IMPL_H
+
+/* Convert between the opaque ksocket_t handle and its sonode. */
+#define	KSTOSO(ks)	((struct sonode *)(ks))
+#define	SOTOKS(so)	((ksocket_t)(uintptr_t)(so))
+
+/* Non-zero when the sonode has SM_KERNEL set in so_mode. */
+#define	IS_KERNEL_SOCKET(so)	((so)->so_mode & SM_KERNEL)
+
+#define	KSOCKET_MOD_VERSION	"kernel socket module"
+
+/*
+ * Map the lower-case callback name used as the ksock_cb_<name> field
+ * suffix to its KSOCKET_EV_* event code, so KSOCKET_CALLBACK() can derive
+ * both from a single token.
+ */
+#define	__KSOCKET_EV_connected		KSOCKET_EV_CONNECTED
+#define	__KSOCKET_EV_connectfailed	KSOCKET_EV_CONNECTFAILED
+#define	__KSOCKET_EV_disconnected	KSOCKET_EV_DISCONNECTED
+#define	__KSOCKET_EV_oobdata		KSOCKET_EV_OOBDATA
+#define	__KSOCKET_EV_newdata		KSOCKET_EV_NEWDATA
+#define	__KSOCKET_EV_newconn		KSOCKET_EV_NEWCONN
+#define	__KSOCKET_EV_cansend		KSOCKET_EV_CANSEND
+#define	__KSOCKET_EV_cantsendmore	KSOCKET_EV_CANTSENDMORE
+#define	__KSOCKET_EV_cantrecvmore	KSOCKET_EV_CANTRECVMORE
+#define	__KSOCKET_EV_error		KSOCKET_EV_ERROR
+
+/*
+ * Invoke the socket's `cbfn' callback, if one is installed, passing the
+ * ksocket handle, the matching event code, the registered callback
+ * argument and `arg'.  Expands to a bare if statement.
+ */
+#define	KSOCKET_CALLBACK(so, cbfn, arg)					\
+	if ((so)->so_ksock_callbacks.ksock_cb_##cbfn != NULL) {		\
+		(*(so)->so_ksock_callbacks.ksock_cb_##cbfn)(SOTOKS(so),	\
+		    __KSOCKET_EV_##cbfn, (so)->so_ksock_cb_arg, (arg));	\
+	}
+
+/*
+ * File mode flags for operations on a ksocket: always FREAD | FWRITE,
+ * plus FNDELAY when the socket state has SS_NDELAY or SS_NONBLOCK set.
+ * The whole expansion is parenthesized so the macro can be combined with
+ * any operator at the point of use.
+ */
+#define	KSOCKET_FMODE(so)	(FREAD | FWRITE |			\
+	((KSTOSO(so)->so_state & (SS_NDELAY | SS_NONBLOCK)) ? FNDELAY : 0))
+
+/*
+ * A ksocket handle is usable when it is non-NULL, refers to a sonode with
+ * SM_KERNEL set, and that sonode is not marked SS_CLOSING.
+ */
+#define	KSOCKET_VALID(ks)						\
+	((ks) != NULL && (KSTOSO(ks))->so_mode & SM_KERNEL &&		\
+	    !((KSTOSO(ks))->so_state & SS_CLOSING))
+
+/*
+ * If `cbflg' is set in cb->ksock_cb_flags, copy callback `cbfn' from `cb'
+ * into the socket and set `cbflg' in the socket's callback flags when the
+ * function pointer is non-NULL, or clear it when the pointer is NULL.
+ *
+ * Expands to a complete if statement and is invoked without a trailing
+ * semicolon, so it must not be wrapped in do { } while (0).
+ */
+#define	SETCALLBACK(so, cb, cbfn, cbflg)				\
+	if ((cb)->ksock_cb_flags & (cbflg)) {				\
+		(so)->so_ksock_callbacks.ksock_cb_##cbfn		\
+		    = (cb)->ksock_cb_##cbfn;				\
+		if ((cb)->ksock_cb_##cbfn == NULL)			\
+			(so)->so_ksock_callbacks.ksock_cb_flags		\
+			    &= ~(cbflg);				\
+		else							\
+			(so)->so_ksock_callbacks.ksock_cb_flags		\
+			    |= (cbflg);					\
+	}
+
+
+#endif /* _INET_KSOCKET_KSOCKET_IMPL_H */
diff --git a/usr/src/uts/common/io/ksocket/ksocket_mod.c b/usr/src/uts/common/io/ksocket/ksocket_mod.c
new file mode 100644
index 0000000000..da3b4091a5
--- /dev/null
+++ b/usr/src/uts/common/io/ksocket/ksocket_mod.c
@@ -0,0 +1,57 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <sys/sunddi.h>
+#include <sys/errno.h>
+#include <sys/modctl.h>
+
+#include <io/ksocket/ksocket_impl.h>
+
+/*
+ * Module linkage for the ksocket module: a single "misc" module entry
+ * described by KSOCKET_MOD_VERSION.
+ */
+static struct modlmisc modlmisc = {
+ &mod_miscops, KSOCKET_MOD_VERSION
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1, (void *)&modlmisc, NULL
+};
+
+/* Module load entry point: returns the result of mod_install(). */
+int
+_init(void)
+{
+ return (mod_install(&modlinkage));
+}
+
+/* Module unload entry point: returns the result of mod_remove(). */
+int
+_fini(void)
+{
+ return (mod_remove(&modlinkage));
+}
+
+/* Module information entry point: returns the result of mod_info(). */
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
diff --git a/usr/src/uts/common/io/scsi/adapters/iscsi/iscsi.h b/usr/src/uts/common/io/scsi/adapters/iscsi/iscsi.h
index cc42247897..6d59ce3810 100644
--- a/usr/src/uts/common/io/scsi/adapters/iscsi/iscsi.h
+++ b/usr/src/uts/common/io/scsi/adapters/iscsi/iscsi.h
@@ -546,7 +546,7 @@ typedef struct iscsi_conn {
kcondvar_t conn_state_change;
boolean_t conn_state_destroy;
- struct sonode *conn_socket; /* aka. kernel net. socket */
+ void *conn_socket; /* kernel socket */
/* base connection information */
iscsi_sockaddr_t conn_base_addr;
@@ -846,7 +846,7 @@ typedef struct iscsi_network {
int (*connect)(void *, struct sockaddr *, int, int, int);
int (*listen)(void *, int);
void* (*accept)(void *, struct sockaddr *, int *);
- int (*getsockname)(void *);
+ int (*getsockname)(void *, struct sockaddr *, socklen_t *);
int (*getsockopt)(void *, int, int, void *, int *, int);
int (*setsockopt)(void *, int, int, void *, int);
int (*shutdown)(void *, int);
diff --git a/usr/src/uts/common/io/scsi/adapters/iscsi/iscsi_ioctl.c b/usr/src/uts/common/io/scsi/adapters/iscsi/iscsi_ioctl.c
index e5967dab8c..611b2bc967 100644
--- a/usr/src/uts/common/io/scsi/adapters/iscsi/iscsi_ioctl.c
+++ b/usr/src/uts/common/io/scsi/adapters/iscsi/iscsi_ioctl.c
@@ -237,12 +237,16 @@ iscsi_ioctl_conn_props_get(iscsi_hba_t *ihp, iscsi_conn_props_t *cp)
iscsi_sess_t *isp;
iscsi_conn_t *icp;
boolean_t rtn;
+ struct sockaddr_in6 t_addr;
+ socklen_t t_addrlen;
/* Let's check the version. */
if (cp->cp_vers != ISCSI_INTERFACE_VERSION) {
return (B_FALSE);
}
+ bzero(&t_addr, sizeof (struct sockaddr_in6));
+ t_addrlen = sizeof (struct sockaddr_in6);
/* Let's find the session. */
rw_enter(&ihp->hba_sess_list_rwlock, RW_READER);
if (iscsi_sess_get(cp->cp_sess_oid, ihp, &isp) != 0) {
@@ -263,18 +267,15 @@ iscsi_ioctl_conn_props_get(iscsi_hba_t *ihp, iscsi_conn_props_t *cp)
ASSERT(icp->conn_sig == ISCSI_SIG_CONN);
if (icp->conn_oid == cp->cp_oid) {
-
- if (icp->conn_socket->so_laddr.soa_len <=
- sizeof (cp->cp_local)) {
- bcopy(icp->conn_socket->so_laddr.soa_sa,
- &cp->cp_local,
- icp->conn_socket->so_laddr.soa_len);
+ iscsi_net->getsockname(icp->conn_socket,
+ (struct sockaddr *)&t_addr, &t_addrlen);
+ if (t_addrlen <= sizeof (cp->cp_local)) {
+ bcopy(&t_addr, &cp->cp_local, t_addrlen);
}
- if (icp->conn_socket->so_faddr.soa_len <=
- sizeof (cp->cp_peer)) {
- bcopy(icp->conn_socket->so_faddr.soa_sa,
- &cp->cp_peer,
- icp->conn_socket->so_faddr.soa_len);
+ ksocket_getpeername((ksocket_t)(icp->conn_socket),
+ (struct sockaddr *)&t_addr, &t_addrlen, CRED());
+ if (t_addrlen <= sizeof (cp->cp_peer)) {
+ bcopy(&t_addr, &cp->cp_peer, t_addrlen);
}
if (icp->conn_state == ISCSI_CONN_STATE_LOGGED_IN) {
diff --git a/usr/src/uts/common/io/scsi/adapters/iscsi/iscsi_login.c b/usr/src/uts/common/io/scsi/adapters/iscsi/iscsi_login.c
index 8a1c1914b4..c1a201f73c 100644
--- a/usr/src/uts/common/io/scsi/adapters/iscsi/iscsi_login.c
+++ b/usr/src/uts/common/io/scsi/adapters/iscsi/iscsi_login.c
@@ -1934,10 +1934,12 @@ iscsi_login_failure_str(uchar_t status_class, uchar_t status_detail)
static iscsi_status_t
iscsi_login_connect(iscsi_conn_t *icp)
{
- iscsi_hba_t *ihp;
- iscsi_sess_t *isp;
- struct sockaddr *addr;
- struct sonode *so = NULL;
+ iscsi_hba_t *ihp;
+ iscsi_sess_t *isp;
+ struct sockaddr *addr;
+ struct sockaddr_in6 t_addr;
+ struct sonode *so = NULL;
+ socklen_t t_addrlen;
ASSERT(icp != NULL);
isp = icp->conn_sess;
@@ -1946,6 +1948,8 @@ iscsi_login_connect(iscsi_conn_t *icp)
ASSERT(ihp != NULL);
addr = &icp->conn_curr_addr.sin;
+ t_addrlen = sizeof (struct sockaddr_in6);
+ bzero(&t_addr, sizeof (struct sockaddr_in6));
so = iscsi_net->socket(addr->sa_family, SOCK_STREAM, 0);
if (so == NULL) {
cmn_err(CE_WARN, "iscsi connection(%u) unable "
@@ -1982,7 +1986,8 @@ iscsi_login_connect(iscsi_conn_t *icp)
}
icp->conn_socket = so;
- if (iscsi_net->getsockname(icp->conn_socket) != 0) {
+ if (iscsi_net->getsockname(icp->conn_socket,
+ (struct sockaddr *)&t_addr, &t_addrlen) != 0) {
cmn_err(CE_NOTE, "iscsi connection(%u) failed to get "
"socket information", icp->conn_oid);
}
diff --git a/usr/src/uts/common/io/scsi/adapters/iscsi/iscsi_net.c b/usr/src/uts/common/io/scsi/adapters/iscsi/iscsi_net.c
index 23e64684a1..1f06106bf2 100644
--- a/usr/src/uts/common/io/scsi/adapters/iscsi/iscsi_net.c
+++ b/usr/src/uts/common/io/scsi/adapters/iscsi/iscsi_net.c
@@ -34,8 +34,9 @@
#include <sys/fs/dv_node.h> /* declares: devfs_lookupname */
#include <sys/bootconf.h>
#include <sys/bootprops.h>
-
+#include <netinet/in.h>
#include "iscsi.h"
+#include <sys/ksocket.h>
/*
* This is a high level description of the default
@@ -60,42 +61,42 @@
* The following listing describes the iscsi_net
* entry points:
*
- * socket - Creates TCP/IP socket connection. In the
- * default implementation creates a sonode
- * via the sockfs kernel layer.
- * bind - Performs standard TCP/IP BSD operation. In
- * the default implementation this only act
- * as a soft binding based on the IP and routing
- * tables. It would be preferred if this was
- * a hard binding but that is currently not
- * possible with Solaris's networking stack.
- * connect - Performs standard TCP/IP BSD operation. This
- * establishes the TCP SYN to the peer IP address.
- * listen - Performs standard TCP/IP BSD operation. This
- * listens for incoming peer connections.
- * accept - Performs standard TCP/IP BSD operation. This
- * accepts incoming peer connections.
- * shutdown - This disconnects the TCP/IP connection while
- * maintaining the resources.
- * close - This disconnects the TCP/IP connection and
- * releases the resources.
+ * socket - Creates TCP/IP socket connection. In the
+ * default implementation creates a sonode
+ * via the sockfs kernel layer.
+ * bind - Performs standard TCP/IP BSD operation. In
+ * the default implementation this only act
+ * as a soft binding based on the IP and routing
+ * tables. It would be preferred if this was
+ * a hard binding but that is currently not
+ * possible with Solaris's networking stack.
+ * connect - Performs standard TCP/IP BSD operation. This
+ * establishes the TCP SYN to the peer IP address.
+ * listen - Performs standard TCP/IP BSD operation. This
+ * listens for incoming peer connections.
+ * accept - Performs standard TCP/IP BSD operation. This
+ * accepts incoming peer connections.
+ * shutdown - This disconnects the TCP/IP connection while
+ * maintaining the resources.
+ * close - This disconnects the TCP/IP connection and
+ * releases the resources.
*
- * getsockopt - Gets socket option for specified socket.
- * setsockopt - Sets socket option for specified socket.
+ * getsockopt - Gets socket option for specified socket.
+ * setsockopt - Sets socket option for specified socket.
*
* The current socket options that are used by the initiator
* are listed below.
*
- * TCP_CONN_NOTIFY_THRESHOLD
- * TCP_CONN_ABORT_THRESHOLD
- * TCP_ABORT_THRESHOLD
- * TCP_NODELAY
- * SO_RCVBUF
- * SO_SNDBUF
+ * TCP_CONN_NOTIFY_THRESHOLD
+ * TCP_CONN_ABORT_THRESHOLD
+ * TCP_ABORT_THRESHOLD
+ * TCP_NODELAY
+ * SO_RCVBUF
+ * SO_SNDBUF
*
* iscsi_net_poll - Poll socket interface for a specified amount
- * of data. If data not received in timeout
- * period fail request.
+ * of data. If data not received in timeout
+ * period fail request.
* iscsi_net_sendmsg - Send message on socket connection
* iscsi_net_recvmsg - Receive message on socket connection
*
@@ -109,8 +110,8 @@
* generate or validate the iSCSI
* header digest CRC.
* ISCSI_NET_DATA_DIGESt - The interface should either
- * generate or validate the iSCSI
- * data digest CRC.
+ * generate or validate the iSCSI
+ * data digest CRC.
*/
@@ -144,25 +145,18 @@ const int is_incoming_opcode_invalid[256] = {
/* 0xEX */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* 0xFX */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
};
-/*
- * Define macros to manipulate snode, vnode, and open device flags
- */
-#define VTYP_VALID(i) (((i) == VCHR) || ((i) == VBLK))
-#define STYP_VALID(i) (((i) == S_IFCHR) || ((i) == S_IFBLK))
-#define STYP_TO_VTYP(i) (((i) == S_IFCHR) ? VCHR : VBLK)
#define IP_4_BITS 32
#define IP_6_BITS 128
extern int modrootloaded;
-extern ib_boot_prop_t *iscsiboot_prop;
+extern ib_boot_prop_t *iscsiboot_prop;
/* prototypes */
/* for iSCSI boot */
static int net_up = 0;
static iscsi_status_t iscsi_net_interface();
-static int iscsi_ldi_vp_from_name(char *path, vnode_t **vpp);
/* boot prototypes end */
static void * iscsi_net_socket(int domain, int type, int protocol);
@@ -173,7 +167,7 @@ static int iscsi_net_connect(void *socket, struct sockaddr *
static int iscsi_net_listen(void *socket, int backlog);
static void * iscsi_net_accept(void *socket, struct sockaddr *addr,
int *addr_len);
-static int iscsi_net_getsockname(void *socket);
+static int iscsi_net_getsockname(void *socket, struct sockaddr *, socklen_t *);
static int iscsi_net_getsockopt(void *socket, int level,
int option_name, void *option_val, int *option_len, int flags);
static int iscsi_net_setsockopt(void *socket, int level,
@@ -198,7 +192,7 @@ static void iscsi_net_set_postconnect_options(void *socket);
/*
* +--------------------------------------------------------------------+
- * | network interface registration functions |
+ * | network interface registration functions |
* +--------------------------------------------------------------------+
*/
@@ -287,7 +281,7 @@ iscsi_net_set_postconnect_options(void *socket)
/*
* +--------------------------------------------------------------------+
- * | register network interfaces |
+ * | register network interfaces |
* +--------------------------------------------------------------------+
*/
@@ -297,93 +291,53 @@ iscsi_net_set_postconnect_options(void *socket)
static void *
iscsi_net_socket(int domain, int type, int protocol)
{
- vnode_t *dvp = NULL,
- *vp = NULL;
- struct snode *csp = NULL;
- int err = 0;
- major_t maj;
+ ksocket_t socket;
+ int err = 0;
if (!modrootloaded && !net_up && iscsiboot_prop) {
if (iscsi_net_interface() == ISCSI_STATUS_SUCCESS)
net_up = 1;
}
- /* ---- solookup: start ---- */
- if ((vp = solookup(domain, type, protocol, NULL, &err)) == NULL) {
-
- /*
- * solookup calls sogetvp if the vp is not found in
- * the cache. Since the call to sogetvp is hardwired
- * to use USERSPACE and declared static we'll do the
- * work here instead.
- */
- if (!modrootloaded) {
- err = iscsi_ldi_vp_from_name("/devices/pseudo/tcp@0:"
- "tcp", &vp);
- } else {
- err = lookupname(type == SOCK_STREAM ? "/dev/tcp" :
- "/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
- }
- if (err) {
- return (NULL);
- }
+ err = ksocket_socket(&socket, domain, type, protocol, KSOCKET_SLEEP,
+ CRED());
+ if (!err)
+ return ((void *)socket);
+ else
+ return (NULL);
- /* ---- check that it is the correct vnode ---- */
- if (vp->v_type != VCHR) {
- VN_RELE(vp);
- return (NULL);
- }
-
- csp = VTOS(VTOS(vp)->s_commonvp);
- if (!(csp->s_flag & SDIPSET)) {
- char *pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
- err = ddi_dev_pathname(vp->v_rdev, S_IFCHR,
- pathname);
- if (err == 0) {
- err = devfs_lookupname(pathname, NULLVPP,
- &dvp);
- }
- VN_RELE(vp);
- kmem_free(pathname, MAXPATHLEN);
- if (err != 0) {
- return (NULL);
- }
- vp = dvp;
- }
-
- maj = getmajor(vp->v_rdev);
- if (!STREAMSTAB(maj)) {
- VN_RELE(vp);
- return (NULL);
- }
- }
- /* ---- solookup: end ---- */
- return (socreate(vp, domain, type, protocol, SOV_DEFAULT, NULL, &err));
}
/*
* iscsi_net_bind - bind socket to a specific sockaddr
*/
+/* ARGSUSED */
static int
iscsi_net_bind(void *socket, struct sockaddr *name, int name_len,
int backlog, int flags)
{
- return (sobind((struct sonode *)socket, name, name_len,
- backlog, flags));
+ ksocket_t ks = (ksocket_t)socket;
+ int error;
+ error = ksocket_bind(ks, name, name_len, CRED());
+ if (error == 0 && backlog != 0)
+ error = ksocket_listen(ks, backlog, CRED());
+
+ return (error);
}
/*
* iscsi_net_connect - connect socket to peer sockaddr
*/
+/* ARGSUSED */
static int
iscsi_net_connect(void *socket, struct sockaddr *name, int name_len,
int fflag, int flags)
{
+ ksocket_t ks = (ksocket_t)socket;
int rval;
iscsi_net_set_preconnect_options(socket);
- rval = soconnect((struct sonode *)socket, name,
- name_len, fflag, flags);
+ rval = ksocket_connect(ks, name, name_len, CRED());
iscsi_net_set_postconnect_options(socket);
return (rval);
@@ -395,7 +349,8 @@ iscsi_net_connect(void *socket, struct sockaddr *name, int name_len,
static int
iscsi_net_listen(void *socket, int backlog)
{
- return (solisten((struct sonode *)socket, backlog));
+ ksocket_t ks = (ksocket_t)socket;
+ return (ksocket_listen(ks, backlog, CRED()));
}
/*
@@ -404,41 +359,35 @@ iscsi_net_listen(void *socket, int backlog)
static void *
iscsi_net_accept(void *socket, struct sockaddr *addr, int *addr_len)
{
- struct sonode *listening_socket;
-
- (void) soaccept((struct sonode *)socket,
- ((struct sonode *)socket)->so_flag,
- &listening_socket);
- if (listening_socket != NULL) {
- bcopy(listening_socket->so_faddr_sa, addr,
- (socklen_t)listening_socket->so_faddr_len);
- *addr_len = listening_socket->so_faddr_len;
- } else {
- *addr_len = 0;
- }
+ ksocket_t listen_ks = NULL; /* stays NULL if ksocket_accept() fails */
+ ksocket_t ks = (ksocket_t)socket;
- return ((void *)listening_socket);
+ ksocket_accept(ks, addr, (socklen_t *)addr_len, &listen_ks, CRED());
+
+ return ((void *)listen_ks);
}
/*
* iscsi_net_getsockname -
*/
static int
-iscsi_net_getsockname(void *socket)
+iscsi_net_getsockname(void *socket, struct sockaddr *addr, socklen_t *addrlen)
{
- return (sogetsockname((struct sonode *)socket));
+ ksocket_t ks = (ksocket_t)socket;
+ return (ksocket_getsockname(ks, addr, addrlen, CRED()));
}
/*
* iscsi_net_getsockopt - get value of option on socket
*/
+/* ARGSUSED */
static int
iscsi_net_getsockopt(void *socket, int level, int option_name,
void *option_val, int *option_len, int flags)
{
- return (sogetsockopt((struct sonode *)socket, level,
- option_name, option_val, (socklen_t *)option_len,
- flags));
+ ksocket_t ks = (ksocket_t)socket;
+ return (ksocket_getsockopt(ks, level, option_name, option_val,
+ option_len, CRED()));
}
/*
@@ -448,8 +397,9 @@ static int
iscsi_net_setsockopt(void *socket, int level, int option_name,
void *option_val, int option_len)
{
- return (sosetsockopt((struct sonode *)socket, level,
- option_name, option_val, option_len));
+ ksocket_t ks = (ksocket_t)socket;
+ return (ksocket_setsockopt(ks, level, option_name, option_val,
+ option_len, CRED()));
}
/*
@@ -458,7 +408,8 @@ iscsi_net_setsockopt(void *socket, int level, int option_name,
static int
iscsi_net_shutdown(void *socket, int how)
{
- return (soshutdown((struct sonode *)socket, how));
+ ksocket_t ks = (ksocket_t)socket;
+ return (ksocket_shutdown(ks, how, CRED()));
}
/*
@@ -467,26 +418,32 @@ iscsi_net_shutdown(void *socket, int how)
static void
iscsi_net_close(void *socket)
{
- vnode_t *vp = SOTOV((struct sonode *)socket);
- (void) soshutdown((struct sonode *)socket, 2);
- (void) VOP_CLOSE(vp, 0, 1, 0, kcred, NULL);
- VN_RELE(vp);
+ ksocket_t ks = (ksocket_t)socket;
+ (void) ksocket_close(ks, CRED());
}
/*
* iscsi_net_poll - poll socket for data
*/
+/* ARGSUSED */
static size_t
iscsi_net_poll(void *socket, clock_t timeout)
{
int pflag;
- uchar_t pri;
- rval_t rval;
+ char msg[64];
+ size_t recv = 0;
+ struct timeval tl;
+ ksocket_t ks = (ksocket_t)socket;
+ /* timeout is millisecond */
+ tl.tv_sec = timeout / 1000;
+ tl.tv_usec = (timeout % 1000) * 1000;
+
+ (void) ksocket_setsockopt(ks, SOL_SOCKET, SO_RCVTIMEO, &tl,
+ sizeof (struct timeval), CRED());
- pri = 0;
pflag = MSG_ANY;
- return (kstrgetmsg(SOTOV((struct sonode *)socket), NULL, NULL,
- &pri, &pflag, timeout, &rval));
+ bzero(msg, sizeof (msg));
+ return (ksocket_recv(ks, msg, sizeof (msg), pflag, &recv, CRED()));
}
/*
@@ -496,24 +453,12 @@ iscsi_net_poll(void *socket, clock_t timeout)
static size_t
iscsi_net_sendmsg(void *socket, struct msghdr *msg)
{
- int i = 0;
- int total_len = 0;
- struct uio uio;
-
- /* Initialization of the uio structure. */
- bzero(&uio, sizeof (uio));
- uio.uio_iov = msg->msg_iov;
- uio.uio_iovcnt = msg->msg_iovlen;
- uio.uio_segflg = UIO_SYSSPACE;
-
- for (i = 0; i < msg->msg_iovlen; i++) {
- total_len += (msg->msg_iov)[i].iov_len;
- }
- uio.uio_resid = total_len;
-
- (void) sosendmsg((struct sonode *)socket, msg, &uio);
- DTRACE_PROBE2(sosendmsg, size_t, total_len, size_t, uio.uio_resid);
- return (total_len - uio.uio_resid);
+ ksocket_t ks = (ksocket_t)socket;
+ size_t sent = 0;
+ int flag = msg->msg_flags;
+ (void) ksocket_sendmsg(ks, msg, flag, &sent, CRED());
+ DTRACE_PROBE1(ksocket_sendmsg, size_t, sent);
+ return (sent);
}
/*
@@ -523,80 +468,25 @@ iscsi_net_sendmsg(void *socket, struct msghdr *msg)
static size_t
iscsi_net_recvmsg(void *socket, struct msghdr *msg, int timeout)
{
- int idx;
- int total_len = 0;
- struct uio uio;
- uchar_t pri = 0;
- int prflag = MSG_ANY;
- rval_t rval;
- struct sonode *sonode = (struct sonode *)socket;
-
- /* Initialization of the uio structure. */
- bzero(&uio, sizeof (uio));
- uio.uio_iov = msg->msg_iov;
- uio.uio_iovcnt = msg->msg_iovlen;
- uio.uio_segflg = UIO_SYSSPACE;
-
- for (idx = 0; idx < msg->msg_iovlen; idx++) {
- total_len += (msg->msg_iov)[idx].iov_len;
- }
- uio.uio_resid = total_len;
-
- /* If timeout requested on receive */
- if (timeout > 0) {
- boolean_t loopback = B_FALSE;
-
- /* And this isn't a loopback connection */
- if (sonode->so_laddr.soa_sa->sa_family == AF_INET) {
- struct sockaddr_in *lin =
- (struct sockaddr_in *)sonode->so_laddr.soa_sa;
- struct sockaddr_in *fin =
- (struct sockaddr_in *)sonode->so_faddr.soa_sa;
-
- if ((lin->sin_family == fin->sin_family) &&
- (bcmp(&lin->sin_addr, &fin->sin_addr,
- sizeof (struct in_addr)) == 0)) {
- loopback = B_TRUE;
- }
- } else {
- struct sockaddr_in6 *lin6 =
- (struct sockaddr_in6 *)sonode->so_laddr.soa_sa;
- struct sockaddr_in6 *fin6 =
- (struct sockaddr_in6 *)sonode->so_faddr.soa_sa;
-
- if ((lin6->sin6_family == fin6->sin6_family) &&
- (bcmp(&lin6->sin6_addr, &fin6->sin6_addr,
- sizeof (struct in6_addr)) == 0)) {
- loopback = B_TRUE;
- }
- }
-
- if (loopback == B_FALSE) {
- /*
- * Then poll device for up to the timeout
- * period or the requested data is received.
- */
- if (kstrgetmsg(SOTOV(sonode),
- NULL, NULL, &pri, &prflag, timeout * 1000,
- &rval) == ETIME) {
- return (0);
- }
- }
- }
-
+ int prflag = msg->msg_flags;
+ ksocket_t ks = (ksocket_t)socket;
+ size_t recv = 0;
+ struct timeval tl;
+
+ tl.tv_sec = timeout;
+ tl.tv_usec = 0;
+
+ /* Set recv timeout */
+ if (ksocket_setsockopt(ks, SOL_SOCKET, SO_RCVTIMEO, &tl,
+ sizeof (struct timeval), CRED()))
+ return (0);
/*
* Receive the requested data. Block until all
- * data is received.
- *
- * resid occurs only when the connection is
- * disconnected. In that case it will return
- * the amount of data that was not received.
- * In general this is the total amount we
- * requested.
+ * data is received or timeout.
*/
- (void) sorecvmsg((struct sonode *)socket, msg, &uio);
- DTRACE_PROBE2(sorecvmsg, size_t, total_len, size_t, uio.uio_resid);
- return (total_len - uio.uio_resid);
+ ksocket_recvmsg(ks, msg, prflag, &recv, CRED());
+ DTRACE_PROBE1(ksocket_recvmsg, size_t, recv);
+ return (recv);
}
/*
@@ -701,7 +591,7 @@ iscsi_net_sendpdu(void *socket, iscsi_hdr_t *ihp, char *data, int flags)
msg.msg_flags = MSG_WAITALL;
msg.msg_iovlen = iovlen;
- send_len = iscsi_net->sendmsg((struct sonode *)socket, &msg);
+ send_len = iscsi_net->sendmsg(socket, &msg);
DTRACE_PROBE2(sendmsg, size_t, total_len, size_t, send_len);
if (total_len != send_len) {
return (ISCSI_STATUS_TCP_TX_ERROR);
@@ -873,7 +763,6 @@ iscsi_net_recvdata(void *socket, iscsi_hdr_t *ihp, char *data,
}
if (dlength) {
-
/* calculate pad */
pad_len = ((ISCSI_PAD_WORD_LEN -
(dlength & (ISCSI_PAD_WORD_LEN - 1))) &
@@ -1067,83 +956,3 @@ iscsi_net_interface()
return (ISCSI_STATUS_SUCCESS);
}
}
-
-/*
- * vp is needed to create the socket for the time being.
- */
-static int
-iscsi_ldi_vp_from_name(char *path, vnode_t **vpp)
-{
- vnode_t *vp = NULL;
- int ret;
-
- /* sanity check required input parameters */
- if ((path == NULL) || (vpp == NULL))
- return (EINVAL);
-
- if (modrootloaded) {
- cred_t *saved_cred = curthread->t_cred;
-
- /* we don't want lookupname to fail because of credentials */
- curthread->t_cred = kcred;
-
- /*
- * all lookups should be done in the global zone. but
- * lookupnameat() won't actually do this if an absolute
- * path is passed in. since the ldi interfaces require an
- * absolute path we pass lookupnameat() a pointer to
- * the character after the leading '/' and tell it to
- * start searching at the current system root directory.
- */
- ASSERT(*path == '/');
- ret = lookupnameat(path + 1, UIO_SYSSPACE, FOLLOW, NULLVPP,
- &vp, rootdir);
-
- /* restore this threads credentials */
- curthread->t_cred = saved_cred;
-
- if (ret == 0) {
- if (!vn_matchops(vp, spec_getvnodeops()) ||
- !VTYP_VALID(vp->v_type)) {
- VN_RELE(vp);
- return (ENXIO);
- }
- }
- }
-
- if (vp == NULL) {
- dev_info_t *dip;
- dev_t dev;
- int spec_type;
-
- /*
- * Root is not mounted, the minor node is not specified,
- * or an OBP path has been specified.
- */
-
- /*
- * Determine if path can be pruned to produce an
- * OBP or devfs path for resolve_pathname.
- */
- if (strncmp(path, "/devices/", 9) == 0)
- path += strlen("/devices");
-
- /*
- * if no minor node was specified the DEFAULT minor node
- * will be returned. if there is no DEFAULT minor node
- * one will be fabricated of type S_IFCHR with the minor
- * number equal to the instance number.
- */
- ret = resolve_pathname(path, &dip, &dev, &spec_type);
- if (ret != 0)
- return (ENODEV);
-
- ASSERT(STYP_VALID(spec_type));
- vp = makespecvp(dev, STYP_TO_VTYP(spec_type));
- spec_assoc_vp_with_devi(vp, dip);
- ddi_release_devi(dip);
- }
-
- *vpp = vp;
- return (0);
-}
diff --git a/usr/src/uts/common/io/scsi/adapters/iscsi/isns_client.c b/usr/src/uts/common/io/scsi/adapters/iscsi/isns_client.c
index fd5d226e0f..5ed6acdc2b 100644
--- a/usr/src/uts/common/io/scsi/adapters/iscsi/isns_client.c
+++ b/usr/src/uts/common/io/scsi/adapters/iscsi/isns_client.c
@@ -1518,7 +1518,11 @@ void
struct sockaddr_in6 s_in6;
} sa_rsvr = { 0 };
void *so;
+ struct sockaddr_in6 t_addr;
+ socklen_t t_addrlen;
+ bzero(&t_addr, sizeof (struct sockaddr_in6));
+ t_addrlen = sizeof (struct sockaddr_in6);
if (isns_server_addr->a_addr.i_insize == sizeof (struct in_addr)) {
/* IPv4 */
sa_rsvr.s_in4.sin_family = AF_INET;
@@ -1555,7 +1559,8 @@ void
return (NULL);
}
- (void) iscsi_net->getsockname(so);
+ (void) iscsi_net->getsockname(so, (struct sockaddr *)&t_addr,
+ &t_addrlen);
return (so);
}
@@ -2961,6 +2966,8 @@ isns_service_esi_scn(iscsi_thread_t *thread, void *arg)
isns_pdu_t *in_pdu;
size_t bytes_received, in_pdu_size = 0;
uint8_t *lhba_handle;
+ struct sockaddr_in6 t_addr;
+ socklen_t t_addrlen;
union {
struct sockaddr sin;
struct sockaddr_in s_in4;
@@ -2978,12 +2985,13 @@ isns_service_esi_scn(iscsi_thread_t *thread, void *arg)
/* Done using the argument - free it */
kmem_free(larg, sizeof (*larg));
+ bzero(&t_addr, sizeof (struct sockaddr_in6));
+ t_addrlen = sizeof (struct sockaddr_in6);
- if (((struct sonode *)listening_so)->so_laddr.soa_len <=
- sizeof (local_conn_prop)) {
- bcopy(((struct sonode *)listening_so)->so_laddr.soa_sa,
- &local_conn_prop,
- ((struct sonode *)listening_so)->so_laddr.soa_len);
+ (void) iscsi_net->getsockname(listening_so,
+ (struct sockaddr *)&t_addr, &t_addrlen);
+ if (t_addrlen <= sizeof (local_conn_prop)) {
+ bcopy(&t_addr, &local_conn_prop, t_addrlen);
}
if (iscsi_net->listen(listening_so, 5) < 0) {
@@ -2999,8 +3007,7 @@ isns_service_esi_scn(iscsi_thread_t *thread, void *arg)
/* Blocking call */
connecting_so = iscsi_net->accept(
- (struct sonode *)listening_so,
- &clnt_addr.sin, &clnt_len);
+ listening_so, &clnt_addr.sin, &clnt_len);
mutex_enter(&esi_scn_thr_mutex);
if (esi_scn_thr_to_shutdown == B_TRUE) {
@@ -3092,10 +3099,14 @@ find_local_portal(iscsi_addr_t *isns_server_addr,
struct sockaddr_in6 s_in6;
} serv_addr = { 0 };
void *so;
+ struct sockaddr_in6 t_addr;
+ socklen_t t_addrlen;
*local_addr = NULL;
*listening_so = NULL;
+ bzero(&t_addr, sizeof (struct sockaddr_in6));
+ t_addrlen = sizeof (struct sockaddr_in6);
/*
* Determine the local IP address.
*/
@@ -3104,16 +3115,14 @@ find_local_portal(iscsi_addr_t *isns_server_addr,
return (B_FALSE);
}
- if (((struct sonode *)so)->so_laddr.soa_len >
- sizeof (local_conn_prop)) {
+ iscsi_net->getsockname(so, (struct sockaddr *)&t_addr, &t_addrlen);
+ if (t_addrlen > sizeof (local_conn_prop)) {
iscsi_net->close(so);
return (B_FALSE);
}
- bcopy(((struct sonode *)so)->so_laddr.soa_sa,
- &local_conn_prop,
- ((struct sonode *)so)->so_laddr.soa_len);
-
+ bcopy(&t_addr, &local_conn_prop, t_addrlen);
+ t_addrlen = sizeof (struct sockaddr_in6);
if (local_conn_prop.soa4.sin_family == AF_INET) {
*local_addr = (iscsi_addr_t *)kmem_zalloc(sizeof (iscsi_addr_t),
KM_SLEEP);
@@ -3160,11 +3169,10 @@ find_local_portal(iscsi_addr_t *isns_server_addr,
return (B_FALSE);
}
- if (((struct sonode *)so)->so_laddr.soa_len <=
- sizeof (local_conn_prop)) {
- bcopy(((struct sonode *)so)->so_laddr.soa_sa,
- &local_conn_prop,
- ((struct sonode *)so)->so_laddr.soa_len);
+ (void) iscsi_net->getsockname(so, (struct sockaddr *)&t_addr,
+ &t_addrlen);
+ if (t_addrlen <= sizeof (local_conn_prop)) {
+ bcopy(&t_addr, &local_conn_prop, t_addrlen);
(*local_addr)->a_port = ntohs(local_conn_prop.soa4.sin_port);
} else {
(*local_addr)->a_port = ISNS_DEFAULT_ESI_SCN_PORT;
diff --git a/usr/src/uts/common/io/sock_conf.c b/usr/src/uts/common/io/sock_conf.c
new file mode 100644
index 0000000000..b6d31de8ea
--- /dev/null
+++ b/usr/src/uts/common/io/sock_conf.c
@@ -0,0 +1,251 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/sysmacros.h>
+#include <sys/atomic.h>
+#include <sys/strsubr.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/cmn_err.h>
+#include <sys/modctl.h>
+#include <sys/sdt.h>
+
+list_t smod_list;
+kmutex_t smod_list_lock;
+
+so_create_func_t sock_comm_create_function;
+so_destroy_func_t sock_comm_destroy_function;
+
+static smod_info_t *smod_create(const char *);
+static void smod_destroy(smod_info_t *);
+
+extern void smod_add(smod_info_t *);
+
+/*
+ * Set up the global socket module list and the mutex that is held
+ * around accesses to it.
+ */
+void
+smod_init(void)
+{
+	mutex_init(&smod_list_lock, NULL, MUTEX_DEFAULT, NULL);
+	list_create(&smod_list, sizeof (smod_info_t),
+	    offsetof(smod_info_t, smod_node));
+}
+
+/*
+ * Walk smod_list looking for the first entry whose name equals modname.
+ * Returns that entry, or NULL when the list holds no such module.
+ * The caller must hold smod_list_lock.
+ */
+static smod_info_t *
+smod_find(const char *modname)
+{
+	smod_info_t	*cur;
+
+	ASSERT(MUTEX_HELD(&smod_list_lock));
+
+	cur = list_head(&smod_list);
+	while (cur != NULL && strcmp(cur->smod_name, modname) != 0)
+		cur = list_next(&smod_list, cur);
+	return (cur);
+}
+
+/*
+ * Register the socket module.
+ *
+ * Validates the version numbers in reg, builds a smod_info_t for the
+ * module and links it onto smod_list.
+ *
+ * Returns 0 on success, EINVAL on a version mismatch or when the
+ * registration info is inconsistent for the kind of module, and (DEBUG
+ * kernels only) EEXIST when a module of the same name is already listed.
+ */
+int
+smod_register(const smod_reg_t *reg)
+{
+	smod_info_t	*smodp;
+
+	/*
+	 * Make sure the socket module does not depend on capabilities
+	 * not available on the system.
+	 */
+	if (reg->smod_version != SOCKMOD_VERSION ||
+	    reg->smod_dc_version != SOCK_DC_VERSION ||
+	    reg->smod_uc_version != SOCK_UC_VERSION) {
+		cmn_err(CE_WARN,
+		    "Failed to register socket module %s: version mismatch",
+		    reg->smod_name);
+		return (EINVAL);
+	}
+
+#ifdef DEBUG
+	/*
+	 * NOTE(review): the duplicate-name check is compiled only into
+	 * DEBUG kernels, and the lock is dropped again before smod_add()
+	 * runs, so it does not by itself exclude a concurrent registration
+	 * of the same name -- confirm that callers serialize registration.
+	 */
+	mutex_enter(&smod_list_lock);
+	if ((smodp = smod_find(reg->smod_name)) != NULL) {
+		mutex_exit(&smod_list_lock);
+		return (EEXIST);
+	}
+	mutex_exit(&smod_list_lock);
+#endif
+
+	smodp = smod_create(reg->smod_name);
+	smodp->smod_version = reg->smod_version;
+	if (strcmp(smodp->smod_name, SOTPI_SMOD_NAME) == 0 ||
+	    strcmp(smodp->smod_name, "socksctp") == 0 ||
+	    strcmp(smodp->smod_name, "socksdp") == 0) {
+		/*
+		 * These modules bring their own socket create/destroy
+		 * functions through __smod_priv and have no protocol
+		 * create function.
+		 */
+		ASSERT(smodp->smod_proto_create_func == NULL);
+		ASSERT(reg->__smod_priv != NULL);
+		smodp->smod_sock_create_func =
+		    reg->__smod_priv->smodp_sock_create_func;
+		smodp->smod_sock_destroy_func =
+		    reg->__smod_priv->smodp_sock_destroy_func;
+		smodp->smod_proto_create_func = NULL;
+	} else {
+		/*
+		 * Every other module must supply a protocol create function
+		 * and must not supply socket create/destroy functions of its
+		 * own; it is given the common ones instead.
+		 */
+		if (reg->smod_proto_create_func == NULL ||
+		    (reg->__smod_priv != NULL &&
+		    (reg->__smod_priv->smodp_sock_create_func != NULL ||
+		    reg->__smod_priv->smodp_sock_destroy_func != NULL))) {
+#ifdef DEBUG
+			/* CE_CONT does not append a newline for us. */
+			cmn_err(CE_CONT, "smod_register of %s failed\n",
+			    smodp->smod_name);
+#endif
+			smod_destroy(smodp);
+			return (EINVAL);
+		}
+		smodp->smod_proto_create_func = reg->smod_proto_create_func;
+		smodp->smod_sock_create_func = sock_comm_create_function;
+		smodp->smod_sock_destroy_func = sock_comm_destroy_function;
+		smodp->smod_uc_version = reg->smod_uc_version;
+		smodp->smod_dc_version = reg->smod_dc_version;
+		if (reg->__smod_priv != NULL) {
+			smodp->smod_proto_fallback_func =
+			    reg->__smod_priv->smodp_proto_fallback_func;
+		}
+	}
+	smod_add(smodp);
+	return (0);
+}
+
+/*
+ * Unregister the socket module.
+ *
+ * Returns ENXIO when no module of that name is registered, EBUSY when
+ * the entry still has references, and 0 once the entry has been taken
+ * off the socket module list and freed.
+ */
+int
+smod_unregister(const char *mod_name)
+{
+	smod_info_t	*entry;
+	int		rv = 0;
+
+	mutex_enter(&smod_list_lock);
+	entry = smod_find(mod_name);
+	if (entry == NULL)
+		rv = ENXIO;
+	else if (entry->smod_refcnt != 0)
+		rv = EBUSY;
+	else
+		list_remove(&smod_list, entry);
+	mutex_exit(&smod_list_lock);
+
+	/* The entry is off the list by now; free it without the lock. */
+	if (rv == 0)
+		smod_destroy(entry);
+	return (rv);
+}
+
+/*
+ * Initialize the socket module entry.
+ *
+ * Allocates a zeroed smod_info_t and gives it a private copy of modname.
+ * Both allocations are made with KM_SLEEP and the result is returned
+ * without a NULL check.
+ */
+static smod_info_t *
+smod_create(const char *modname)
+{
+	smod_info_t	*smodp;
+	size_t		len;	/* strlen() yields a size_t, not an int */
+
+	smodp = kmem_zalloc(sizeof (*smodp), KM_SLEEP);
+	/* len counts the terminating NUL, so bcopy() copies it as well. */
+	len = strlen(modname) + 1;
+	smodp->smod_name = kmem_alloc(len, KM_SLEEP);
+	bcopy(modname, smodp->smod_name, len);
+	return (smodp);
+}
+
+/*
+ * Free a socket module entry together with its name string.
+ * The entry must be unreferenced, must not be linked on a list, and
+ * must not be the "socktpi" module; all of this is ASSERTed.
+ */
+static void
+smod_destroy(smod_info_t *smodp)
+{
+	char	*name = smodp->smod_name;
+
+	ASSERT(name != NULL);
+	ASSERT(smodp->smod_refcnt == 0);
+	ASSERT(!list_link_active(&smodp->smod_node));
+	ASSERT(strcmp(name, "socktpi") != 0);
+
+	/* Scrub the entry's pointers before the memory is released. */
+	smodp->smod_sock_destroy_func = NULL;
+	smodp->smod_sock_create_func = NULL;
+	smodp->smod_proto_create_func = NULL;
+	smodp->smod_name = NULL;
+
+	kmem_free(name, strlen(name) + 1);
+	kmem_free(smodp, sizeof (*smodp));
+}
+
+/*
+ * Add an entry at the front of the socket module list.
+ * smod_list_lock is acquired here for the duration of the insert and
+ * released again before returning.
+ */
+void
+smod_add(smod_info_t *smodp)
+{
+ ASSERT(smodp != NULL);
+ mutex_enter(&smod_list_lock);
+ list_insert_head(&smod_list, smodp);
+ mutex_exit(&smod_list_lock);
+}
+
+/*
+ * Lookup the socket module table by the socket module name.
+ *
+ * If there is an existing entry, increase its reference count and
+ * return it.  Otherwise load the module with modload(); the module's
+ * register function creates a new entry and adds it to the front of the
+ * socket module list (see smod_add()), and the lookup is then retried.
+ *
+ * Returns NULL if modload() fails.
+ */
+smod_info_t *
+smod_lookup_byname(const char *modname)
+{
+	smod_info_t	*smodp;
+	int		error;
+
+again:
+	/*
+	 * If an entry is found, increase the reference count and
+	 * return the entry pointer.
+	 */
+	mutex_enter(&smod_list_lock);
+	if ((smodp = smod_find(modname)) != NULL) {
+		SMOD_INC_REF(smodp);
+		mutex_exit(&smod_list_lock);
+		return (smodp);
+	}
+	mutex_exit(&smod_list_lock);
+
+	/*
+	 * We have a sockmod, and it is not loaded.
+	 * Load the module into the kernel, modload() will
+	 * take care of the multiple threads.
+	 */
+	DTRACE_PROBE1(load__socket__module, char *, modname);
+	error = modload(SOCKMOD_PATH, modname);
+	if (error == -1) {
+		/* CE_CONT does not append a newline for us. */
+		cmn_err(CE_CONT, "modload of %s/%s failed\n",
+		    SOCKMOD_PATH, modname);
+		return (NULL);
+	}
+	goto again;
+}
diff --git a/usr/src/uts/common/io/strplumb.c b/usr/src/uts/common/io/strplumb.c
index 27b9cc8843..33406bea05 100644
--- a/usr/src/uts/common/io/strplumb.c
+++ b/usr/src/uts/common/io/strplumb.c
@@ -62,6 +62,7 @@
#include <inet/ip6.h>
#include <inet/tcp.h>
#include <inet/sctp_ip.h>
+#include <inet/udp_impl.h>
#include <sys/strlog.h>
#include <sys/log.h>
diff --git a/usr/src/uts/common/netinet/icmp6.h b/usr/src/uts/common/netinet/icmp6.h
index 2d8903d6f1..560b825595 100644
--- a/usr/src/uts/common/netinet/icmp6.h
+++ b/usr/src/uts/common/netinet/icmp6.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,15 +19,13 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _NETINET_ICMP6_H
#define _NETINET_ICMP6_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -481,6 +478,7 @@ typedef struct icmp6_filter {
#define ICMP6_FILTER_WILLBLOCK(type, filterp) \
((((filterp)->__icmp6_filt[(type) >> 5]) & (1 << ((type) & 31))) == 0)
+#define ICMP_IOC_DEFAULT_Q (('I' << 8) + 51)
#ifdef __cplusplus
}
diff --git a/usr/src/uts/common/os/fio.c b/usr/src/uts/common/os/fio.c
index 13f592993a..d78f4bbdb0 100644
--- a/usr/src/uts/common/os/fio.c
+++ b/usr/src/uts/common/os/fio.c
@@ -23,12 +23,10 @@
/* All Rights Reserved */
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/param.h>
@@ -1167,11 +1165,8 @@ f_getfl(int fd, int *flagp)
/*
* BSD fcntl() FASYNC compatibility.
- *
- * SCTP doesn't have an associated stream and thus
- * doesn't store flags on it.
*/
- if ((vp->v_type == VSOCK) && (vp->v_stream != NULL))
+ if (vp->v_type == VSOCK)
flag |= sock_getfasync(vp);
*flagp = flag;
error = 0;
diff --git a/usr/src/uts/common/os/modconf.c b/usr/src/uts/common/os/modconf.c
index 7c41975c48..cf25d86183 100644
--- a/usr/src/uts/common/os/modconf.c
+++ b/usr/src/uts/common/os/modconf.c
@@ -23,8 +23,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/param.h>
@@ -59,6 +57,7 @@
#include <sys/cpc_pcbe.h>
#include <sys/kstat.h>
#include <sys/fs/sdev_node.h>
+#include <sys/socketvar.h>
#include <sys/kiconv.h>
extern int moddebug;
@@ -186,6 +185,17 @@ struct mod_ops mod_strmodops = {
};
/*
+ * Socket modules.
+ */
+static int mod_infosockmod(struct modlsockmod *, struct modlinkage *, int *);
+static int mod_installsockmod(struct modlsockmod *, struct modlinkage *);
+static int mod_removesockmod(struct modlsockmod *, struct modlinkage *);
+
+struct mod_ops mod_sockmodops = {
+ mod_installsockmod, mod_removesockmod, mod_infosockmod
+};
+
+/*
* Scheduling classes.
*/
static int mod_infosched(struct modlsched *, struct modlinkage *, int *);
@@ -1178,6 +1188,59 @@ mod_removestrmod(struct modlstrmod *modl, struct modlinkage *modlp)
}
/*
+ * Get status of a socket module.
+ */
+/*ARGSUSED*/
+static int
+mod_infosockmod(struct modlsockmod *modl, struct modlinkage *modlp, int *p0)
+{
+ /* modl and modlp are not consulted; only *p0 is written. */
+ *p0 = -1; /* no useful info */
+ return (0);
+}
+
+/*
+ * Install a socket module.
+ *
+ * The name in the module's registration info has to equal the module
+ * name recorded in its modctl; if it does not, EINVAL is returned.
+ * Otherwise the result of smod_register() is returned.
+ */
+/*ARGSUSED*/
+static int
+mod_installsockmod(struct modlsockmod *modl, struct modlinkage *modlp)
+{
+	struct modctl	*ctl;
+
+	ctl = mod_getctl(modlp);
+	ASSERT(ctl != NULL);
+
+	if (strcmp(ctl->mod_modname,
+	    modl->sockmod_reg_info->smod_name) == 0) {
+		/*
+		 * Names agree: register module.
+		 */
+		return (smod_register(modl->sockmod_reg_info));
+	}
+
+#ifdef DEBUG
+	cmn_err(CE_CONT, "mod_installsockmod: different names"
+	    " %s != %s \n", ctl->mod_modname,
+	    modl->sockmod_reg_info->smod_name);
+#endif
+	return (EINVAL);
+}
+
+/*
+ * Remove a socket module.
+ * Returns whatever smod_unregister() returns for the module's
+ * registered name; modlp is not used.
+ */
+/*ARGSUSED*/
+static int
+mod_removesockmod(struct modlsockmod *modl, struct modlinkage *modlp)
+{
+ /*
+ * Unregister from the global socket module list.  The reference
+ * count check is done inside smod_unregister(), which refuses with
+ * EBUSY while the entry is still referenced.
+ */
+ return (smod_unregister(modl->sockmod_reg_info->smod_name));
+}
+
+/*
* Get status of a scheduling class module.
*/
/*ARGSUSED1*/
diff --git a/usr/src/uts/common/os/move.c b/usr/src/uts/common/os/move.c
index 7e1c2f2d62..d4a127794f 100644
--- a/usr/src/uts/common/os/move.c
+++ b/usr/src/uts/common/os/move.c
@@ -558,8 +558,6 @@ uioainit(uio_t *uiop, uioa_t *uioap)
uioap->uioa_mbytes = 0;
- uioap->uioa_mbytes = 0;
-
/* uio_t/uioa_t uio_t common struct copy */
*((uio_t *)uioap) = *uiop;
diff --git a/usr/src/uts/common/os/streamio.c b/usr/src/uts/common/os/streamio.c
index 236626a4f0..42d0b8e17c 100644
--- a/usr/src/uts/common/os/streamio.c
+++ b/usr/src/uts/common/os/streamio.c
@@ -77,6 +77,7 @@
#include <sys/policy.h>
#include <sys/dld.h>
#include <sys/zone.h>
+#include <sys/sodirect.h>
/*
* This define helps improve the readability of streams code while
@@ -1110,50 +1111,7 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first,
}
bp = getq_noenab(q, rbytes);
- if (bp != NULL && (bp->b_datap->db_flags & DBLK_UIOA)) {
- /*
- * A uioa flaged mblk_t chain, already uio processed,
- * add it to the sodirect uioa pending free list.
- *
- * Note, a b_cont chain headed by a DBLK_UIOA enable
- * mblk_t must have all mblk_t(s) DBLK_UIOA enabled.
- */
- mblk_t *bpt = sodp->sod_uioaft;
-
- ASSERT(sodp != NULL);
- ASSERT(msgdsize(bp) == sodp->sod_uioa.uioa_mbytes);
-
- /*
- * Add first mblk_t of "bp" chain to current sodirect uioa
- * free list tail mblk_t, if any, else empty list so new head.
- */
- if (bpt == NULL)
- sodp->sod_uioafh = bp;
- else
- bpt->b_cont = bp;
-
- /*
- * Walk mblk_t "bp" chain to find tail and adjust rptr of
- * each to reflect that uioamove() has consumed all data.
- */
- bpt = bp;
- for (;;) {
- bpt->b_rptr = bpt->b_wptr;
- if (bpt->b_cont == NULL)
- break;
- bpt = bpt->b_cont;
-
- ASSERT(bpt->b_datap->db_flags & DBLK_UIOA);
- }
- /* New sodirect uioa free list tail */
- sodp->sod_uioaft = bpt;
-
- /* Only 1 strget() with data returned per uioa_t */
- if (sodp->sod_uioa.uioa_state & UIOA_ENABLED) {
- sodp->sod_uioa.uioa_state &= UIOA_CLR;
- sodp->sod_uioa.uioa_state |= UIOA_FINI;
- }
- }
+ sod_uioa_mblk_done(sodp, bp);
return (bp);
}
diff --git a/usr/src/uts/common/os/strsubr.c b/usr/src/uts/common/os/strsubr.c
index 442ced2b51..469ef329db 100644
--- a/usr/src/uts/common/os/strsubr.c
+++ b/usr/src/uts/common/os/strsubr.c
@@ -286,7 +286,6 @@ static void outer_insert(syncq_t *, syncq_t *);
static void outer_remove(syncq_t *, syncq_t *);
static void write_now(syncq_t *);
static void clr_qfull(queue_t *);
-static void enable_svc(queue_t *);
static void runbufcalls(void);
static void sqenable(syncq_t *);
static void sqfill_events(syncq_t *, queue_t *, mblk_t *, void (*)());
@@ -8401,6 +8400,21 @@ mblk_setcred(mblk_t *mp, cred_t *cr)
}
}
+/*
+ * Stamp every mblk on the b_cont chain that starts at mp with the
+ * credential cr and the process id pid.  The message is assumed not to
+ * carry a cred yet; this is ASSERTed for each mblk.
+ */
+void
+msg_setcredpid(mblk_t *mp, cred_t *cr, pid_t pid)
+{
+	mblk_t	*bp;
+
+	for (bp = mp; bp != NULL; bp = bp->b_cont) {
+		ASSERT(DB_CRED(bp) == NULL);
+		mblk_setcred(bp, cr);
+		DB_CPID(bp) = pid;
+	}
+}
+
+
int
hcksum_assoc(mblk_t *mp, multidata_t *mmd, pdesc_t *pd,
uint32_t start, uint32_t stuff, uint32_t end, uint32_t value,
diff --git a/usr/src/uts/common/smbsrv/smb_kproto.h b/usr/src/uts/common/smbsrv/smb_kproto.h
index 2131c88e19..b14005074a 100644
--- a/usr/src/uts/common/smbsrv/smb_kproto.h
+++ b/usr/src/uts/common/smbsrv/smb_kproto.h
@@ -38,6 +38,7 @@ extern "C" {
#include <sys/socket.h>
#include <sys/strsubr.h>
#include <sys/socketvar.h>
+#include <sys/ksocket.h>
#include <sys/cred.h>
#include <smbsrv/smb_vops.h>
#include <smbsrv/smb_xdr.h>
@@ -307,19 +308,17 @@ uint32_t smb_decode_sd(struct smb_xa *, smb_sd_t *);
/*
* Socket functions
*/
-struct sonode *smb_socreate(int domain, int type, int protocol);
-void smb_soshutdown(struct sonode *so);
-void smb_sodestroy(struct sonode *so);
-int smb_sorecv(struct sonode *so, void *msg, size_t len);
-int smb_iov_sorecv(struct sonode *so, iovec_t *iop, int iovlen,
- size_t total_len);
+ksocket_t smb_socreate(int domain, int type, int protocol);
+void smb_soshutdown(ksocket_t so);
+void smb_sodestroy(ksocket_t so);
+int smb_sorecv(ksocket_t so, void *msg, size_t len);
int smb_net_init(void);
void smb_net_fini(void);
void smb_net_txl_constructor(smb_txlst_t *);
void smb_net_txl_destructor(smb_txlst_t *);
smb_txreq_t *smb_net_txr_alloc(void);
void smb_net_txr_free(smb_txreq_t *);
-int smb_net_txr_send(struct sonode *, smb_txlst_t *, smb_txreq_t *);
+int smb_net_txr_send(ksocket_t, smb_txlst_t *, smb_txreq_t *);
/*
* SMB RPC interface
@@ -489,7 +488,7 @@ void smb_request_cancel(smb_request_t *sr);
/*
* session functions (file smb_session.c)
*/
-smb_session_t *smb_session_create(struct sonode *, uint16_t, smb_server_t *);
+smb_session_t *smb_session_create(ksocket_t, uint16_t, smb_server_t *);
int smb_session_daemon(smb_session_list_t *);
void smb_session_reconnection_check(smb_session_list_t *, smb_session_t *);
void smb_session_timers(smb_session_list_t *);
diff --git a/usr/src/uts/common/smbsrv/smb_ktypes.h b/usr/src/uts/common/smbsrv/smb_ktypes.h
index 13f5783116..918746a701 100644
--- a/usr/src/uts/common/smbsrv/smb_ktypes.h
+++ b/usr/src/uts/common/smbsrv/smb_ktypes.h
@@ -46,6 +46,8 @@ extern "C" {
#include <sys/stat.h>
#include <sys/vnode.h>
#include <sys/cred.h>
+#include <netinet/in.h>
+#include <sys/ksocket.h>
#include <sys/fem.h>
#include <sys/door.h>
#include <smbsrv/smb.h>
@@ -683,7 +685,7 @@ typedef struct smb_session {
uint32_t capabilities;
struct smb_sign signing;
- struct sonode *sock;
+ ksocket_t sock;
smb_slist_t s_req_list;
smb_llist_t s_xa_list;
@@ -1453,7 +1455,7 @@ typedef struct {
typedef struct {
kthread_t *ld_kth;
kt_did_t ld_ktdid;
- struct sonode *ld_so;
+ ksocket_t ld_so;
struct sockaddr_in ld_sin;
smb_session_list_t ld_session_list;
} smb_listener_daemon_t;
diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile
index cecccf50ab..451ce87f1f 100644
--- a/usr/src/uts/common/sys/Makefile
+++ b/usr/src/uts/common/sys/Makefile
@@ -329,6 +329,7 @@ CHKHDRS= \
kmem_impl.h \
kobj.h \
kobj_impl.h \
+ ksocket.h \
kstat.h \
kstr.h \
ksyms.h \
@@ -503,6 +504,7 @@ CHKHDRS= \
sobject.h \
socket.h \
socket_impl.h \
+ socket_proto.h \
socketvar.h \
sockio.h \
sodirect.h \
diff --git a/usr/src/uts/common/sys/idm/idm_so.h b/usr/src/uts/common/sys/idm/idm_so.h
index 134896ed4f..42c39c6461 100644
--- a/usr/src/uts/common/sys/idm/idm_so.h
+++ b/usr/src/uts/common/sys/idm/idm_so.h
@@ -31,7 +31,7 @@ extern "C" {
#endif
#include <sys/idm/idm_transport.h>
-
+#include <sys/ksocket.h>
/*
* Define TCP window size (send and receive buffer sizes)
*/
@@ -41,7 +41,7 @@ extern "C" {
/* sockets-specific portion of idm_svc_t */
typedef struct idm_so_svc_s {
- struct sonode *is_so;
+ ksocket_t is_so;
kthread_t *is_thread;
kt_did_t is_thread_did;
boolean_t is_thread_running;
@@ -49,7 +49,7 @@ typedef struct idm_so_svc_s {
/* sockets-specific portion of idm_conn_t */
typedef struct idm_so_conn_s {
- struct sonode *ic_so;
+ ksocket_t ic_so;
kthread_t *ic_tx_thread;
kt_did_t ic_tx_thread_did;
@@ -68,24 +68,24 @@ void idm_so_fini();
/* Socket functions */
-struct sonode *
+ksocket_t
idm_socreate(int domain, int type, int protocol);
-void idm_soshutdown(struct sonode *so);
+void idm_soshutdown(ksocket_t so);
-void idm_sodestroy(struct sonode *so);
+void idm_sodestroy(ksocket_t so);
int idm_get_ipaddr(idm_addr_list_t **);
-int idm_sorecv(struct sonode *so, void *msg, size_t len);
+int idm_sorecv(ksocket_t so, void *msg, size_t len);
-int idm_sosendto(struct sonode *so, void *buff, size_t len,
+int idm_sosendto(ksocket_t so, void *buff, size_t len,
struct sockaddr *name, socklen_t namelen);
-int idm_iov_sosend(struct sonode *so, iovec_t *iop, int iovlen,
+int idm_iov_sosend(ksocket_t so, iovec_t *iop, int iovlen,
size_t total_len);
-int idm_iov_sorecv(struct sonode *so, iovec_t *iop, int iovlen,
+int idm_iov_sorecv(ksocket_t so, iovec_t *iop, int iovlen,
size_t total_len);
void idm_sotx_thread(void *arg);
diff --git a/usr/src/uts/common/sys/iscsit/radius_packet.h b/usr/src/uts/common/sys/iscsit/radius_packet.h
index bbf96d5cb2..80ee57a202 100644
--- a/usr/src/uts/common/sys/iscsit/radius_packet.h
+++ b/usr/src/uts/common/sys/iscsit/radius_packet.h
@@ -32,7 +32,7 @@ extern "C" {
#include <netinet/in.h>
#include <sys/types.h>
-
+#include <sys/ksocket.h>
#include <sys/iscsit/radius_protocol.h>
/* A total of RAD_RCV_TIMEOUT * RAD_RETRY_MAX seconds timeout. */
@@ -69,7 +69,7 @@ typedef struct radius_packet_data {
*
*/
int
-iscsit_snd_radius_request(void *socket,
+iscsit_snd_radius_request(ksocket_t socket,
iscsi_ipaddr_t rsvr_ip_addr,
uint32_t rsvr_port,
radius_packet_data_t *packet_data);
@@ -85,7 +85,7 @@ iscsit_snd_radius_request(void *socket,
* Return receive status.
*/
int
-iscsit_rcv_radius_response(void *socket,
+iscsit_rcv_radius_response(ksocket_t socket,
uint8_t *shared_secret,
uint32_t shared_secret_len,
uint8_t *req_authenticator,
diff --git a/usr/src/uts/common/sys/ksocket.h b/usr/src/uts/common/sys/ksocket.h
new file mode 100644
index 0000000000..fb834b027f
--- /dev/null
+++ b/usr/src/uts/common/sys/ksocket.h
@@ -0,0 +1,127 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_KSOCKET_H_
+#define _SYS_KSOCKET_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Opaque kernel socket type */
+typedef struct __ksocket *ksocket_t;
+struct nmsghdr;
+
+/* flag bit for each Callback Event */
+#define KSOCKET_CB_CONNECTED 0x00000001
+#define KSOCKET_CB_CONNECTFAILED 0x00000002
+#define KSOCKET_CB_DISCONNECTED 0x00000004
+#define KSOCKET_CB_NEWDATA 0x00000008
+#define KSOCKET_CB_NEWCONN 0x00000010
+#define KSOCKET_CB_CANSEND 0x00000020
+#define KSOCKET_CB_OOBDATA 0x00000040
+#define KSOCKET_CB_CANTSENDMORE 0x00000080
+#define KSOCKET_CB_CANTRECVMORE 0x00000100
+#define KSOCKET_CB_ERROR 0x00000200
+
+/*
+ * Kernel Socket Callback Events
+ *
+ * One of these values is passed as the second argument of a
+ * ksocket_callback_t.  Note that the enumerators are not listed in the
+ * same order as the KSOCKET_CB_* flag bits above (OOBDATA comes before
+ * NEWDATA here), so an event's value is not the bit position of its flag.
+ */
+typedef enum ksocket_event {
+ KSOCKET_EV_CONNECTED,
+ KSOCKET_EV_CONNECTFAILED,
+ KSOCKET_EV_DISCONNECTED,
+ KSOCKET_EV_OOBDATA,
+ KSOCKET_EV_NEWDATA,
+ KSOCKET_EV_NEWCONN,
+ KSOCKET_EV_CANSEND,
+ KSOCKET_EV_CANTSENDMORE,
+ KSOCKET_EV_CANTRECVMORE,
+ KSOCKET_EV_ERROR
+} ksocket_callback_event_t;
+
+typedef void (*ksocket_callback_t)(ksocket_t, ksocket_callback_event_t,
+ void *, uintptr_t);
+
+/*
+ * Callback table handed to ksocket_setcallbacks(): one function pointer
+ * per callback event.
+ * NOTE(review): ksock_cb_flags is presumably a mask of the KSOCKET_CB_*
+ * bits selecting which members are in use -- confirm against the
+ * ksocket_setcallbacks() implementation.
+ */
+typedef struct ksocket_callbacks {
+ uint32_t ksock_cb_flags;
+ ksocket_callback_t ksock_cb_connected;
+ ksocket_callback_t ksock_cb_connectfailed;
+ ksocket_callback_t ksock_cb_disconnected;
+ ksocket_callback_t ksock_cb_newdata;
+ ksocket_callback_t ksock_cb_newconn;
+ ksocket_callback_t ksock_cb_cansend;
+ ksocket_callback_t ksock_cb_oobdata;
+ ksocket_callback_t ksock_cb_cantsendmore;
+ ksocket_callback_t ksock_cb_cantrecvmore;
+ ksocket_callback_t ksock_cb_error;
+} ksocket_callbacks_t;
+
+#define KSOCKET_SLEEP SOCKET_SLEEP
+#define KSOCKET_NOSLEEP SOCKET_NOSLEEP
+
+extern int ksocket_socket(ksocket_t *, int, int, int, int, struct cred *);
+extern int ksocket_bind(ksocket_t, struct sockaddr *, socklen_t,
+ struct cred *);
+extern int ksocket_listen(ksocket_t, int, struct cred *);
+extern int ksocket_accept(ksocket_t, struct sockaddr *, socklen_t *,
+ ksocket_t *, struct cred *);
+extern int ksocket_connect(ksocket_t, const struct sockaddr *, socklen_t,
+ struct cred *);
+extern int ksocket_send(ksocket_t, void *, size_t, int, size_t *,
+ struct cred *);
+extern int ksocket_sendto(ksocket_t, void *, size_t, int,
+ struct sockaddr *, socklen_t, size_t *, struct cred *);
+extern int ksocket_sendmsg(ksocket_t, struct nmsghdr *, int, size_t *,
+ struct cred *);
+extern int ksocket_sendmblk(ksocket_t, struct nmsghdr *, int, mblk_t **,
+ struct cred *);
+extern int ksocket_recv(ksocket_t, void *, size_t, int, size_t *,
+ struct cred *);
+extern int ksocket_recvfrom(ksocket_t, void *, size_t, int,
+ struct sockaddr *, socklen_t *, size_t *, struct cred *);
+extern int ksocket_recvmsg(ksocket_t, struct nmsghdr *, int, size_t *,
+ struct cred *);
+extern int ksocket_shutdown(ksocket_t, int, struct cred *);
+extern int ksocket_setsockopt(ksocket_t, int, int, const void *, int,
+ struct cred *);
+extern int ksocket_getsockopt(ksocket_t, int, int, void *, int *,
+ struct cred *);
+extern int ksocket_getpeername(ksocket_t, struct sockaddr *, socklen_t *,
+ struct cred *);
+extern int ksocket_getsockname(ksocket_t, struct sockaddr *, socklen_t *,
+ struct cred *);
+extern int ksocket_ioctl(ksocket_t, int, intptr_t, int *, struct cred *);
+extern int ksocket_setcallbacks(ksocket_t, ksocket_callbacks_t *, void *,
+ struct cred *);
+extern int ksocket_close(ksocket_t, struct cred *);
+extern void ksocket_hold(ksocket_t);
+extern void ksocket_rele(ksocket_t);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_KSOCKET_H_ */
diff --git a/usr/src/uts/common/sys/modctl.h b/usr/src/uts/common/sys/modctl.h
index 47a83b15d9..ed0811c580 100644
--- a/usr/src/uts/common/sys/modctl.h
+++ b/usr/src/uts/common/sys/modctl.h
@@ -26,8 +26,6 @@
#ifndef _SYS_MODCTL_H
#define _SYS_MODCTL_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* loadable module support.
*/
@@ -73,6 +71,7 @@ extern struct mod_ops mod_miscops;
extern struct mod_ops mod_schedops;
extern struct mod_ops mod_strmodops;
extern struct mod_ops mod_syscallops;
+extern struct mod_ops mod_sockmodops;
#ifdef _SYSCALL32_IMPL
extern struct mod_ops mod_syscallops32;
#endif
@@ -191,6 +190,13 @@ struct modldev {
struct devname_ops *dev_ops;
};
+/* For socket Modules. */
+struct modlsockmod {
+ struct mod_ops *sockmod_modops;
+ char *sockmod_linkinfo;
+ struct smod_reg_s *sockmod_reg_info;
+};
+
/* For kiconv modules */
struct modlkiconv {
struct mod_ops *kiconv_modops;
diff --git a/usr/src/uts/common/sys/socket.h b/usr/src/uts/common/sys/socket.h
index 0432b529be..593505a426 100644
--- a/usr/src/uts/common/sys/socket.h
+++ b/usr/src/uts/common/sys/socket.h
@@ -120,6 +120,15 @@ typedef void *_RESTRICT_KYWD Psocklen_t;
#ifdef _KERNEL
#define SO_SND_COPYAVOID 0x0800 /* Internal: use zero-copy */
+#define SO_SND_BUFINFO 0x1000 /* Internal: get buffer info */
+ /* when doing zero-copy */
+
+struct so_snd_bufinfo {
+ ushort_t sbi_wroff; /* Write offset */
+ ssize_t sbi_maxblk; /* Max size of a single mblk */
+ ssize_t sbi_maxpsz; /* Max total size of a mblk chain */
+ ushort_t sbi_tail; /* Extra space available at the end */
+};
#endif /* _KERNEL */
/*
@@ -143,6 +152,7 @@ typedef void *_RESTRICT_KYWD Psocklen_t;
#define SO_ANON_MLP 0x100a /* create MLP on anonymous bind */
#define SO_MAC_EXEMPT 0x100b /* allow dominated unlabeled peers */
#define SO_DOMAIN 0x100c /* get socket domain */
+#define SO_RCVPSH 0x100d /* receive interval to push data */
/* "Socket"-level control message types: */
#define SCM_RIGHTS 0x1010 /* access rights (array of int) */
@@ -167,6 +177,21 @@ typedef void *_RESTRICT_KYWD Psocklen_t;
*/
#define SO_ACCEPTOR 0x20000 /* acceptor socket */
#define SO_SOCKSTR 0x40000 /* normal socket stream */
+#define SO_FALLBACK 0x80000 /* fallback to TPI socket */
+
+/*
+ * Flags for socket_create() and socket_newconn()
+ */
+#define SOCKET_SLEEP KM_SLEEP
+#define SOCKET_NOSLEEP KM_NOSLEEP
+
+
+/*
+ * flags used by sockfs when falling back to tpi socket
+ */
+#define SO_FB_START 0x1
+#define SO_FB_FINISH 0x2
+
#endif /* _KERNEL */
/*
@@ -340,6 +365,8 @@ struct msghdr32 {
#define MSG_CTRUNC 0x10 /* Control data truncated */
#define MSG_TRUNC 0x20 /* Normal data truncated */
#define MSG_WAITALL 0x40 /* Wait for complete recv or error */
+#define MSG_DUPCTRL 0x800 /* Save control message for use */
+ /* with left over data */
/* End of XPGv2 compliance */
#define MSG_DONTWAIT 0x80 /* Don't block for this recv */
#define MSG_NOTIFICATION 0x100 /* Notification, not data */
@@ -347,6 +374,18 @@ struct msghdr32 {
#define MSG_MAXIOVLEN 16
+#ifdef _KERNEL
+
+/*
+ * for kernel socket only
+ */
+#define MSG_MBLK_QUICKRELE 0x10000000 /* free mblk chain */
+ /* in timely manner */
+#define MSG_USERSPACE 0x20000000 /* buffer from user space */
+
+#endif /* _KERNEL */
+
+
/* Added for XPGv2 compliance */
#define SHUT_RD 0
#define SHUT_WR 1
diff --git a/usr/src/uts/common/sys/socket_proto.h b/usr/src/uts/common/sys/socket_proto.h
new file mode 100644
index 0000000000..8f60ea9e31
--- /dev/null
+++ b/usr/src/uts/common/sys/socket_proto.h
@@ -0,0 +1,182 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SOCKET_PROTO_H_
+#define _SYS_SOCKET_PROTO_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/socket.h>
+
+/*
+ * Generation count
+ */
+typedef uint64_t sock_connid_t;
+
+#define SOCK_CONNID_INIT(id) { \
+ (id) = 0; \
+}
+#define SOCK_CONNID_BUMP(id) (++(id))
+#define SOCK_CONNID_LT(id1, id2) ((int64_t)((id1)-(id2)) < 0)
+
+/* Socket protocol properties */
+struct sock_proto_props {
+ uint_t sopp_flags; /* options to set */
+ ushort_t sopp_wroff; /* write offset */
+ ssize_t sopp_txhiwat; /* tx hi water mark */
+ ssize_t sopp_txlowat; /* tx lo water mark */
+ ssize_t sopp_rxhiwat; /* recv high water mark */
+ ssize_t sopp_rxlowat; /* recv low water mark */
+ ssize_t sopp_maxblk; /* maximum message block size */
+ ssize_t sopp_maxpsz; /* maximum packet size */
+ ssize_t sopp_minpsz; /* minimum packet size */
+ ushort_t sopp_tail; /* space available at the end */
+ uint_t sopp_zcopyflag; /* zero copy flag */
+ boolean_t sopp_oobinline; /* OOB inline */
+ uint_t sopp_rcvtimer; /* delayed recv notification (time) */
+ uint32_t sopp_rcvthresh; /* delayed recv notification (bytes) */
+ socklen_t sopp_maxaddrlen; /* maximum size of protocol address */
+};
+
+/* flags to determine which socket options are set */
+#define SOCKOPT_WROFF 0x0001 /* set write offset */
+#define SOCKOPT_RCVHIWAT 0x0002 /* set read side high water */
+#define SOCKOPT_RCVLOWAT 0x0004 /* set read side low water */
+#define SOCKOPT_MAXBLK 0x0008 /* set maximum message block size */
+#define SOCKOPT_TAIL 0x0010 /* set the extra allocated space */
+#define SOCKOPT_ZCOPY 0x0020 /* set/unset zero copy for sendfile */
+#define SOCKOPT_MAXPSZ 0x0040 /* set maxpsz for protocols */
+#define SOCKOPT_OOBINLINE 0x0080 /* set oob inline processing */
+#define SOCKOPT_RCVTIMER 0x0100 /* presumably sets sopp_rcvtimer; confirm */
+#define SOCKOPT_RCVTHRESH 0x0200 /* presumably sets sopp_rcvthresh; confirm */
+#define SOCKOPT_MAXADDRLEN 0x0400 /* set max address length */
+#define SOCKOPT_MINPSZ 0x0800 /* set minpsz for protocols */
+
+#define IS_SO_OOB_INLINE(so) ((so)->so_proto_props.sopp_oobinline)
+
+#ifdef _KERNEL
+
+struct T_capability_ack;
+
+typedef struct sock_upcalls_s sock_upcalls_t;
+typedef struct sock_downcalls_s sock_downcalls_t;
+
+/*
+ * Upcall and downcall handle for sockfs and transport layer.
+ */
+typedef struct __sock_upper_handle *sock_upper_handle_t;
+typedef struct __sock_lower_handle *sock_lower_handle_t;
+
+struct sock_downcalls_s {
+ void (*sd_activate)(sock_lower_handle_t, sock_upper_handle_t,
+ sock_upcalls_t *, int, cred_t *);
+ int (*sd_accept)(sock_lower_handle_t, sock_lower_handle_t,
+ sock_upper_handle_t, cred_t *);
+ int (*sd_bind)(sock_lower_handle_t, struct sockaddr *, socklen_t,
+ cred_t *);
+ int (*sd_listen)(sock_lower_handle_t, int, cred_t *);
+ int (*sd_connect)(sock_lower_handle_t, const struct sockaddr *,
+ socklen_t, sock_connid_t *, cred_t *);
+ int (*sd_getpeername)(sock_lower_handle_t, struct sockaddr *,
+ socklen_t *, cred_t *);
+ int (*sd_getsockname)(sock_lower_handle_t, struct sockaddr *,
+ socklen_t *, cred_t *);
+ int (*sd_getsockopt)(sock_lower_handle_t, int, int, void *,
+ socklen_t *, cred_t *);
+ int (*sd_setsockopt)(sock_lower_handle_t, int, int, const void *,
+ socklen_t, cred_t *);
+ int (*sd_send)(sock_lower_handle_t, mblk_t *, struct nmsghdr *,
+ cred_t *);
+ int (*sd_send_uio)(sock_lower_handle_t, uio_t *, struct nmsghdr *,
+ cred_t *);
+ int (*sd_recv_uio)(sock_lower_handle_t, uio_t *, struct nmsghdr *,
+ cred_t *);
+ short (*sd_poll)(sock_lower_handle_t, short, int, cred_t *);
+ int (*sd_shutdown)(sock_lower_handle_t, int, cred_t *);
+ void (*sd_clr_flowctrl)(sock_lower_handle_t);
+ int (*sd_ioctl)(sock_lower_handle_t, int, intptr_t, int,
+ int32_t *, cred_t *);
+ int (*sd_close)(sock_lower_handle_t, int, cred_t *);
+};
+
+typedef sock_lower_handle_t (*so_proto_create_func_t)(int, int, int,
+ sock_downcalls_t **, uint_t *, int *, int, cred_t *);
+
+typedef void (*so_proto_quiesced_cb_t)(sock_upper_handle_t, queue_t *,
+ struct T_capability_ack *, struct sockaddr *, socklen_t,
+ struct sockaddr *, socklen_t, short);
+typedef void (*so_proto_fallback_func_t)(sock_lower_handle_t, queue_t *,
+ boolean_t, so_proto_quiesced_cb_t);
+
+/*
+ * Upcalls and related information
+ */
+
+/*
+ * su_opctl() actions
+ */
+typedef enum sock_opctl_action {
+ SOCK_OPCTL_ENAB_ACCEPT = 0,
+ SOCK_OPCTL_SHUT_SEND,
+ SOCK_OPCTL_SHUT_RECV
+} sock_opctl_action_t;
+
+struct sock_upcalls_s {
+ sock_upper_handle_t (*su_newconn)(sock_upper_handle_t,
+ sock_lower_handle_t, sock_downcalls_t *, cred_t *, pid_t,
+ sock_upcalls_t **);
+ void (*su_connected)(sock_upper_handle_t, sock_connid_t, cred_t *,
+ pid_t);
+ int (*su_disconnected)(sock_upper_handle_t, sock_connid_t, int);
+ void (*su_opctl)(sock_upper_handle_t, sock_opctl_action_t,
+ uintptr_t);
+ ssize_t (*su_recv)(sock_upper_handle_t, mblk_t *, size_t, int,
+ int *, boolean_t *);
+ void (*su_set_proto_props)(sock_upper_handle_t,
+ struct sock_proto_props *);
+ void (*su_txq_full)(sock_upper_handle_t, boolean_t);
+ void (*su_signal_oob)(sock_upper_handle_t, ssize_t);
+ void (*su_zcopy_notify)(sock_upper_handle_t);
+ void (*su_set_error)(sock_upper_handle_t, int);
+};
+
+#define SOCK_UC_VERSION sizeof (sock_upcalls_t)
+#define SOCK_DC_VERSION sizeof (sock_downcalls_t)
+
+#define SOCKET_RECVHIWATER (48 * 1024)
+#define SOCKET_RECVLOWATER 1024
+
+#define SOCKET_NO_RCVTIMER 0
+#define SOCKET_TIMER_INTERVAL 50
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SOCKET_PROTO_H_ */
diff --git a/usr/src/uts/common/sys/socketvar.h b/usr/src/uts/common/sys/socketvar.h
index 37a699345a..510d9445cf 100644
--- a/usr/src/uts/common/sys/socketvar.h
+++ b/usr/src/uts/common/sys/socketvar.h
@@ -48,25 +48,18 @@
#include <sys/file.h>
#include <sys/param.h>
#include <sys/zone.h>
+#include <sys/sdt.h>
+#include <sys/modctl.h>
+#include <sys/atomic.h>
+#include <sys/socket.h>
+#include <sys/ksocket.h>
#include <sys/sodirect.h>
-#include <inet/kssl/ksslapi.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
- * Internal representation used for addresses.
- */
-struct soaddr {
- struct sockaddr *soa_sa; /* Actual address */
- t_uscalar_t soa_len; /* Length in bytes for kmem_free */
- t_uscalar_t soa_maxlen; /* Allocated length */
-};
-/* Maximum size address for transports that have ADDR_size == 1 */
-#define SOA_DEFSIZE 128
-
-/*
* Internal representation of the address used to represent addresses
* in the loopback transport for AF_UNIX. While the sockaddr_un is used
* as the sockfs layer address for AF_UNIX the pathnames contained in
@@ -97,6 +90,10 @@ struct sockaddr_ux {
struct so_ux_addr sou_addr;
};
+#if defined(_KERNEL) || defined(_KMEMUSER)
+
+#include <sys/socket_proto.h>
+
typedef struct sonodeops sonodeops_t;
typedef struct sonode sonode_t;
@@ -105,236 +102,149 @@ typedef struct sonode sonode_t;
* name space and can not be opened using open() - only the socket, socketpair
* and accept calls create sonodes.
*
- * When an AF_UNIX socket is bound to a pathname the sockfs
- * creates a VSOCK vnode in the underlying file system. However, the vnodeops
- * etc in this VNODE remain those of the underlying file system.
- * Sockfs uses the v_stream pointer in the underlying file system VSOCK node
- * to find the sonode bound to the pathname. The bound pathname vnode
- * is accessed through so_ux_vp.
- *
- * A socket always corresponds to a VCHR stream representing the transport
- * provider (e.g. /dev/tcp). This information is retrieved from the kernel
- * socket configuration table and entered into so_accessvp. sockfs uses
- * this to perform VOP_ACCESS checks before allowing an open of the transport
- * provider.
+ * The locking of sockfs uses the so_lock mutex plus the SOLOCKED and
+ * SOREADLOCKED flags in so_flag. The mutex protects all the state in the
+ * sonode. It is expected that the underlying transport protocol serializes
+ * socket operations, so sockfs will normally not single-thread
+ * operations. However, certain sockets, including TPI based ones, can only
+ * handle one control operation at a time. The SOLOCKED flag is used to
+ * single-thread operations from sockfs users to prevent e.g. multiple bind()
+ * calls to operate on the same sonode concurrently. The SOREADLOCKED flag is
+ * used to ensure that only one thread sleeps in kstrgetmsg for a given
+ * sonode. This is needed to ensure atomic operation for things like
+ * MSG_WAITALL.
*
- * The locking of sockfs uses the so_lock mutex plus the SOLOCKED
- * and SOREADLOCKED flags in so_flag. The mutex protects all the state
- * in the sonode. The SOLOCKED flag is used to single-thread operations from
- * sockfs users to prevent e.g. multiple bind() calls to operate on the
- * same sonode concurrently. The SOREADLOCKED flag is used to ensure that
- * only one thread sleeps in kstrgetmsg for a given sonode. This is needed
- * to ensure atomic operation for things like MSG_WAITALL.
+ * The so_fallback_rwlock is used to ensure that for sockets that can
+ * fall back to TPI, the fallback is not initiated until all pending
+ * operations have completed.
*
* Note that so_lock is sometimes held across calls that might go to sleep
* (kmem_alloc and soallocproto*). This implies that no other lock in
* the system should be held when calling into sockfs; from the system call
- * side or from strrput. If locks are held while calling into sockfs
- * the system might hang when running low on memory.
+ * side or from strrput (in case of TPI based sockets). If locks are held
+ * while calling into sockfs the system might hang when running low on memory.
*/
struct sonode {
struct vnode *so_vnode; /* vnode associated with this sonode */
- sonodeops_t *so_ops; /* operations vector for this sonode */
-
- /*
- * These fields are initialized once.
- */
- dev_t so_dev; /* device the sonode represents */
- struct vnode *so_accessvp; /* vnode for the /dev entry */
+ sonodeops_t *so_ops; /* operations vector for this sonode */
+ void *so_priv; /* sonode private data */
- /* The locks themselves */
+ krwlock_t so_fallback_rwlock;
kmutex_t so_lock; /* protects sonode fields */
- kmutex_t so_plumb_lock; /* serializes plumbs, and the related */
- /* fields so_version and so_pushcnt */
+
kcondvar_t so_state_cv; /* synchronize state changes */
- kcondvar_t so_ack_cv; /* wait for TPI acks */
- kcondvar_t so_connind_cv; /* wait for T_CONN_IND */
kcondvar_t so_want_cv; /* wait due to SOLOCKED */
/* These fields are protected by so_lock */
- uint_t so_state; /* internal state flags SS_*, below */
- uint_t so_mode; /* characteristics on socket. SM_* */
- mblk_t *so_ack_mp; /* TPI ack received from below */
- mblk_t *so_conn_ind_head; /* b_next list of T_CONN_IND */
- mblk_t *so_conn_ind_tail;
- mblk_t *so_unbind_mp; /* Preallocated T_UNBIND_REQ message */
+ uint_t so_state; /* internal state flags SS_*, below */
+ uint_t so_mode; /* characteristics on socket. SM_* */
+ ushort_t so_flag; /* flags, see below */
+ int so_count; /* count of opened references */
+
+ sock_connid_t so_proto_connid; /* protocol generation number */
- ushort_t so_flag; /* flags, see below */
- dev_t so_fsid; /* file system identifier */
- time_t so_atime; /* time of last access */
- time_t so_mtime; /* time of last modification */
- time_t so_ctime; /* time of last attributes change */
- int so_count; /* count of opened references */
+ ushort_t so_error; /* error affecting connection */
+ struct sockparams *so_sockparams; /* vnode or socket module */
/* Needed to recreate the same socket for accept */
short so_family;
short so_type;
short so_protocol;
short so_version; /* From so_socket call */
- short so_pushcnt; /* Number of modules above "sockmod" */
+
+ /* Accept queue */
+ kmutex_t so_acceptq_lock; /* protects accept queue */
+ struct sonode *so_acceptq_next; /* acceptq list node */
+ struct sonode *so_acceptq_head;
+ struct sonode **so_acceptq_tail;
+ unsigned int so_acceptq_len;
+ unsigned int so_backlog; /* Listen backlog */
+ kcondvar_t so_acceptq_cv; /* wait for new conn. */
/* Options */
short so_options; /* From socket call, see socket.h */
struct linger so_linger; /* SO_LINGER value */
- int so_sndbuf; /* SO_SNDBUF value */
- int so_rcvbuf; /* SO_RCVBUF value */
- int so_sndlowat; /* send low water mark */
- int so_rcvlowat; /* receive low water mark */
-#ifdef notyet
- int so_sndtimeo; /* Not yet implemented */
- int so_rcvtimeo; /* Not yet implemented */
-#endif /* notyet */
- ushort_t so_error; /* error affecting connection */
- ushort_t so_delayed_error; /* From T_uderror_ind */
- int so_backlog; /* Listen backlog */
+#define so_sndbuf so_proto_props.sopp_txhiwat /* SO_SNDBUF value */
+#define so_sndlowat so_proto_props.sopp_txlowat /* tx low water mark */
+#define so_rcvbuf so_proto_props.sopp_rxhiwat /* SO_RCVBUF value */
+#define so_rcvlowat so_proto_props.sopp_rxlowat /* rx low water mark */
+#define so_max_addr_len so_proto_props.sopp_maxaddrlen
+#define so_minpsz so_proto_props.sopp_minpsz
+#define so_maxpsz so_proto_props.sopp_maxpsz
+
+ clock_t so_sndtimeo; /* send timeout */
+ clock_t so_rcvtimeo; /* recv timeout */
- /*
- * The counts (so_oobcnt and so_oobsigcnt) track the number of
- * urgent indicates that are (logically) queued on the stream head
- * read queue. The urgent data is queued on the stream head
- * as follows.
- *
- * In the normal case the SIGURG is not generated until
- * the T_EXDATA_IND arrives at the stream head. However, transports
- * that have an early indication that urgent data is pending
- * (e.g. TCP receiving a "new" urgent pointer value) can send up
- * an M_PCPROTO/SIGURG message to generate the signal early.
- *
- * The mark is indicated by either:
- * - a T_EXDATA_IND (with no M_DATA b_cont) with MSGMARK set.
- * When this message is consumed by sorecvmsg the socket layer
- * sets SS_RCVATMARK until data has been consumed past the mark.
- * - a message with MSGMARKNEXT set (indicating that the
- * first byte of the next message constitutes the mark). When
- * the last byte of the MSGMARKNEXT message is consumed in
- * the stream head the stream head sets STRATMARK. This flag
- * is cleared when at least one byte is read. (Note that
- * the MSGMARKNEXT messages can be of zero length when there
- * is no previous data to which the marknext can be attached.)
- *
- * While the T_EXDATA_IND method is the common case which is used
- * with all TPI transports, the MSGMARKNEXT method is needed to
- * indicate the mark when e.g. the TCP urgent byte has not been
- * received yet but the TCP urgent pointer has made TCP generate
- * the M_PCSIG/SIGURG.
- *
- * The signal (the M_PCSIG carrying the SIGURG) and the mark
- * indication can not be delivered as a single message, since
- * the signal should be delivered as high priority and any mark
- * indication must flow with the data. This implies that immediately
- * when the SIGURG has been delivered if the stream head queue is
- * empty it is impossible to determine if this will be the position
- * of the mark. This race condition is resolved by using MSGNOTMARKNEXT
- * messages and the STRNOTATMARK flag in the stream head. The
- * SIOCATMARK code calls the stream head to wait for either a
- * non-empty queue or one of the STR*ATMARK flags being set.
- * This implies that any transport that is sending M_PCSIG(SIGURG)
- * should send the appropriate MSGNOTMARKNEXT message (which can be
- * zero length) after sending an M_PCSIG to prevent SIOCATMARK
- * from sleeping unnecessarily.
- */
mblk_t *so_oobmsg; /* outofline oob data */
- uint_t so_oobsigcnt; /* Number of SIGURG generated */
- uint_t so_oobcnt; /* Number of T_EXDATA_IND queued */
+ ssize_t so_oobmark; /* offset of the oob data */
+
pid_t so_pgrp; /* pgrp for signals */
- /* From T_info_ack */
- t_uscalar_t so_tsdu_size;
- t_uscalar_t so_etsdu_size;
- t_scalar_t so_addr_size;
- t_uscalar_t so_opt_size;
- t_uscalar_t so_tidu_size;
- t_scalar_t so_serv_type;
+ cred_t *so_peercred; /* connected socket peer cred */
+ pid_t so_cpid; /* connected socket peer cached pid */
+ zoneid_t so_zoneid; /* opener's zoneid */
- /* From T_capability_ack */
- t_uscalar_t so_acceptor_id;
+ struct pollhead so_poll_list; /* common pollhead */
+ short so_pollev; /* events that should be generated */
- /* Internal provider information */
- struct tpi_provinfo *so_provinfo;
+ /* Receive */
+ unsigned int so_rcv_queued;
+ mblk_t *so_rcv_q_head;
+ mblk_t *so_rcv_q_last_head;
+ mblk_t *so_rcv_head; /* 1st mblk in the list */
+ mblk_t *so_rcv_last_head; /* last mblk in b_next chain */
+ kcondvar_t so_rcv_cv;
+ uint_t so_rcv_wanted; /* # of bytes wanted by app */
+ timeout_id_t so_rcv_timer_tid;
- /*
- * The local and remote addresses have multiple purposes
- * but one of the key reasons for their existence and careful
- * tracking in sockfs is to support getsockname and getpeername
- * when the transport does not handle the TI_GET*NAME ioctls
- * and caching when it does (signaled by valid bits in so_state).
- * When all transports support the new TPI (with T_ADDR_REQ)
- * we can revisit this code.
- * The other usage of so_faddr is to keep the "connected to"
- * address for datagram sockets.
- * Finally, for AF_UNIX both local and remote addresses are used
- * to record the sockaddr_un since we use a separate namespace
- * in the loopback transport.
- */
- struct soaddr so_laddr; /* Local address */
- struct soaddr so_faddr; /* Peer address */
-#define so_laddr_sa so_laddr.soa_sa
-#define so_faddr_sa so_faddr.soa_sa
-#define so_laddr_len so_laddr.soa_len
-#define so_faddr_len so_faddr.soa_len
-#define so_laddr_maxlen so_laddr.soa_maxlen
-#define so_faddr_maxlen so_faddr.soa_maxlen
- mblk_t *so_eaddr_mp; /* for so_delayed_error */
+#define so_rcv_thresh so_proto_props.sopp_rcvthresh
+#define so_rcv_timer_interval so_proto_props.sopp_rcvtimer
- /*
- * For AF_UNIX sockets:
- * so_ux_laddr/faddr records the internal addresses used with the
- * transport.
- * so_ux_vp and v_stream->sd_vnode form the cross-
- * linkage between the underlying fs vnode corresponding to
- * the bound sockaddr_un and the socket node.
- */
- struct so_ux_addr so_ux_laddr; /* laddr bound with the transport */
- struct so_ux_addr so_ux_faddr; /* temporary peer address */
- struct vnode *so_ux_bound_vp; /* bound AF_UNIX file system vnode */
- struct sonode *so_next; /* next sonode on socklist */
- struct sonode *so_prev; /* previous sonode on socklist */
- mblk_t *so_discon_ind_mp; /* T_DISCON_IND received from below */
-
- /* put here for delayed processing */
- void *so_priv; /* sonode private data */
- cred_t *so_peercred; /* connected socket peer cred */
- pid_t so_cpid; /* connected socket peer cached pid */
- zoneid_t so_zoneid; /* opener's zoneid */
+ /* Send */
+ boolean_t so_snd_qfull; /* Transmit full */
+ kcondvar_t so_snd_cv;
- kmem_cache_t *so_cache; /* object cache of this "sonode". */
- void *so_obj; /* object to free */
+ boolean_t so_rcv_wakeup;
+ boolean_t so_snd_wakeup;
- /*
- * For NL7C sockets:
- *
- * so_nl7c_flags the NL7C state of URL processing.
- *
- * so_nl7c_rcv_mp mblk_t chain of already received data to be
- * passed up to the app after NL7C gives up on
- * a socket.
- *
- * so_nl7c_rcv_rval returned rval for last mblk_t from above.
- *
- * so_nl7c_uri the URI currently being processed.
- *
- * so_nl7c_rtime URI request gethrestime_sec().
- *
- * so_nl7c_addr pointer returned by nl7c_addr_lookup().
- */
- uint64_t so_nl7c_flags;
- mblk_t *so_nl7c_rcv_mp;
- int64_t so_nl7c_rcv_rval;
- void *so_nl7c_uri;
- time_t so_nl7c_rtime;
- void *so_nl7c_addr;
-
- /* For sockets acting as an in-kernel SSL proxy */
- kssl_endpt_type_t so_kssl_type; /* is proxy/is proxied/none */
- kssl_ent_t so_kssl_ent; /* SSL config entry */
- kssl_ctx_t so_kssl_ctx; /* SSL session context */
+ /* Communication channel with protocol */
+ sock_lower_handle_t so_proto_handle;
+ sock_downcalls_t *so_downcalls;
+
+ struct sock_proto_props so_proto_props; /* protocol settings */
+ boolean_t so_flowctrld; /* Flow controlled */
+ uint_t so_copyflag; /* Copy related flag */
+ kcondvar_t so_copy_cv; /* Copy cond variable */
+
+ /* kernel sockets */
+ ksocket_callbacks_t so_ksock_callbacks;
+ void *so_ksock_cb_arg; /* callback argument */
+ kcondvar_t so_closing_cv;
/* != NULL for sodirect_t enabled socket */
- sodirect_t *so_direct;
+ sodirect_t *so_direct;
};
+/*
+ * True if data is queued with no rcv timer id set, if the queued data
+ * exceeds so_rcv_thresh, or if SS_CANTRCVMORE is set. Callers check this
+ * without locks first and, if no events are available, redo it locked.
+ */
+#define SO_HAVE_DATA(so) \
+ (((so)->so_rcv_timer_tid == 0 && ((so)->so_rcv_queued > 0)) || \
+ ((so)->so_rcv_queued > (so)->so_rcv_thresh) || \
+ ((so)->so_state & SS_CANTRCVMORE))
+
+/*
+ * Events handled by the protocol (in case sd_poll is set)
+ */
+#define SO_PROTO_POLLEV (POLLIN|POLLRDNORM|POLLRDBAND)
+
+
+#endif /* _KERNEL || _KMEMUSER */
+
/* flags */
#define SOMOD 0x0001 /* update socket modification time */
#define SOACC 0x0002 /* update socket access time */
@@ -345,6 +255,8 @@ struct sonode {
#define SOCLONE 0x0080 /* child of clone driver */
#define SOASYNC_UNBIND 0x0100 /* wait for ACK of async unbind */
+#define SOCK_IS_NONSTR(so) ((so)->so_vnode->v_stream == NULL)
+
/*
* Socket state bits.
*/
@@ -360,31 +272,59 @@ struct sonode {
#define SS_ASYNC 0x00000100 /* async i/o notify */
#define SS_ACCEPTCONN 0x00000200 /* listen done */
-#define SS_HASCONNIND 0x00000400 /* T_CONN_IND for poll */
+/* unused 0x00000400 */ /* was SS_HASCONNIND */
#define SS_SAVEDEOR 0x00000800 /* Saved MSG_EOR rcv side state */
#define SS_RCVATMARK 0x00001000 /* at mark on input */
#define SS_OOBPEND 0x00002000 /* OOB pending or present - poll */
#define SS_HAVEOOBDATA 0x00004000 /* OOB data present */
#define SS_HADOOBDATA 0x00008000 /* OOB data consumed */
+#define SS_CLOSING 0x00010000 /* in process of closing */
-#define SS_FADDR_NOXLATE 0x00020000 /* No xlation of faddr for AF_UNIX */
-
-#define SS_HASDATA 0x00040000 /* NCAfs: data available */
-#define SS_DONEREAD 0x00080000 /* NCAfs: all data read */
-#define SS_MOREDATA 0x00100000 /* NCAfs: NCA has more data */
+/* unused 0x00020000 */ /* was SS_FADDR_NOXLATE */
+/* unused 0x00040000 */ /* was SS_HASDATA */
+/* unused 0x00080000 */ /* was SS_DONEREAD */
+/* unused 0x00100000 */ /* was SS_MOREDATA */
+/* unused 0x00200000 */ /* was SS_DIRECT */
-#define SS_DIRECT 0x00200000 /* transport is directly below */
#define SS_SODIRECT 0x00400000 /* transport supports sodirect */
-#define SS_LADDR_VALID 0x01000000 /* so_laddr valid for user */
-#define SS_FADDR_VALID 0x02000000 /* so_faddr valid for user */
+/* unused 0x01000000 */ /* was SS_LADDR_VALID */
+/* unused 0x02000000 */ /* was SS_FADDR_VALID */
+
+#define SS_SENTLASTREADSIG 0x10000000 /* last rx signal has been sent */
+#define SS_SENTLASTWRITESIG 0x20000000 /* last tx signal has been sent */
+
+#define SS_FALLBACK_PENDING 0x40000000
+#define SS_FALLBACK_COMP 0x80000000
+
/* Set of states when the socket can't be rebound */
#define SS_CANTREBIND (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING|\
SS_CANTSENDMORE|SS_CANTRCVMORE|SS_ACCEPTCONN)
/*
+ * Sockets that can fall back to TPI must ensure that fall back is not
+ * initiated while a thread is using a socket.
+ */
+#define SO_BLOCK_FALLBACK(so, fn) { \
+ ASSERT(MUTEX_NOT_HELD(&(so)->so_lock)); \
+ rw_enter(&(so)->so_fallback_rwlock, RW_READER); \
+ if ((so)->so_state & SS_FALLBACK_COMP) { \
+ rw_exit(&(so)->so_fallback_rwlock); \
+ return (fn); \
+ } \
+}
+
+#define SO_UNBLOCK_FALLBACK(so) { \
+ rw_exit(&(so)->so_fallback_rwlock); \
+}
+
+/* Poll events */
+#define SO_POLLEV_IN 0x1 /* POLLIN wakeup needed */
+#define SO_POLLEV_ALWAYS 0x2 /* wakeups */
+
+/*
* Characteristics of sockets. Not changed after the socket is created.
*/
#define SM_PRIV 0x001 /* privileged for broadcast, raw... */
@@ -399,6 +339,10 @@ struct sonode {
#define SM_ACCEPTOR_ID 0x100 /* so_acceptor_id is valid */
+#define SM_KERNEL 0x200 /* kernel socket */
+
+#define SM_ACCEPTSUPP 0x400 /* can handle accept() */
+
/*
* Socket versions. Used by the socket library when calling _so_socket().
*/
@@ -409,21 +353,177 @@ struct sonode {
#define SOV_XPG4_2 4 /* Xnet socket */
#if defined(_KERNEL) || defined(_KMEMUSER)
+
+/*
+ * sonode create and destroy functions.
+ */
+typedef struct sonode *(*so_create_func_t)(struct sockparams *,
+ int, int, int, int, int, int *, cred_t *);
+typedef void (*so_destroy_func_t)(struct sonode *);
+
+/* STREAM device information */
+typedef struct sdev_info {
+ char *sd_devpath;
+ int sd_devpathlen; /* Is 0 if sd_devpath is a static string */
+ vnode_t *sd_vnode;
+} sdev_info_t;
+
+#define SOCKMOD_VERSION 1
+/* name of the TPI pseudo socket module */
+#define SOTPI_SMOD_NAME "socktpi"
+
+typedef struct __smod_priv_s {
+ so_create_func_t smodp_sock_create_func;
+ so_destroy_func_t smodp_sock_destroy_func;
+ so_proto_fallback_func_t smodp_proto_fallback_func;
+} __smod_priv_t;
+
/*
- * Used for mapping family/type/protocol to vnode.
- * Defined here so that crash can use it.
+ * Socket module register information
+ */
+typedef struct smod_reg_s {
+ int smod_version;
+ char *smod_name;
+ size_t smod_uc_version;
+ size_t smod_dc_version;
+ so_proto_create_func_t smod_proto_create_func;
+
+ /* __smod_priv must be NULL */
+ __smod_priv_t *__smod_priv;
+} smod_reg_t;
+
+/*
+ * Socket module information
+ */
+typedef struct smod_info {
+ int smod_version;
+ char *smod_name;
+ uint_t smod_refcnt; /* # of entries */
+ size_t smod_uc_version; /* upcall version */
+ size_t smod_dc_version; /* down call version */
+ so_proto_create_func_t smod_proto_create_func;
+ so_proto_fallback_func_t smod_proto_fallback_func;
+ so_create_func_t smod_sock_create_func;
+ so_destroy_func_t smod_sock_destroy_func;
+ list_node_t smod_node;
+} smod_info_t;
+
+/*
+ * sockparams
+ *
+ * Used for mapping family/type/protocol to module
*/
struct sockparams {
- int sp_domain;
- int sp_type;
- int sp_protocol;
- char *sp_devpath;
- int sp_devpathlen; /* Is 0 if sp_devpath is a static string */
- vnode_t *sp_vnode;
- struct sockparams *sp_next;
+ /*
+ * The family, type, protocol, sdev_info and smod_info are
+ * set when the entry is created, and they will never change
+ * thereafter.
+ */
+ int sp_family;
+ int sp_type;
+ int sp_protocol;
+
+ sdev_info_t sp_sdev_info; /* STREAM device */
+ char *sp_smod_name; /* socket module name */
+ smod_info_t *sp_smod_info; /* socket module */
+
+ kmutex_t sp_lock; /* lock for refcnt */
+ uint64_t sp_refcnt; /* entry reference count */
+
+ /*
+ * The entries below are only modified while holding
+ * splist_lock as a writer.
+ */
+ int sp_flags; /* see below */
+ list_node_t sp_node;
};
-extern struct sockparams *sphead;
+
+/*
+ * sockparams flags
+ */
+#define SOCKPARAMS_EPHEMERAL 0x1 /* temp. entry, not on global list */
+
+extern void sockparams_init(void);
+extern struct sockparams *sockparams_hold_ephemeral_bydev(int, int, int,
+ const char *, int, int *);
+extern struct sockparams *sockparams_hold_ephemeral_bymod(int, int, int,
+ const char *, int, int *);
+extern void sockparams_ephemeral_drop_last_ref(struct sockparams *);
+
+extern void smod_init(void);
+extern void smod_add(smod_info_t *);
+extern int smod_register(const smod_reg_t *);
+extern int smod_unregister(const char *);
+extern smod_info_t *smod_lookup_byname(const char *);
+
+#define SOCKPARAMS_HAS_DEVICE(sp) \
+ ((sp)->sp_sdev_info.sd_devpath != NULL)
+
+/* Increase the smod_info_t reference count */
+#define SMOD_INC_REF(smodp) { \
+ ASSERT((smodp) != NULL); \
+ DTRACE_PROBE1(smodinfo__inc__ref, struct smod_info *, (smodp)); \
+ atomic_inc_uint(&(smodp)->smod_refcnt); \
+}
+
+/*
+ * Decrease the socket module entry reference count.
+ * When no one is mapping to the entry, we try to unload the module from
+ * the kernel. If the module can't be unloaded, just leave the module
+ * entry with a zero refcnt.
+ */
+#define SMOD_DEC_REF(sp, smodp) { \
+ ASSERT((smodp) != NULL); \
+ ASSERT((smodp)->smod_refcnt != 0); \
+ atomic_dec_uint(&(smodp)->smod_refcnt); \
+ /* \
+ * No need to atomically check the return value because the \
+ * socket module framework will verify that no one is using \
+ * the module before unloading. Worst thing that can happen \
+ * here is multiple calls to mod_remove_by_name(), which is OK. \
+ */ \
+ if ((smodp)->smod_refcnt == 0) \
+ (void) mod_remove_by_name((sp)->sp_smod_name); \
+}
+
+/* Increase the reference count */
+#define SOCKPARAMS_INC_REF(sp) { \
+ ASSERT((sp) != NULL); \
+ DTRACE_PROBE1(sockparams__inc__ref, struct sockparams *, (sp)); \
+ mutex_enter(&(sp)->sp_lock); \
+ (sp)->sp_refcnt++; \
+ ASSERT((sp)->sp_refcnt != 0); \
+ mutex_exit(&(sp)->sp_lock); \
+}
+
+/*
+ * Decrease the reference count.
+ *
+ * If the sockparams is ephemeral, then the thread dropping the last ref
+ * count will destroy the entry.
+ */
+#define SOCKPARAMS_DEC_REF(sp) { \
+ ASSERT((sp) != NULL); \
+ DTRACE_PROBE1(sockparams__dec__ref, struct sockparams *, (sp)); \
+ mutex_enter(&(sp)->sp_lock); \
+ ASSERT((sp)->sp_refcnt > 0); \
+ if ((sp)->sp_refcnt == 1) { \
+ if ((sp)->sp_flags & SOCKPARAMS_EPHEMERAL) { \
+ mutex_exit(&(sp)->sp_lock); \
+ sockparams_ephemeral_drop_last_ref((sp)); \
+ } else { \
+ (sp)->sp_refcnt--; \
+ if ((sp)->sp_smod_info != NULL) \
+ SMOD_DEC_REF(sp, (sp)->sp_smod_info); \
+ (sp)->sp_smod_info = NULL; \
+ mutex_exit(&(sp)->sp_lock); \
+ } \
+ } else { \
+ (sp)->sp_refcnt--; \
+ mutex_exit(&(sp)->sp_lock); \
+ } \
+}
/*
* Used to traverse the list of AF_UNIX sockets to construct the kstat
@@ -490,49 +590,71 @@ struct sendfile_queue {
/* Socket network operations switch */
struct sonodeops {
- int (*sop_accept)(struct sonode *, int, struct sonode **);
- int (*sop_bind)(struct sonode *, struct sockaddr *, socklen_t,
+ int (*sop_init)(struct sonode *, struct sonode *, cred_t *,
int);
- int (*sop_listen)(struct sonode *, int);
+ int (*sop_accept)(struct sonode *, int, cred_t *, struct sonode **);
+ int (*sop_bind)(struct sonode *, struct sockaddr *, socklen_t,
+ int, cred_t *);
+ int (*sop_listen)(struct sonode *, int, cred_t *);
int (*sop_connect)(struct sonode *, const struct sockaddr *,
- socklen_t, int, int);
+ socklen_t, int, int, cred_t *);
int (*sop_recvmsg)(struct sonode *, struct msghdr *,
- struct uio *);
+ struct uio *, cred_t *);
int (*sop_sendmsg)(struct sonode *, struct msghdr *,
- struct uio *);
- int (*sop_getpeername)(struct sonode *);
- int (*sop_getsockname)(struct sonode *);
- int (*sop_shutdown)(struct sonode *, int);
+ struct uio *, cred_t *);
+ int (*sop_sendmblk)(struct sonode *, struct msghdr *, int,
+ cred_t *, mblk_t **);
+ int (*sop_getpeername)(struct sonode *, struct sockaddr *,
+ socklen_t *, boolean_t, cred_t *);
+ int (*sop_getsockname)(struct sonode *, struct sockaddr *,
+ socklen_t *, cred_t *);
+ int (*sop_shutdown)(struct sonode *, int, cred_t *);
int (*sop_getsockopt)(struct sonode *, int, int, void *,
- socklen_t *, int);
+ socklen_t *, int, cred_t *);
int (*sop_setsockopt)(struct sonode *, int, int, const void *,
- socklen_t);
+ socklen_t, cred_t *);
+ int (*sop_ioctl)(struct sonode *, int, intptr_t, int,
+ cred_t *, int32_t *);
+ int (*sop_poll)(struct sonode *, short, int, short *,
+ struct pollhead **);
+ int (*sop_close)(struct sonode *, int, cred_t *);
};
-#define SOP_ACCEPT(so, fflag, nsop) \
- ((so)->so_ops->sop_accept((so), (fflag), (nsop)))
-#define SOP_BIND(so, name, namelen, flags) \
- ((so)->so_ops->sop_bind((so), (name), (namelen), (flags)))
-#define SOP_LISTEN(so, backlog) \
- ((so)->so_ops->sop_listen((so), (backlog)))
-#define SOP_CONNECT(so, name, namelen, fflag, flags) \
- ((so)->so_ops->sop_connect((so), (name), (namelen), (fflag), (flags)))
-#define SOP_RECVMSG(so, msg, uiop) \
- ((so)->so_ops->sop_recvmsg((so), (msg), (uiop)))
-#define SOP_SENDMSG(so, msg, uiop) \
- ((so)->so_ops->sop_sendmsg((so), (msg), (uiop)))
-#define SOP_GETPEERNAME(so) \
- ((so)->so_ops->sop_getpeername((so)))
-#define SOP_GETSOCKNAME(so) \
- ((so)->so_ops->sop_getsockname((so)))
-#define SOP_SHUTDOWN(so, how) \
- ((so)->so_ops->sop_shutdown((so), (how)))
-#define SOP_GETSOCKOPT(so, level, optionname, optval, optlenp, flags) \
+#define SOP_INIT(so, flag, cr, flags) \
+ ((so)->so_ops->sop_init((so), (flag), (cr), (flags)))
+#define SOP_ACCEPT(so, fflag, cr, nsop) \
+ ((so)->so_ops->sop_accept((so), (fflag), (cr), (nsop)))
+#define SOP_BIND(so, name, namelen, flags, cr) \
+ ((so)->so_ops->sop_bind((so), (name), (namelen), (flags), (cr)))
+#define SOP_LISTEN(so, backlog, cr) \
+ ((so)->so_ops->sop_listen((so), (backlog), (cr)))
+#define SOP_CONNECT(so, name, namelen, fflag, flags, cr) \
+ ((so)->so_ops->sop_connect((so), (name), (namelen), (fflag), (flags), \
+ (cr)))
+#define SOP_RECVMSG(so, msg, uiop, cr) \
+ ((so)->so_ops->sop_recvmsg((so), (msg), (uiop), (cr)))
+#define SOP_SENDMSG(so, msg, uiop, cr) \
+ ((so)->so_ops->sop_sendmsg((so), (msg), (uiop), (cr)))
+#define SOP_SENDMBLK(so, msg, size, cr, mpp) \
+ ((so)->so_ops->sop_sendmblk((so), (msg), (size), (cr), (mpp)))
+#define SOP_GETPEERNAME(so, addr, addrlen, accept, cr) \
+ ((so)->so_ops->sop_getpeername((so), (addr), (addrlen), (accept), (cr)))
+#define SOP_GETSOCKNAME(so, addr, addrlen, cr) \
+ ((so)->so_ops->sop_getsockname((so), (addr), (addrlen), (cr)))
+#define SOP_SHUTDOWN(so, how, cr) \
+ ((so)->so_ops->sop_shutdown((so), (how), (cr)))
+#define SOP_GETSOCKOPT(so, level, optionname, optval, optlenp, flags, cr) \
((so)->so_ops->sop_getsockopt((so), (level), (optionname), \
- (optval), (optlenp), (flags)))
-#define SOP_SETSOCKOPT(so, level, optionname, optval, optlen) \
+ (optval), (optlenp), (flags), (cr)))
+#define SOP_SETSOCKOPT(so, level, optionname, optval, optlen, cr) \
((so)->so_ops->sop_setsockopt((so), (level), (optionname), \
- (optval), (optlen)))
+ (optval), (optlen), (cr)))
+#define SOP_IOCTL(so, cmd, arg, mode, cr, rvalp) \
+ ((so)->so_ops->sop_ioctl((so), (cmd), (arg), (mode), (cr), (rvalp)))
+#define SOP_POLL(so, events, anyyet, reventsp, phpp) \
+ ((so)->so_ops->sop_poll((so), (events), (anyyet), (reventsp), (phpp)))
+#define SOP_CLOSE(so, flag, cr) \
+ ((so)->so_ops->sop_close((so), (flag), (cr)))
#endif /* defined(_KERNEL) || defined(_KMEMUSER) */
@@ -544,6 +666,8 @@ struct sonodeops {
#define ROUNDUP_cmsglen(len) \
(((len) + _CMSG_HDR_ALIGNMENT - 1) & ~(_CMSG_HDR_ALIGNMENT - 1))
+#define IS_NON_STREAM_SOCK(vp) \
+ ((vp)->v_type == VSOCK && (vp)->v_stream == NULL)
/*
* Macros that operate on struct cmsghdr.
* Used in parsing msg_control.
@@ -686,10 +810,8 @@ extern int sockprinterr;
#endif /* defined(DEBUG) */
extern struct vfsops sock_vfsops;
-extern struct vnodeops *socktpi_vnodeops;
-extern const struct fs_operation_def socktpi_vnodeops_template[];
-
-extern sonodeops_t sotpi_sonodeops;
+extern struct vnodeops *socket_vnodeops;
+extern const struct fs_operation_def socket_vnodeops_template[];
extern dev_t sockdev;
@@ -700,20 +822,10 @@ extern int sock_getmsg(vnode_t *, struct strbuf *, struct strbuf *,
uchar_t *, int *, int, rval_t *);
extern int sock_putmsg(vnode_t *, struct strbuf *, struct strbuf *,
uchar_t, int, int);
-struct sonode *sotpi_create(vnode_t *, int, int, int, int, struct sonode *,
- int *);
-extern int socktpi_open(struct vnode **, int, struct cred *,
- caller_context_t *);
-extern int so_sock2stream(struct sonode *);
-extern void so_stream2sock(struct sonode *);
+extern int sogetvp(char *, vnode_t **, int);
extern int sockinit(int, char *);
-extern struct vnode
- *makesockvp(struct vnode *, int, int, int);
-extern void sockfree(struct sonode *);
-extern void so_update_attrs(struct sonode *, int);
-extern int soconfig(int, int, int, char *, int);
-extern struct vnode
- *solookup(int, int, int, char *, int *);
+extern int soconfig(int, int, int, char *, int, char *);
+extern int solookup(int, int, int, struct sockparams **);
extern void so_lock_single(struct sonode *);
extern void so_unlock_single(struct sonode *, int);
extern int so_lock_read(struct sonode *, int);
@@ -723,10 +835,6 @@ extern void *sogetoff(mblk_t *, t_uscalar_t, t_uscalar_t, uint_t);
extern void so_getopt_srcaddr(void *, t_uscalar_t,
void **, t_uscalar_t *);
extern int so_getopt_unix_close(void *, t_uscalar_t);
-extern int so_addr_verify(struct sonode *, const struct sockaddr *,
- socklen_t);
-extern int so_ux_addr_xlate(struct sonode *, struct sockaddr *,
- socklen_t, int, void **, socklen_t *);
extern void fdbuf_free(struct fdbuf *);
extern mblk_t *fdbuf_allocmsg(int, struct fdbuf *);
extern int fdbuf_create(void *, int, struct fdbuf **);
@@ -744,55 +852,13 @@ extern void soisdisconnected(struct sonode *, int);
extern void socantsendmore(struct sonode *);
extern void socantrcvmore(struct sonode *);
extern void soseterror(struct sonode *, int);
-extern int sogeterr(struct sonode *);
-extern int sogetrderr(vnode_t *, int, int *);
-extern int sogetwrerr(vnode_t *, int, int *);
-extern void so_unix_close(struct sonode *);
-extern mblk_t *soallocproto(size_t, int);
-extern mblk_t *soallocproto1(const void *, ssize_t, ssize_t, int);
-extern void soappendmsg(mblk_t *, const void *, ssize_t);
-extern mblk_t *soallocproto2(const void *, ssize_t, const void *, ssize_t,
- ssize_t, int);
-extern mblk_t *soallocproto3(const void *, ssize_t, const void *, ssize_t,
- const void *, ssize_t, ssize_t, int);
-extern int sowaitprim(struct sonode *, t_scalar_t, t_scalar_t,
- t_uscalar_t, mblk_t **, clock_t);
-extern int sowaitokack(struct sonode *, t_scalar_t);
-extern int sowaitack(struct sonode *, mblk_t **, clock_t);
-extern void soqueueack(struct sonode *, mblk_t *);
-extern int sowaitconnind(struct sonode *, int, mblk_t **);
-extern void soqueueconnind(struct sonode *, mblk_t *);
-extern int soflushconnind(struct sonode *, t_scalar_t);
-extern void so_drain_discon_ind(struct sonode *);
-extern void so_flush_discon_ind(struct sonode *);
+extern int sogeterr(struct sonode *, boolean_t);
extern int sowaitconnected(struct sonode *, int, int);
-extern int sostream_direct(struct sonode *, struct uio *,
- mblk_t *, cred_t *);
-extern int sosend_dgram(struct sonode *, struct sockaddr *,
- socklen_t, struct uio *, int);
-extern int sosend_svc(struct sonode *, struct uio *, t_scalar_t, int, int);
-extern void so_installhooks(struct sonode *);
-extern int so_strinit(struct sonode *, struct sonode *);
-extern int sotpi_recvmsg(struct sonode *, struct nmsghdr *,
- struct uio *);
-extern int sotpi_getpeername(struct sonode *);
-extern int sotpi_getsockopt(struct sonode *, int, int, void *,
- socklen_t *, int);
-extern int sotpi_setsockopt(struct sonode *, int, int, const void *,
- socklen_t);
-extern int socktpi_ioctl(struct vnode *, int, intptr_t, int,
- struct cred *, int *, caller_context_t *);
-extern int sodisconnect(struct sonode *, t_scalar_t, int);
extern ssize_t soreadfile(file_t *, uchar_t *, u_offset_t, int *, size_t);
-extern int so_set_asyncsigs(vnode_t *, pid_t, int, int, cred_t *);
-extern int so_set_events(struct sonode *, vnode_t *, cred_t *);
-extern int so_flip_async(struct sonode *, vnode_t *, int, cred_t *);
-extern int so_set_siggrp(struct sonode *, vnode_t *, pid_t, int, cred_t *);
extern void *sock_kstat_init(zoneid_t);
extern void sock_kstat_fini(zoneid_t, void *);
extern struct sonode *getsonode(int, int *, file_t **);
-
/*
* Function wrappers (mostly around the sonode switch) for
* backward compatibility.
@@ -805,44 +871,18 @@ extern int soconnect(struct sonode *, const struct sockaddr *, socklen_t,
int, int);
extern int sorecvmsg(struct sonode *, struct nmsghdr *, struct uio *);
extern int sosendmsg(struct sonode *, struct nmsghdr *, struct uio *);
-extern int sogetpeername(struct sonode *);
-extern int sogetsockname(struct sonode *);
extern int soshutdown(struct sonode *, int);
extern int sogetsockopt(struct sonode *, int, int, void *, socklen_t *,
int);
extern int sosetsockopt(struct sonode *, int, int, const void *,
t_uscalar_t);
-extern struct sonode *socreate(vnode_t *, int, int, int, int,
- struct sonode *, int *);
+extern struct sonode *socreate(struct sockparams *, int, int, int, int,
+ int *);
extern int so_copyin(const void *, void *, size_t, int);
extern int so_copyout(const void *, void *, size_t, int);
-extern int socktpi_access(struct vnode *, int, int, struct cred *,
- caller_context_t *);
-extern int socktpi_fid(struct vnode *, struct fid *, caller_context_t *);
-extern int socktpi_fsync(struct vnode *, int, struct cred *,
- caller_context_t *);
-extern int socktpi_getattr(struct vnode *, struct vattr *, int,
- struct cred *, caller_context_t *);
-extern int socktpi_seek(struct vnode *, offset_t, offset_t *,
- caller_context_t *);
-extern int socktpi_setattr(struct vnode *, struct vattr *, int,
- struct cred *, caller_context_t *);
-extern int socktpi_setfl(vnode_t *, int, int, cred_t *,
- caller_context_t *);
-
-/* SCTP sockfs */
-extern struct sonode *sosctp_create(vnode_t *, int, int, int, int,
- struct sonode *, int *);
-extern int sosctp_init(void);
-
-/* SDP sockfs */
-extern struct sonode *sosdp_create(vnode_t *, int, int, int, int,
- struct sonode *, int *);
-extern int sosdp_init(void);
-
#endif
/*
@@ -865,9 +905,11 @@ struct sockinfo {
uint16_t si_faddr_family;
char si_laddr_sun_path[MAXPATHLEN + 1]; /* NULL terminated */
char si_faddr_sun_path[MAXPATHLEN + 1];
+ boolean_t si_faddr_noxlate;
zoneid_t si_szoneid;
};
+#define SOCKMOD_PATH "socketmod" /* dir where sockmods are stored */
#ifdef __cplusplus
}
diff --git a/usr/src/uts/common/sys/sockio.h b/usr/src/uts/common/sys/sockio.h
index 012e7f3061..9e107ff3ef 100644
--- a/usr/src/uts/common/sys/sockio.h
+++ b/usr/src/uts/common/sys/sockio.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -39,8 +39,6 @@
#ifndef _SYS_SOCKIO_H
#define _SYS_SOCKIO_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* General socket ioctl definitions.
*/
@@ -316,7 +314,9 @@ extern "C" {
#define SIOCSIPMPFAILBACK _IOW('i', 182, int) /* enable/disable */
/* FAILBACK */
-#define SIOCSENABLESDP _IOWR('i', 183, int) /* Enable SDP */
+#define SIOCSENABLESDP _IOWR('i', 183, int) /* Enable SDP */
+
+#define SIOCSQPTR _IOWR('i', 184, int) /* set q_ptr of stream */
#ifdef __cplusplus
}
diff --git a/usr/src/uts/common/sys/sodirect.h b/usr/src/uts/common/sys/sodirect.h
index c8acfcea44..f87d010f56 100644
--- a/usr/src/uts/common/sys/sodirect.h
+++ b/usr/src/uts/common/sys/sodirect.h
@@ -52,12 +52,15 @@
extern "C" {
#endif
+typedef int (*sod_enq_func)();
+typedef void (*sod_wakeup_func)();
+
typedef struct sodirect_s {
uint32_t sod_state; /* State bits */
uint32_t sod_want; /* Pending read byte count or 0 */
queue_t *sod_q; /* Socket Q */
- int (*sod_enqueue)(); /* Call to enqueue an mblk_t */
- void (*sod_wakeup)(); /* Call to awkake a read()er, if any */
+ sod_enq_func sod_enqueue; /* Call to enqueue an mblk_t */
+ sod_wakeup_func sod_wakeup; /* Call to awaken a read()er, if any */
mblk_t *sod_uioafh; /* To be freed list head, or NULL */
mblk_t *sod_uioaft; /* To be freed list tail */
kmutex_t *sod_lockp; /* Pointer to the lock needed */
@@ -107,10 +110,36 @@ typedef struct sodirect_s {
#define SOD_QFULL(p) ((p)->sod_q->q_flag & QFULL)
#define SOD_QCNT(p) ((p)->sod_q->q_count)
-#define SOD_DISABLE(p) (p)->sod_state &= ~SOD_ENABLED
+#define SOD_DISABLE(p) { \
+ if ((p) != NULL) \
+ (p)->sod_state &= ~SOD_ENABLED; \
+}
#define SOD_QTOSODP(q) (q)->q_stream->sd_sodirect
+#define SOD_SOTOSODP(so) ((sonode_t *)so)->so_direct
+
+#define SOD_UIOAFINI(sodp) { \
+ if ((sodp) && (sodp)->sod_uioa.uioa_state & UIOA_ENABLED) { \
+ (sodp)->sod_uioa.uioa_state &= UIOA_CLR; \
+ (sodp)->sod_uioa.uioa_state |= UIOA_FINI; \
+ } \
+}
+
+struct sonode;
+struct sodirect_s;
+
+extern uio_t *sod_rcv_init(struct sonode *, int, struct uio **);
+extern int sod_rcv_done(struct sonode *, struct uio *, struct uio *);
+
+extern mblk_t *sod_uioa_mblk_init(struct sodirect_s *, mblk_t *, size_t);
+extern void sod_uioa_so_init(struct sonode *, struct sodirect_s *,
+ struct uio *);
+extern ssize_t sod_uioa_mblk(struct sonode *, mblk_t *);
+extern void sod_uioa_mblk_done(struct sodirect_s *, mblk_t *);
+extern void sod_init();
+extern void sod_sock_init(struct sonode *, struct stdata *, sod_enq_func,
+ sod_wakeup_func, kmutex_t *);
#ifdef __cplusplus
}
diff --git a/usr/src/uts/common/sys/squeue.h b/usr/src/uts/common/sys/squeue.h
index ec09b3a88b..e14ded203a 100644
--- a/usr/src/uts/common/sys/squeue.h
+++ b/usr/src/uts/common/sys/squeue.h
@@ -85,6 +85,9 @@ extern void squeue_enter(squeue_t *, mblk_t *, mblk_t *,
uint32_t, int, uint8_t);
extern uintptr_t *squeue_getprivate(squeue_t *, sqprivate_t);
+extern int squeue_synch_enter(squeue_t *, void *, uint8_t);
+extern void squeue_synch_exit(squeue_t *, void *);
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/common/sys/squeue_impl.h b/usr/src/uts/common/sys/squeue_impl.h
index 501377e53f..bd934cc0b3 100644
--- a/usr/src/uts/common/sys/squeue_impl.h
+++ b/usr/src/uts/common/sys/squeue_impl.h
@@ -102,6 +102,7 @@ struct squeue_s {
clock_t sq_curr_time; /* Current tick (lbolt) */
kcondvar_t sq_worker_cv; /* cond var. worker thread blocks on */
kcondvar_t sq_poll_cv; /* cond variable poll_thr waits on */
+ kcondvar_t sq_synch_cv; /* cond var. synch thread waits on */
kcondvar_t sq_ctrlop_done_cv; /* cond variable for ctrl ops */
clock_t sq_wait; /* lbolts to wait after a fill() */
timeout_id_t sq_tid; /* timer id of pending timeout() */
@@ -163,6 +164,7 @@ struct squeue_s {
#define SQS_POLL_RESTART_DONE 0x01000000
#define SQS_POLL_THR_QUIESCE 0x02000000
+#define SQS_PAUSE 0x04000000 /* The squeue has been paused */
#define SQS_WORKER_THR_CONTROL \
(SQS_POLL_QUIESCE | SQS_POLL_RESTART | SQS_POLL_CLEANUP)
diff --git a/usr/src/uts/common/sys/stream.h b/usr/src/uts/common/sys/stream.h
index 41097cab7f..8d1ac458df 100644
--- a/usr/src/uts/common/sys/stream.h
+++ b/usr/src/uts/common/sys/stream.h
@@ -425,6 +425,7 @@ typedef struct bcache {
#define MSGMARKNEXT 0x10 /* Private: first byte of next msg marked */
#define MSGNOTMARKNEXT 0x20 /* Private: ... not marked */
#define MSGHASREF 0x40 /* Private: message has reference to owner */
+#define MSGWAITSYNC 0x80 /* Private: waiting for sync squeue enter */
/*
* Streams message types.
diff --git a/usr/src/uts/common/sys/strsubr.h b/usr/src/uts/common/sys/strsubr.h
index 04c778feaa..33ec38cac5 100644
--- a/usr/src/uts/common/sys/strsubr.h
+++ b/usr/src/uts/common/sys/strsubr.h
@@ -1126,7 +1126,6 @@ extern void strclean(struct vnode *);
extern void str_cn_clean(); /* XXX hook for consoles signal cleanup */
extern int strwrite(struct vnode *, struct uio *, cred_t *);
extern int strwrite_common(struct vnode *, struct uio *, cred_t *, int);
-extern int kstrwritemp(struct vnode *, mblk_t *, ushort_t);
extern int strread(struct vnode *, struct uio *, cred_t *);
extern int strioctl(struct vnode *, int, intptr_t, int, int, cred_t *, int *);
extern int strrput(queue_t *, mblk_t *);
@@ -1151,6 +1150,7 @@ extern int strcopyout(void *, void *, size_t, int);
extern void strsignal(struct stdata *, int, int32_t);
extern clock_t str_cv_wait(kcondvar_t *, kmutex_t *, clock_t, int);
extern void disable_svc(queue_t *);
+extern void enable_svc(queue_t *);
extern void remove_runlist(queue_t *);
extern void wait_svc(queue_t *);
extern void backenable(queue_t *, uchar_t);
@@ -1212,6 +1212,7 @@ extern mblk_t *allocb_cred_wait(size_t, uint_t, int *, cred_t *);
extern mblk_t *allocb_tmpl(size_t, const mblk_t *);
extern mblk_t *allocb_tryhard(size_t);
extern void mblk_setcred(mblk_t *, cred_t *);
+extern void msg_setcredpid(mblk_t *, cred_t *, pid_t);
extern void strpollwakeup(vnode_t *, short);
extern int putnextctl_wait(queue_t *, int);
diff --git a/usr/src/uts/common/syscall/sendfile.c b/usr/src/uts/common/syscall/sendfile.c
index 16ada25629..13b480a304 100644
--- a/usr/src/uts/common/syscall/sendfile.c
+++ b/usr/src/uts/common/syscall/sendfile.c
@@ -53,6 +53,8 @@
#include <sys/socket.h>
#include <sys/socketvar.h>
+#include <fs/sockfs/sockcommon.h>
+#include <fs/sockfs/socktpi.h>
#include <netinet/in.h>
#include <sys/sendfile.h>
@@ -71,103 +73,11 @@ extern int nl7c_sendfilev(struct sonode *, u_offset_t *, struct sendfilevec *,
int, ssize_t *);
extern int snf_segmap(file_t *, vnode_t *, u_offset_t, u_offset_t, ssize_t *,
boolean_t);
+extern sotpi_info_t *sotpi_sototpi(struct sonode *);
#define readflg (V_WRITELOCK_FALSE)
#define rwflag (V_WRITELOCK_TRUE)
-/*
- * kstrwritemp() has very similar semantics as that of strwrite().
- * The main difference is it obtains mblks from the caller and also
- * does not do any copy as done in strwrite() from user buffers to
- * kernel buffers.
- *
- * Currently, this routine is used by sendfile to send data allocated
- * within the kernel without any copying. This interface does not use the
- * synchronous stream interface as synch. stream interface implies
- * copying.
- */
-int
-kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
-{
- struct stdata *stp;
- struct queue *wqp;
- mblk_t *newmp;
- char waitflag;
- int tempmode;
- int error = 0;
- int done = 0;
- struct sonode *so;
- boolean_t direct;
-
- ASSERT(vp->v_stream);
- stp = vp->v_stream;
-
- so = VTOSO(vp);
- direct = (so->so_state & SS_DIRECT);
-
- /*
- * This is the sockfs direct fast path. canputnext() need
- * not be accurate so we don't grab the sd_lock here. If
- * we get flow-controlled, we grab sd_lock just before the
- * do..while loop below to emulate what strwrite() does.
- */
- wqp = stp->sd_wrq;
- if (canputnext(wqp) && direct &&
- !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
- return (sostream_direct(so, NULL, mp, CRED()));
- } else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
- /* Fast check of flags before acquiring the lock */
- mutex_enter(&stp->sd_lock);
- error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
- mutex_exit(&stp->sd_lock);
- if (error != 0) {
- if (!(stp->sd_flag & STPLEX) &&
- (stp->sd_wput_opt & SW_SIGPIPE)) {
- tsignal(curthread, SIGPIPE);
- error = EPIPE;
- }
- return (error);
- }
- }
-
- waitflag = WRITEWAIT;
- if (stp->sd_flag & OLDNDELAY)
- tempmode = fmode & ~FNDELAY;
- else
- tempmode = fmode;
-
- mutex_enter(&stp->sd_lock);
- do {
- if (canputnext(wqp)) {
- mutex_exit(&stp->sd_lock);
- if (stp->sd_wputdatafunc != NULL) {
- newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
- NULL, NULL, NULL);
- if (newmp == NULL) {
- /* The caller will free mp */
- return (ECOMM);
- }
- mp = newmp;
- }
- putnext(wqp, mp);
- return (0);
- }
- error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
- &done);
- } while (error == 0 && !done);
-
- mutex_exit(&stp->sd_lock);
- /*
- * EAGAIN tells the application to try again. ENOMEM
- * is returned only if the memory allocation size
- * exceeds the physical limits of the system. ENOMEM
- * can't be true here.
- */
- if (error == ENOMEM)
- error = EAGAIN;
- return (error);
-}
-
#define SEND_MAX_CHUNK 16
#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
@@ -510,6 +420,7 @@ sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
size_t size = total_size;
size_t extra;
int tail_len;
+ struct nmsghdr msg;
fflag = fp->f_flag;
vp = fp->f_vnode;
@@ -521,8 +432,17 @@ sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
if (total_size == 0)
return (0);
- wroff = (int)vp->v_stream->sd_wroff;
- tail_len = (int)vp->v_stream->sd_tail;
+ if (vp->v_stream != NULL) {
+ wroff = (int)vp->v_stream->sd_wroff;
+ tail_len = (int)vp->v_stream->sd_tail;
+ } else {
+ struct sonode *so;
+
+ so = VTOSO(vp);
+ wroff = so->so_proto_props.sopp_wroff;
+ tail_len = so->so_proto_props.sopp_tail;
+ }
+
extra = wroff + tail_len;
buf_left = MIN(total_size, maxblk);
@@ -530,6 +450,7 @@ sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
if (head == NULL)
return (ENOMEM);
head->b_wptr = head->b_rptr = head->b_rptr + wroff;
+ bzero(&msg, sizeof (msg));
auio.uio_extflg = UIO_COPY_DEFAULT;
for (i = 0; i < copy_cnt; i++) {
@@ -738,9 +659,10 @@ sendvec_small_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
}
ASSERT(total_size == 0);
- error = kstrwritemp(vp, head, fflag);
+ error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &head);
if (error != 0) {
- freemsg(head);
+ if (head != NULL)
+ freemsg(head);
return (error);
}
ttolwp(curthread)->lwp_ru.ioch += (ulong_t)size;
@@ -776,19 +698,28 @@ sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
int maxblk, wroff, tail_len;
struct sonode *so;
stdata_t *stp;
+ struct nmsghdr msg;
fflag = fp->f_flag;
vp = fp->f_vnode;
if (vp->v_type == VSOCK) {
so = VTOSO(vp);
- stp = vp->v_stream;
- wroff = (int)stp->sd_wroff;
- tail_len = (int)stp->sd_tail;
- maxblk = (int)stp->sd_maxblk;
+ if (vp->v_stream != NULL) {
+ stp = vp->v_stream;
+ wroff = (int)stp->sd_wroff;
+ tail_len = (int)stp->sd_tail;
+ maxblk = (int)stp->sd_maxblk;
+ } else {
+ stp = NULL;
+ wroff = so->so_proto_props.sopp_wroff;
+ tail_len = so->so_proto_props.sopp_tail;
+ maxblk = so->so_proto_props.sopp_maxblk;
+ }
extra = wroff + tail_len;
}
+ bzero(&msg, sizeof (msg));
auio.uio_extflg = UIO_COPY_DEFAULT;
for (i = 0; i < copy_cnt; i++) {
if (ISSIG(curthread, JUSTLOOKING))
@@ -841,7 +772,8 @@ sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
size_t iov_len;
iov_len = sfv_len;
- if (so->so_kssl_ctx != NULL)
+ if (!SOCK_IS_NONSTR(so) &&
+ SOTOTPI(so)->sti_kssl_ctx != NULL)
iov_len = MIN(iov_len, maxblk);
aiov.iov_len = iov_len;
@@ -868,9 +800,12 @@ sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
return (error);
}
dmp->b_wptr += iov_len;
- error = kstrwritemp(vp, dmp, fflag);
+ error = socket_sendmblk(VTOSO(vp),
+ &msg, fflag, CRED(), &dmp);
+
if (error != 0) {
- freeb(dmp);
+ if (dmp != NULL)
+ freeb(dmp);
return (error);
}
ttolwp(curthread)->lwp_ru.ioch +=
@@ -880,6 +815,9 @@ sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
sfv_off += iov_len;
}
} else {
+ ttolwp(curthread)->lwp_ru.ioch +=
+ (ulong_t)sfv_len;
+ *count += sfv_len;
aiov.iov_len = sfv_len;
aiov.iov_base = (caddr_t)(uintptr_t)sfv_off;
@@ -971,25 +909,30 @@ sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
return (ENOMEM);
}
} else {
+ uint_t copyflag;
+
+ copyflag = stp != NULL ? stp->sd_copyflag :
+ so->so_proto_props.sopp_zcopyflag;
/*
* For sockets acting as an SSL proxy, we
* need to adjust the size to the maximum
* SSL record size set in the stream head.
*/
- if (so->so_kssl_ctx != NULL)
+ if (!SOCK_IS_NONSTR(so) &&
+ _SOTOTPI(so)->sti_kssl_ctx != NULL)
size = MIN(size, maxblk);
if (vn_has_flocks(readvp) ||
readvp->v_flag & VNOMAP ||
- stp->sd_copyflag & STZCVMUNSAFE) {
+ copyflag & STZCVMUNSAFE) {
segmapit = 0;
- } else if (stp->sd_copyflag & STZCVMSAFE) {
+ } else if (copyflag & STZCVMSAFE) {
segmapit = 1;
} else {
int on = 1;
- if (SOP_SETSOCKOPT(VTOSO(vp),
+ if (socket_setsockopt(VTOSO(vp),
SOL_SOCKET, SO_SND_COPYAVOID,
- &on, sizeof (on)) == 0)
+ &on, sizeof (on), CRED()) == 0)
segmapit = 1;
}
}
@@ -1085,9 +1028,12 @@ sendvec_chunk(file_t *fp, u_offset_t *fileoff, struct sendfilevec *sfv,
if (vp->v_type == VSOCK) {
dmp->b_wptr = dmp->b_rptr + cnt;
- error = kstrwritemp(vp, dmp, fflag);
+ error = socket_sendmblk(VTOSO(vp),
+ &msg, fflag, CRED(), &dmp);
+
if (error != 0) {
- freeb(dmp);
+ if (dmp != NULL)
+ freeb(dmp);
VOP_RWUNLOCK(readvp, readflg,
NULL);
releasef(sfv->sfv_fd);
@@ -1186,45 +1132,11 @@ sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt,
switch (vp->v_type) {
case VSOCK:
so = VTOSO(vp);
- /* sendfile not supported for SCTP */
- if (so->so_protocol == IPPROTO_SCTP) {
- error = EPROTONOSUPPORT;
- goto err;
- }
is_sock = B_TRUE;
- switch (so->so_family) {
- case AF_INET:
- case AF_INET6:
- /*
- * Make similar checks done in SOP_WRITE().
- */
- if (so->so_state & SS_CANTSENDMORE) {
- tsignal(curthread, SIGPIPE);
- error = EPIPE;
- goto err;
- }
- if (so->so_type != SOCK_STREAM) {
- error = EOPNOTSUPP;
- goto err;
- }
-
- if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
- (SS_ISCONNECTED|SS_ISBOUND)) {
- error = ENOTCONN;
- goto err;
- }
-
- if ((so->so_state & SS_DIRECT) &&
- (so->so_priv != NULL) &&
- (so->so_kssl_ctx == NULL)) {
- maxblk = ((tcp_t *)so->so_priv)->tcp_mss;
- } else {
- maxblk = (int)vp->v_stream->sd_maxblk;
- }
- break;
- default:
- error = EAFNOSUPPORT;
- goto err;
+ if (SOCK_IS_NONSTR(so)) {
+ maxblk = so->so_proto_props.sopp_maxblk;
+ } else {
+ maxblk = (int)vp->v_stream->sd_maxblk;
}
break;
case VREG:
@@ -1361,21 +1273,18 @@ sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt,
* senfilev() function to consume the sfv[].
*/
if (is_sock) {
- switch (so->so_family) {
- case AF_INET:
- case AF_INET6:
- if (so->so_nl7c_flags != 0)
- error = nl7c_sendfilev(so, &fileoff,
- sfv, copy_cnt, &count);
- else if ((total_size <= (4 * maxblk)) &&
- error == 0)
- error = sendvec_small_chunk(fp,
- &fileoff, sfv, copy_cnt,
- total_size, maxblk, &count);
- else
- error = sendvec_chunk(fp, &fileoff,
- sfv, copy_cnt, &count);
- break;
+ if (!SOCK_IS_NONSTR(so) &&
+ _SOTOTPI(so)->sti_nl7c_flags != 0) {
+ error = nl7c_sendfilev(so, &fileoff,
+ sfv, copy_cnt, &count);
+ } else if ((total_size <= (4 * maxblk)) &&
+ error == 0) {
+ error = sendvec_small_chunk(fp,
+ &fileoff, sfv, copy_cnt,
+ total_size, maxblk, &count);
+ } else {
+ error = sendvec_chunk(fp, &fileoff,
+ sfv, copy_cnt, &count);
}
} else {
ASSERT(vp->v_type == VREG);
diff --git a/usr/src/uts/intel/Makefile.intel.shared b/usr/src/uts/intel/Makefile.intel.shared
index 0eba71bc6f..62e23247bf 100644
--- a/usr/src/uts/intel/Makefile.intel.shared
+++ b/usr/src/uts/intel/Makefile.intel.shared
@@ -565,6 +565,7 @@ MISC_KMODS += kcf
MISC_KMODS += kgssapi
MISC_KMODS += kmech_dummy
MISC_KMODS += kmech_krb5
+MISC_KMODS += ksocket
MISC_KMODS += mac
MISC_KMODS += mixer
MISC_KMODS += net80211
@@ -685,6 +686,12 @@ MAC_KMODS += mac_ib
DEVNAME_KMODS += sdev_nsconfig_mod
#
+# socketmod modules (/kernel/socketmod):
+#
+SOCKET_KMODS += socksctp
+SOCKET_KMODS += socksdp
+
+#
# kiconv modules (/kernel/kiconv):
#
KICONV_KMODS += kiconv_emea kiconv_ja kiconv_ko kiconv_sc kiconv_tc
diff --git a/usr/src/uts/intel/ia32/ml/modstubs.s b/usr/src/uts/intel/ia32/ml/modstubs.s
index e29afc6c29..0569b9e394 100644
--- a/usr/src/uts/intel/ia32/ml/modstubs.s
+++ b/usr/src/uts/intel/ia32/ml/modstubs.s
@@ -497,7 +497,10 @@ fcnname/**/_info: \
NO_UNLOAD_STUB(sockfs, snf_segmap, nomod_einval);
NO_UNLOAD_STUB(sockfs, sock_getfasync, nomod_zero);
NO_UNLOAD_STUB(sockfs, nl7c_sendfilev, nomod_zero);
- NO_UNLOAD_STUB(sockfs, sostream_direct, nomod_zero);
+ NO_UNLOAD_STUB(sockfs, sotpi_sototpi, nomod_zero);
+ NO_UNLOAD_STUB(sockfs, socket_sendmblk, nomod_zero);
+ NO_UNLOAD_STUB(sockfs, socket_setsockopt, nomod_zero);
+ NO_UNLOAD_STUB(sockfs, sod_uioa_mblk_done, nomod_zero);
END_MODULE(sockfs);
#endif
@@ -1278,30 +1281,6 @@ fcnname/**/_info: \
#endif
/*
- * Stubs for SDP-IB driver.
- */
-#ifndef SDPIB_MODULE
- MODULE(sdpib,drv);
- STUB(sdpib, sdp_create, nomod_zero);
- STUB(sdpib, sdp_bind, nomod_einval);
- STUB(sdpib, sdp_listen, nomod_einval);
- STUB(sdpib, sdp_connect, nomod_einval);
- STUB(sdpib, sdp_recv, nomod_einval);
- STUB(sdpib, sdp_send, nomod_einval);
- STUB(sdpib, sdp_getpeername, nomod_einval);
- STUB(sdpib, sdp_getsockname, nomod_einval);
- STUB(sdpib, sdp_disconnect, nomod_einval);
- STUB(sdpib, sdp_shutdown, nomod_einval);
- STUB(sdpib, sdp_get_opt, nomod_einval);
- STUB(sdpib, sdp_set_opt, nomod_einval);
- STUB(sdpib, sdp_close, nomod_void);
- STUB(sdpib, sdp_polldata, nomod_zero);
- STUB(sdpib, sdp_ioctl, nomod_einval);
- END_MODULE(sdpib);
-#endif
-
-
-/*
* Stubs for kssl, the kernel SSL proxy
*/
#ifndef KSSL_MODULE
@@ -1348,6 +1327,35 @@ fcnname/**/_info: \
END_MODULE(iommulib);
#endif
+/*
+ * Stubs for kernel socket, for iscsi
+ */
+#ifndef KSOCKET_MODULE
+ MODULE(ksocket, misc);
+ NO_UNLOAD_STUB(ksocket, ksocket_setsockopt, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_getsockopt, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_getpeername, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_getsockname, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_socket, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_bind, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_listen, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_accept, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_connect, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_recv, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_recvfrom, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_recvmsg, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_send, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_sendto, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_sendmsg, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_ioctl, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_setcallbacks, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_hold, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_rele, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_shutdown, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_close, nomod_minus_one);
+ END_MODULE(ksocket);
+#endif
+
/ this is just a marker for the area of text that contains stubs
ENTRY_NP(stubs_end)
diff --git a/usr/src/uts/intel/icmp/Makefile b/usr/src/uts/intel/icmp/Makefile
index 25a104ffbb..259530f9dc 100644
--- a/usr/src/uts/intel/icmp/Makefile
+++ b/usr/src/uts/intel/icmp/Makefile
@@ -21,11 +21,9 @@
#
# uts/intel/icmp/Makefile
#
-# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
-#
# This makefile drives the production of the icmp IP driver
#
# intel implementation architecture dependent
@@ -43,7 +41,7 @@ MODULE = icmp
OBJECTS = $(ICMP_OBJS:%=$(OBJS_DIR)/%)
LINTS = $(ICMP_OBJS:%.o=$(LINTS_DIR)/%.ln)
ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE)
-ROOTLINK = $(ROOT_STRMOD_DIR)/$(MODULE)
+ROOTLINK = $(ROOT_STRMOD_DIR)/$(MODULE) $(ROOT_SOCK_DIR)/$(MODULE)
CONF_SRCDIR = $(UTSBASE)/common/inet/ip
#
@@ -66,9 +64,9 @@ LINT_TARGET = $(MODULE).lint
INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOTLINK) $(ROOT_CONFFILE)
#
-# depends on ip
+# depends on ip and sockfs
#
-LDFLAGS += -dy -Ndrv/ip
+LDFLAGS += -dy -Ndrv/ip -Nfs/sockfs
#
# For now, disable these lint checks; maintainers should endeavor
@@ -100,7 +98,7 @@ clean.lint: $(CLEAN_LINT_DEPS)
install: $(INSTALL_DEPS) $(SISCHECK_DEPS)
-$(ROOTLINK): $(ROOT_STRMOD_DIR) $(ROOTMODULE)
+$(ROOTLINK): $(ROOT_STRMOD_DIR) $(ROOT_SOCK_DIR) $(ROOTMODULE)
-$(RM) $@; ln $(ROOTMODULE) $@
#
diff --git a/usr/src/uts/intel/icmp/icmp.global-objs.debug64 b/usr/src/uts/intel/icmp/icmp.global-objs.debug64
index ba041c7e17..eeeeedc77e 100644
--- a/usr/src/uts/intel/icmp/icmp.global-objs.debug64
+++ b/usr/src/uts/intel/icmp/icmp.global-objs.debug64
@@ -19,10 +19,9 @@
# CDDL HEADER END
#
#
-# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
cb_inet_devops
fsw
@@ -30,5 +29,8 @@ inet_dev_info
inet_devops
modldrv
modlinkage
+modlsockmod
modlstrmod
netdev_privs
+smodpriv
+smodreg
diff --git a/usr/src/uts/intel/idm/Makefile b/usr/src/uts/intel/idm/Makefile
index 463a8be02a..870fc039ed 100644
--- a/usr/src/uts/intel/idm/Makefile
+++ b/usr/src/uts/intel/idm/Makefile
@@ -60,7 +60,7 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
#
DEBUG_FLGS =
DEBUG_DEFS += $(DEBUG_FLGS)
-LDFLAGS += -dy -Nfs/sockfs
+LDFLAGS += -dy -Nfs/sockfs -Nmisc/ksocket
#
# Default build targets.
diff --git a/usr/src/uts/intel/ip/ip.global-objs.debug64 b/usr/src/uts/intel/ip/ip.global-objs.debug64
index f4bcb8ab0c..2e501f8abc 100644
--- a/usr/src/uts/intel/ip/ip.global-objs.debug64
+++ b/usr/src/uts/intel/ip/ip.global-objs.debug64
@@ -64,6 +64,7 @@ gcgrp4_hash
gcgrp6_hash
gcgrp_hash_size
gcgrp_lock
+icmp_fallback_sock_winit
icmp_frag_size_table
icmp_g_t_info_ack
icmp_ipha
@@ -104,6 +105,10 @@ ip_cgtp_filter_rev
ip_conn_cache
ip_debug
ip_g_all_ones
+ip_helper_stream_cache
+ip_helper_stream_info
+ip_helper_stream_rinit
+ip_helper_stream_winit
ip_ioctl_ftbl
ip_ire_cleanup_cnt
ip_ire_cpu_ratio
@@ -140,6 +145,7 @@ ip_squeue_worker_wait
ip_thread_data
ip_thread_list
ip_thread_rwlock
+ip_use_helper_cache
ip_wput_frag_mdt_min
ipcl_bind_fanout_size
ipcl_conn_hash_maxsize
@@ -251,6 +257,10 @@ sendq_loop_cnt
sin6_null
sin_null
skip_sctp_cksum
+sock_tcp_downcalls
+sock_rts_downcalls
+sock_rawip_downcalls
+sock_udp_downcalls
sqset_global_list
sqset_global_size
sqset_lock
@@ -264,6 +274,7 @@ tcp_acceptor_winit
tcp_conn_cache
tcp_conn_hash_size
tcp_drop_ack_unsent_cnt
+tcp_fallback_sock_winit
tcp_free_list_max_cnt
tcp_fusion_rcv_unread_min
tcp_g_kstat
@@ -303,10 +314,12 @@ tcp_winit
tcp_outbound_squeue_switch
tcpinfov4
tcpinfov6
+tli_errs
tsol_strict_error
tun_spd_hashsize
udp_bind_fanout_size
udp_conn_cache
+udp_fallback_sock_winit
udp_g_t_info_ack_ipv4
udp_g_t_info_ack_ipv6
udp_lrinit
diff --git a/usr/src/uts/intel/ip/ip.global-objs.obj64 b/usr/src/uts/intel/ip/ip.global-objs.obj64
index 3866432363..b773f8a5e0 100644
--- a/usr/src/uts/intel/ip/ip.global-objs.obj64
+++ b/usr/src/uts/intel/ip/ip.global-objs.obj64
@@ -64,6 +64,7 @@ gcgrp4_hash
gcgrp6_hash
gcgrp_hash_size
gcgrp_lock
+icmp_fallback_sock_winit
icmp_frag_size_table
icmp_g_t_info_ack
icmp_ipha
@@ -104,6 +105,10 @@ ip_cgtp_filter_rev
ip_conn_cache
ip_debug
ip_g_all_ones
+ip_helper_stream_cache
+ip_helper_stream_info
+ip_helper_stream_rinit
+ip_helper_stream_winit
ip_ioctl_ftbl
ip_ire_cleanup_cnt
ip_ire_cpu_ratio
@@ -140,6 +145,7 @@ ip_squeue_worker_wait
ip_thread_data
ip_thread_list
ip_thread_rwlock
+ip_use_helper_cache
ip_wput_frag_mdt_min
ipcl_bind_fanout_size
ipcl_conn_hash_maxsize
@@ -243,6 +249,10 @@ sctprinit
sctpwinit
sin6_null
sin_null
+sock_tcp_downcalls
+sock_rts_downcalls
+sock_rawip_downcalls
+sock_udp_downcalls
sqset_global_list
sqset_global_size
sqset_lock
@@ -256,6 +266,7 @@ tcp_acceptor_winit
tcp_conn_cache
tcp_conn_hash_size
tcp_drop_ack_unsent_cnt
+tcp_fallback_sock_winit
tcp_free_list_max_cnt
tcp_fusion_rcv_unread_min
tcp_g_kstat
@@ -295,10 +306,12 @@ tcp_winit
tcp_outbound_squeue_switch
tcpinfov4
tcpinfov6
+tli_errs
tsol_strict_error
tun_spd_hashsize
udp_bind_fanout_size
udp_conn_cache
+udp_fallback_sock_winit
udp_g_t_info_ack_ipv4
udp_g_t_info_ack_ipv6
udp_lrinit
diff --git a/usr/src/uts/intel/iscsi/Makefile b/usr/src/uts/intel/iscsi/Makefile
index 480f9caffa..efff98b964 100644
--- a/usr/src/uts/intel/iscsi/Makefile
+++ b/usr/src/uts/intel/iscsi/Makefile
@@ -61,7 +61,7 @@ INC_PATH += -I$(UTSBASE)/common/io/scsi/adapters/iscsi
#
# Note dependancy on misc/scsi.
#
-LDFLAGS += -dy -N"misc/scsi" -N"fs/sockfs" -N"sys/doorfs" -Nmisc/md5
+LDFLAGS += -dy -N"misc/scsi" -N"fs/sockfs" -N"sys/doorfs" -Nmisc/md5 -Nmisc/ksocket
LINTFLAGS += -a -erroff=E_BAD_PTR_CAST_ALIGN -erroff=E_PTRDIFF_OVERFLOW
LINTFLAGS64 += -a -erroff=E_BAD_PTR_CAST_ALIGN -erroff=E_PTRDIFF_OVERFLOW
diff --git a/usr/src/uts/intel/iscsit/Makefile b/usr/src/uts/intel/iscsit/Makefile
index 1df1235747..7ecd8be223 100644
--- a/usr/src/uts/intel/iscsit/Makefile
+++ b/usr/src/uts/intel/iscsit/Makefile
@@ -59,7 +59,7 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE)
# Overrides and depends_on
#
MODSTUBS_DIR = $(OBJS_DIR)
-LDFLAGS += -dy -Ndrv/stmf -Nmisc/idm -Nfs/sockfs -Nmisc/md5
+LDFLAGS += -dy -Ndrv/stmf -Nmisc/idm -Nfs/sockfs -Nmisc/md5 -Nmisc/ksocket
INC_PATH += -I$(UTSBASE)/common/io/comstar/port/iscsit
diff --git a/usr/src/uts/intel/ksocket/Makefile b/usr/src/uts/intel/ksocket/Makefile
new file mode 100644
index 0000000000..288c777b46
--- /dev/null
+++ b/usr/src/uts/intel/ksocket/Makefile
@@ -0,0 +1,84 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+#
+# This makefile drives the production of the kernel socket module
+#
+
+#
+# Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE = ../..
+
+#
+# Define the module and object file sets.
+#
+MODULE = ksocket
+OBJECTS = $(KSOCKET_OBJS:%=$(OBJS_DIR)/%)
+LINTS = $(KSOCKET_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE = $(ROOT_MISC_DIR)/$(MODULE)
+
+#
+# Include common rules.
+#
+include $(UTSBASE)/intel/Makefile.intel
+
+#
+# Define targets
+#
+ALL_TARGET = $(BINARY)
+LINT_TARGET = $(MODULE).lint
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
+
+#
+# Overrides.
+#
+CFLAGS += $(CCVERBOSE)
+LDFLAGS += -dy -Nfs/sockfs
+
+#
+# Default build targets.
+#
+.KEEP_STATE:
+
+def: $(DEF_DEPS)
+
+all: $(ALL_DEPS)
+
+clean: $(CLEAN_DEPS)
+
+clobber: $(CLOBBER_DEPS)
+
+lint: $(LINT_DEPS)
+
+modlintlib: $(MODLINTLIB_DEPS)
+
+clean.lint: $(CLEAN_LINT_DEPS)
+
+install: $(INSTALL_DEPS)
+
+#
+# Include common targets.
+#
+include $(UTSBASE)/intel/Makefile.targ
diff --git a/usr/src/uts/intel/rts/Makefile b/usr/src/uts/intel/rts/Makefile
index 2247001290..8e8ec349a5 100644
--- a/usr/src/uts/intel/rts/Makefile
+++ b/usr/src/uts/intel/rts/Makefile
@@ -21,11 +21,9 @@
#
# uts/intel/rts/Makefile
#
-# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
-#
# This makefile drives the production of the rts IP driver
#
# intel implementation architecture dependent
@@ -43,6 +41,7 @@ MODULE = rts
OBJECTS = $(RTS_OBJS:%=$(OBJS_DIR)/%)
LINTS = $(RTS_OBJS:%.o=$(LINTS_DIR)/%.ln)
ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE)
+ROOTLINK = $(ROOT_SOCK_DIR)/$(MODULE)
CONF_SRCDIR = $(UTSBASE)/common/inet/ip
#
@@ -65,9 +64,9 @@ LINT_TARGET = $(MODULE).lint
INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOTLINK) $(ROOT_CONFFILE)
#
-# depends on ip
+# depends on ip and sockfs
#
-LDFLAGS += -dy -Ndrv/ip
+LDFLAGS += -dy -Ndrv/ip -Nfs/sockfs
#
# For now, disable these lint checks; maintainers should endeavor
@@ -99,7 +98,7 @@ clean.lint: $(CLEAN_LINT_DEPS)
install: $(INSTALL_DEPS) $(SISCHECK_DEPS)
-$(ROOTLINK): $(ROOT_STRMOD_DIR) $(ROOTMODULE)
+$(ROOTLINK): $(ROOT_SOCK_DIR) $(ROOTMODULE)
-$(RM) $@; ln $(ROOTMODULE) $@
#
diff --git a/usr/src/uts/intel/rts/rts.global-objs.debug64 b/usr/src/uts/intel/rts/rts.global-objs.debug64
index 4c699f6410..75b422acf6 100644
--- a/usr/src/uts/intel/rts/rts.global-objs.debug64
+++ b/usr/src/uts/intel/rts/rts.global-objs.debug64
@@ -19,14 +19,15 @@
# CDDL HEADER END
#
#
-# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
cb_inet_devops
inet_dev_info
inet_devops
modldrv
modlinkage
+modlsockmod
netdev_privs
+smodreg
diff --git a/usr/src/uts/intel/smbsrv/Makefile b/usr/src/uts/intel/smbsrv/Makefile
index f8482ba8ce..77ef7351ba 100644
--- a/usr/src/uts/intel/smbsrv/Makefile
+++ b/usr/src/uts/intel/smbsrv/Makefile
@@ -19,11 +19,9 @@
# CDDL HEADER END
#
#
-# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-#ident "%Z%%M% %I% %E% SMI"
-#
# This makefile drives the production of the cifs server file system
# kernel module.
#
@@ -53,7 +51,8 @@ include $(UTSBASE)/intel/Makefile.intel
# Module dependencies
#
#
-LDFLAGS += -dy -Nfs/sockfs -Ndrv/ip -Nstrmod/rpcmod -Nsys/doorfs -Nmisc/kcf
+LDFLAGS += -dy -Nfs/sockfs -Nmisc/ksocket -Ndrv/ip -Nstrmod/rpcmod -Nsys/doorfs
+LDFLAGS += -Nmisc/kcf
#
# Define targets
diff --git a/usr/src/uts/intel/socksctp/Makefile b/usr/src/uts/intel/socksctp/Makefile
new file mode 100644
index 0000000000..fa316464ad
--- /dev/null
+++ b/usr/src/uts/intel/socksctp/Makefile
@@ -0,0 +1,95 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# This makefile drives the production of the socksctp (SCTP socket)
+# kernel module.
+#
+# intel architecture dependent
+#
+
+#
+# Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE = ../..
+
+#
+# Define the module and object file sets.
+#
+MODULE = socksctp
+OBJECTS = $(SCTP_SOCK_MOD_OBJS:%=$(OBJS_DIR)/%)
+LINTS = $(SCTP_SOCK_MOD_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE = $(ROOT_SOCK_DIR)/$(MODULE)
+
+#
+# Include common rules.
+#
+include $(UTSBASE)/intel/Makefile.intel
+
+#
+# Define targets
+#
+ALL_TARGET = $(BINARY)
+LINT_TARGET = $(MODULE).lint
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
+
+#
+# lint pass one enforcement and OS version
+#
+CFLAGS += $(CCVERBOSE)
+
+LDFLAGS += -dy -Nfs/sockfs -Ndrv/ip
+
+#
+# For now, disable these lint checks; maintainers should endeavor
+# to investigate and remove these for maximum lint coverage.
+# Please do not carry these forward to new Makefiles.
+#
+LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN
+LINTTAGS += -erroff=E_PTRDIFF_OVERFLOW
+
+#
+# Default build targets.
+#
+.KEEP_STATE:
+
+def: $(DEF_DEPS)
+
+all: $(ALL_DEPS)
+
+clean: $(CLEAN_DEPS)
+
+clobber: $(CLOBBER_DEPS)
+
+lint: $(LINT_DEPS)
+
+modlintlib: $(MODLINTLIB_DEPS)
+
+clean.lint: $(CLEAN_LINT_DEPS)
+
+install: $(INSTALL_DEPS)
+
+#
+# Include common targets.
+#
+include $(UTSBASE)/intel/Makefile.targ
diff --git a/usr/src/uts/intel/socksdp/Makefile b/usr/src/uts/intel/socksdp/Makefile
new file mode 100644
index 0000000000..966b436fce
--- /dev/null
+++ b/usr/src/uts/intel/socksdp/Makefile
@@ -0,0 +1,87 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
+# This makefile drives the production of the socksdp (SDP socket)
+# kernel module.
+#
+# intel architecture dependent
+#
+
+#
+# Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE = ../..
+
+#
+# Define the module and object file sets.
+#
+MODULE = socksdp
+OBJECTS = $(SDP_SOCK_MOD_OBJS:%=$(OBJS_DIR)/%)
+LINTS = $(SDP_SOCK_MOD_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE = $(ROOT_SOCK_DIR)/$(MODULE)
+
+#
+# Include common rules.
+#
+include $(UTSBASE)/intel/Makefile.intel
+
+#
+# Define targets
+#
+ALL_TARGET = $(BINARY)
+LINT_TARGET = $(MODULE).lint
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
+
+#
+# lint pass one enforcement and OS version
+#
+CFLAGS += $(CCVERBOSE)
+
+LDFLAGS += -dy -Nfs/sockfs -Ndrv/ip -Ndrv/sdpib
+
+#
+# Default build targets.
+#
+.KEEP_STATE:
+
+def: $(DEF_DEPS)
+
+all: $(ALL_DEPS)
+
+clean: $(CLEAN_DEPS)
+
+clobber: $(CLOBBER_DEPS)
+
+lint: $(LINT_DEPS)
+
+modlintlib: $(MODLINTLIB_DEPS)
+
+clean.lint: $(CLEAN_LINT_DEPS)
+
+install: $(INSTALL_DEPS)
+
+#
+# Include common targets.
+#
+include $(UTSBASE)/intel/Makefile.targ
diff --git a/usr/src/uts/intel/tcp/Makefile b/usr/src/uts/intel/tcp/Makefile
index 5bd267f765..d083460646 100644
--- a/usr/src/uts/intel/tcp/Makefile
+++ b/usr/src/uts/intel/tcp/Makefile
@@ -2,9 +2,8 @@
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License"). You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
@@ -22,10 +21,9 @@
#
# uts/intel/tcp/Makefile
#
-# Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-#pragma ident "%Z%%M% %I% %E% SMI"
#
# This makefile drives the production of the tcp driver kernel module.
#
@@ -44,7 +42,7 @@ MODULE = tcp
OBJECTS = $(TCP_OBJS:%=$(OBJS_DIR)/%)
LINTS = $(TCP_OBJS:%.o=$(LINTS_DIR)/%.ln)
ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE)
-ROOTLINK = $(ROOT_STRMOD_DIR)/$(MODULE)
+ROOTLINK = $(ROOT_STRMOD_DIR)/$(MODULE) $(ROOT_SOCK_DIR)/$(MODULE)
CONF_SRCDIR = $(UTSBASE)/common/inet/tcp
#
@@ -75,9 +73,9 @@ CINLINES = -xinline=tcp_set_ws_value,tcp_fill_header
CFLAGS += $(CINLINES)
#
-# depends on ip and md5
+# depends on ip, md5 and sockfs
#
-LDFLAGS += -dy -Ndrv/ip -Ncrypto/md5
+LDFLAGS += -dy -Ndrv/ip -Ncrypto/md5 -Nfs/sockfs
#
# Default build targets.
@@ -100,7 +98,7 @@ clean.lint: $(CLEAN_LINT_DEPS)
install: $(INSTALL_DEPS)
-$(ROOTLINK): $(ROOT_STRMOD_DIR) $(ROOTMODULE)
+$(ROOTLINK): $(ROOT_STRMOD_DIR) $(ROOT_SOCK_DIR) $(ROOTMODULE)
-$(RM) $@; ln $(ROOTMODULE) $@
#
diff --git a/usr/src/uts/intel/udp/Makefile b/usr/src/uts/intel/udp/Makefile
index dad550d3cf..c6238ebd8c 100644
--- a/usr/src/uts/intel/udp/Makefile
+++ b/usr/src/uts/intel/udp/Makefile
@@ -2,9 +2,8 @@
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License"). You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
@@ -22,11 +21,9 @@
#
# uts/intel/udp/Makefile
#
-# Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-#pragma ident "%Z%%M% %I% %E% SMI"
-#
# This makefile drives the production of the udp driver kernel module.
#
# intel implementation architecture dependent
@@ -44,7 +41,7 @@ MODULE = udp
OBJECTS = $(UDP_OBJS:%=$(OBJS_DIR)/%)
LINTS = $(UDP_OBJS:%.o=$(LINTS_DIR)/%.ln)
ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE)
-ROOTLINK = $(ROOT_STRMOD_DIR)/$(MODULE)
+ROOTLINK = $(ROOT_STRMOD_DIR)/$(MODULE) $(ROOT_SOCK_DIR)/$(MODULE)
CONF_SRCDIR = $(UTSBASE)/common/inet/udp
#
@@ -67,9 +64,9 @@ LINT_TARGET = $(MODULE).lint
INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOTLINK) $(ROOT_CONFFILE)
#
-# depends on ip
+# depends on ip and sockfs
#
-LDFLAGS += -dy -Ndrv/ip
+LDFLAGS += -dy -Ndrv/ip -Nfs/sockfs
#
# Default build targets.
@@ -92,7 +89,7 @@ clean.lint: $(CLEAN_LINT_DEPS)
install: $(INSTALL_DEPS)
-$(ROOTLINK): $(ROOT_STRMOD_DIR) $(ROOTMODULE)
+$(ROOTLINK): $(ROOT_STRMOD_DIR) $(ROOT_SOCK_DIR) $(ROOTMODULE)
-$(RM) $@; ln $(ROOTMODULE) $@
#
diff --git a/usr/src/uts/sparc/Makefile.sparc.shared b/usr/src/uts/sparc/Makefile.sparc.shared
index 80a188f75a..061befa7e3 100644
--- a/usr/src/uts/sparc/Makefile.sparc.shared
+++ b/usr/src/uts/sparc/Makefile.sparc.shared
@@ -385,6 +385,7 @@ MISC_KMODS += s1394
MISC_KMODS += hpcsvc pcihp pciehpc pcishpc
MISC_KMODS += rsmops
MISC_KMODS += kcf
+MISC_KMODS += ksocket
MISC_KMODS += ibcm
MISC_KMODS += ibdm
MISC_KMODS += ibmf
@@ -486,6 +487,12 @@ MAC_KMODS += mac_ib
DEVNAME_KMODS += sdev_nsconfig_mod
#
+# socketmod (kernel/socketmod)
+#
+SOCKET_KMODS += socksctp
+SOCKET_KMODS += socksdp
+
+#
# kiconv modules (/kernel/kiconv):
#
KICONV_KMODS += kiconv_emea kiconv_ja kiconv_ko kiconv_sc kiconv_tc
diff --git a/usr/src/uts/sparc/icmp/Makefile b/usr/src/uts/sparc/icmp/Makefile
index 5fd067b116..55c11a1ea0 100644
--- a/usr/src/uts/sparc/icmp/Makefile
+++ b/usr/src/uts/sparc/icmp/Makefile
@@ -20,11 +20,9 @@
#
#
# uts/sparc/icmp/Makefile
-# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-#ident "%Z%%M% %I% %E% SMI"
-#
# This makefile drives the production of the icmp IP driver
#
# sparc architecture dependent
@@ -42,7 +40,7 @@ MODULE = icmp
OBJECTS = $(ICMP_OBJS:%=$(OBJS_DIR)/%)
LINTS = $(ICMP_OBJS:%.o=$(LINTS_DIR)/%.ln)
ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE)
-ROOTLINK = $(ROOT_STRMOD_DIR)/$(MODULE)
+ROOTLINK = $(ROOT_STRMOD_DIR)/$(MODULE) $(ROOT_SOCK_DIR)/$(MODULE)
CONF_SRCDIR = $(UTSBASE)/common/inet/ip
#
@@ -70,9 +68,9 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOTLINK) $(ROOT_CONFFILE)
CFLAGS += $(CCVERBOSE)
#
-# depends on ip
+# depends on ip and sockfs
#
-LDFLAGS += -dy -Ndrv/ip
+LDFLAGS += -dy -Ndrv/ip -Nfs/sockfs
#
# For now, disable these lint checks; maintainers should endeavor
@@ -104,7 +102,7 @@ clean.lint: $(CLEAN_LINT_DEPS)
install: $(INSTALL_DEPS) $(SISCHECK_DEPS)
-$(ROOTLINK): $(ROOT_STRMOD_DIR) $(ROOTMODULE)
+$(ROOTLINK): $(ROOT_STRMOD_DIR) $(ROOT_SOCK_DIR) $(ROOTMODULE)
-$(RM) $@; ln $(ROOTMODULE) $@
#
diff --git a/usr/src/uts/sparc/icmp/icmp.global-objs.debug64 b/usr/src/uts/sparc/icmp/icmp.global-objs.debug64
index ba041c7e17..eeeeedc77e 100644
--- a/usr/src/uts/sparc/icmp/icmp.global-objs.debug64
+++ b/usr/src/uts/sparc/icmp/icmp.global-objs.debug64
@@ -19,10 +19,9 @@
# CDDL HEADER END
#
#
-# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
cb_inet_devops
fsw
@@ -30,5 +29,8 @@ inet_dev_info
inet_devops
modldrv
modlinkage
+modlsockmod
modlstrmod
netdev_privs
+smodpriv
+smodreg
diff --git a/usr/src/uts/sparc/idm/Makefile b/usr/src/uts/sparc/idm/Makefile
index 6b03fb56df..27535cf198 100644
--- a/usr/src/uts/sparc/idm/Makefile
+++ b/usr/src/uts/sparc/idm/Makefile
@@ -58,7 +58,7 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
#
DEBUG_FLGS =
DEBUG_DEFS += $(DEBUG_FLGS)
-LDFLAGS += -dy -Nfs/sockfs
+LDFLAGS += -dy -Nfs/sockfs -Nmisc/ksocket
#
# Default build targets.
diff --git a/usr/src/uts/sparc/ip/ip.global-objs.debug64 b/usr/src/uts/sparc/ip/ip.global-objs.debug64
index f4bcb8ab0c..fabffbc5f5 100644
--- a/usr/src/uts/sparc/ip/ip.global-objs.debug64
+++ b/usr/src/uts/sparc/ip/ip.global-objs.debug64
@@ -64,6 +64,7 @@ gcgrp4_hash
gcgrp6_hash
gcgrp_hash_size
gcgrp_lock
+icmp_fallback_sock_winit
icmp_frag_size_table
icmp_g_t_info_ack
icmp_ipha
@@ -104,6 +105,10 @@ ip_cgtp_filter_rev
ip_conn_cache
ip_debug
ip_g_all_ones
+ip_helper_stream_cache
+ip_helper_stream_info
+ip_helper_stream_rinit
+ip_helper_stream_winit
ip_ioctl_ftbl
ip_ire_cleanup_cnt
ip_ire_cpu_ratio
@@ -140,6 +145,7 @@ ip_squeue_worker_wait
ip_thread_data
ip_thread_list
ip_thread_rwlock
+ip_use_helper_cache
ip_wput_frag_mdt_min
ipcl_bind_fanout_size
ipcl_conn_hash_maxsize
@@ -251,6 +257,10 @@ sendq_loop_cnt
sin6_null
sin_null
skip_sctp_cksum
+sock_rawip_downcalls
+sock_rts_downcalls
+sock_tcp_downcalls
+sock_udp_downcalls
sqset_global_list
sqset_global_size
sqset_lock
@@ -264,6 +274,7 @@ tcp_acceptor_winit
tcp_conn_cache
tcp_conn_hash_size
tcp_drop_ack_unsent_cnt
+tcp_fallback_sock_winit
tcp_free_list_max_cnt
tcp_fusion_rcv_unread_min
tcp_g_kstat
@@ -303,10 +314,12 @@ tcp_winit
tcp_outbound_squeue_switch
tcpinfov4
tcpinfov6
+tli_errs
tsol_strict_error
tun_spd_hashsize
udp_bind_fanout_size
udp_conn_cache
+udp_fallback_sock_winit
udp_g_t_info_ack_ipv4
udp_g_t_info_ack_ipv6
udp_lrinit
diff --git a/usr/src/uts/sparc/ip/ip.global-objs.obj64 b/usr/src/uts/sparc/ip/ip.global-objs.obj64
index 3866432363..c7fb907f8c 100644
--- a/usr/src/uts/sparc/ip/ip.global-objs.obj64
+++ b/usr/src/uts/sparc/ip/ip.global-objs.obj64
@@ -64,6 +64,7 @@ gcgrp4_hash
gcgrp6_hash
gcgrp_hash_size
gcgrp_lock
+icmp_fallback_sock_winit
icmp_frag_size_table
icmp_g_t_info_ack
icmp_ipha
@@ -104,6 +105,10 @@ ip_cgtp_filter_rev
ip_conn_cache
ip_debug
ip_g_all_ones
+ip_helper_stream_cache
+ip_helper_stream_info
+ip_helper_stream_rinit
+ip_helper_stream_winit
ip_ioctl_ftbl
ip_ire_cleanup_cnt
ip_ire_cpu_ratio
@@ -140,6 +145,7 @@ ip_squeue_worker_wait
ip_thread_data
ip_thread_list
ip_thread_rwlock
+ip_use_helper_cache
ip_wput_frag_mdt_min
ipcl_bind_fanout_size
ipcl_conn_hash_maxsize
@@ -243,6 +249,10 @@ sctprinit
sctpwinit
sin6_null
sin_null
+sock_rawip_downcalls
+sock_rts_downcalls
+sock_tcp_downcalls
+sock_udp_downcalls
sqset_global_list
sqset_global_size
sqset_lock
@@ -256,6 +266,7 @@ tcp_acceptor_winit
tcp_conn_cache
tcp_conn_hash_size
tcp_drop_ack_unsent_cnt
+tcp_fallback_sock_winit
tcp_free_list_max_cnt
tcp_fusion_rcv_unread_min
tcp_g_kstat
@@ -295,10 +306,12 @@ tcp_winit
tcp_outbound_squeue_switch
tcpinfov4
tcpinfov6
+tli_errs
tsol_strict_error
tun_spd_hashsize
udp_bind_fanout_size
udp_conn_cache
+udp_fallback_sock_winit
udp_g_t_info_ack_ipv4
udp_g_t_info_ack_ipv6
udp_lrinit
diff --git a/usr/src/uts/sparc/iscsi/Makefile b/usr/src/uts/sparc/iscsi/Makefile
index 0e35ba9d0d..437d9b5838 100644
--- a/usr/src/uts/sparc/iscsi/Makefile
+++ b/usr/src/uts/sparc/iscsi/Makefile
@@ -61,7 +61,7 @@ INC_PATH += -I$(UTSBASE)/common/io/scsi/adapters/iscsi
#
# Note dependancy on misc/scsi.
#
-LDFLAGS += -dy -N"misc/scsi" -N"fs/sockfs" -N"sys/doorfs" -N"misc/md5"
+LDFLAGS += -dy -N"misc/scsi" -N"fs/sockfs" -N"sys/doorfs" -N"misc/md5" -Nmisc/ksocket
LINTFLAGS += -a -erroff=E_BAD_PTR_CAST_ALIGN -erroff=E_PTRDIFF_OVERFLOW
LINTFLAGS64 += -a -erroff=E_BAD_PTR_CAST_ALIGN -erroff=E_PTRDIFF_OVERFLOW
diff --git a/usr/src/uts/sparc/iscsit/Makefile b/usr/src/uts/sparc/iscsit/Makefile
index 1df1235747..7ecd8be223 100644
--- a/usr/src/uts/sparc/iscsit/Makefile
+++ b/usr/src/uts/sparc/iscsit/Makefile
@@ -59,7 +59,7 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE)
# Overrides and depends_on
#
MODSTUBS_DIR = $(OBJS_DIR)
-LDFLAGS += -dy -Ndrv/stmf -Nmisc/idm -Nfs/sockfs -Nmisc/md5
+LDFLAGS += -dy -Ndrv/stmf -Nmisc/idm -Nfs/sockfs -Nmisc/md5 -Nmisc/ksocket
INC_PATH += -I$(UTSBASE)/common/io/comstar/port/iscsit
diff --git a/usr/src/uts/sparc/ksocket/Makefile b/usr/src/uts/sparc/ksocket/Makefile
new file mode 100644
index 0000000000..287a7cfda6
--- /dev/null
+++ b/usr/src/uts/sparc/ksocket/Makefile
@@ -0,0 +1,84 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+
+#
+# This makefile drives the production of the kernel socket module
+#
+
+#
+# Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE = ../..
+
+#
+# Define the module and object file sets.
+#
+MODULE = ksocket
+OBJECTS = $(KSOCKET_OBJS:%=$(OBJS_DIR)/%)
+LINTS = $(KSOCKET_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE = $(ROOT_MISC_DIR)/$(MODULE)
+
+#
+# Include common rules.
+#
+include $(UTSBASE)/sparc/Makefile.sparc
+
+#
+# Define targets
+#
+ALL_TARGET = $(BINARY)
+LINT_TARGET = $(MODULE).lint
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
+
+#
+# Overrides.
+#
+CFLAGS += $(CCVERBOSE)
+LDFLAGS += -dy -Nfs/sockfs
+
+#
+# Default build targets.
+#
+.KEEP_STATE:
+
+def: $(DEF_DEPS)
+
+all: $(ALL_DEPS)
+
+clean: $(CLEAN_DEPS)
+
+clobber: $(CLOBBER_DEPS)
+
+lint: $(LINT_DEPS)
+
+modlintlib: $(MODLINTLIB_DEPS)
+
+clean.lint: $(CLEAN_LINT_DEPS)
+
+install: $(INSTALL_DEPS)
+
+#
+# Include common targets (sparc flavor, pairing with Makefile.sparc above).
+#
+include $(UTSBASE)/sparc/Makefile.targ
diff --git a/usr/src/uts/sparc/ml/modstubs.s b/usr/src/uts/sparc/ml/modstubs.s
index e315c9857c..e3379799a7 100644
--- a/usr/src/uts/sparc/ml/modstubs.s
+++ b/usr/src/uts/sparc/ml/modstubs.s
@@ -385,7 +385,10 @@ stubs_base:
NO_UNLOAD_STUB(sockfs, snf_segmap, nomod_einval);
NO_UNLOAD_STUB(sockfs, sock_getfasync, nomod_zero);
NO_UNLOAD_STUB(sockfs, nl7c_sendfilev, nomod_zero);
- NO_UNLOAD_STUB(sockfs, sostream_direct, nomod_zero);
+ NO_UNLOAD_STUB(sockfs, sotpi_sototpi, nomod_zero);
+ NO_UNLOAD_STUB(sockfs, socket_sendmblk, nomod_zero);
+ NO_UNLOAD_STUB(sockfs, socket_setsockopt, nomod_zero);
+ NO_UNLOAD_STUB(sockfs, sod_uioa_mblk_done, nomod_zero);
END_MODULE(sockfs);
#endif
@@ -1232,27 +1235,6 @@ stubs_base:
END_MODULE(softmac);
#endif
-#ifndef SDPIB_MODULE
- MODULE(sdpib,drv);
- STUB(sdpib, sdp_create, nomod_zero);
- STUB(sdpib, sdp_bind, nomod_einval);
- STUB(sdpib, sdp_listen, nomod_einval);
- STUB(sdpib, sdp_connect, nomod_einval);
- STUB(sdpib, sdp_recv, nomod_einval);
- STUB(sdpib, sdp_send, nomod_einval);
- STUB(sdpib, sdp_getpeername, nomod_einval);
- STUB(sdpib, sdp_getsockname, nomod_einval);
- STUB(sdpib, sdp_disconnect, nomod_einval);
- STUB(sdpib, sdp_shutdown, nomod_einval);
- STUB(sdpib, sdp_get_opt, nomod_einval);
- STUB(sdpib, sdp_set_opt, nomod_einval);
- STUB(sdpib, sdp_close, nomod_void);
- STUB(sdpib, sdp_polldata, nomod_zero);
- STUB(sdpib, sdp_ioctl, nomod_einval);
- END_MODULE(sdpib);
-#endif
-
-
/*
* Stubs for kssl, the kernel SSL proxy
*/
@@ -1294,6 +1276,35 @@ stubs_base:
END_MODULE(ipnet);
#endif
+/*
+ * Stubs for kernel socket, for iscsi
+ */
+#ifndef KSOCKET_MODULE
+ MODULE(ksocket, misc);
+ NO_UNLOAD_STUB(ksocket, ksocket_setsockopt, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_getsockopt, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_getpeername, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_getsockname, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_socket, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_bind, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_listen, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_accept, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_connect, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_recv, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_recvfrom, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_recvmsg, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_send, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_sendto, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_sendmsg, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_ioctl, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_setcallbacks, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_hold, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_rele, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_shutdown, nomod_minus_one);
+ NO_UNLOAD_STUB(ksocket, ksocket_close, nomod_minus_one);
+ END_MODULE(ksocket);
+#endif
+
! this is just a marker for the area of text that contains stubs
.seg ".text"
.global stubs_end
diff --git a/usr/src/uts/sparc/rts/Makefile b/usr/src/uts/sparc/rts/Makefile
index ff635303bc..4078c24237 100644
--- a/usr/src/uts/sparc/rts/Makefile
+++ b/usr/src/uts/sparc/rts/Makefile
@@ -20,11 +20,9 @@
#
#
# uts/sparc/rts/Makefile
-# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-#ident "%Z%%M% %I% %E% SMI"
-#
# This makefile drives the production of the rts IP driver
#
# sparc architecture dependent
@@ -42,6 +40,7 @@ MODULE = rts
OBJECTS = $(RTS_OBJS:%=$(OBJS_DIR)/%)
LINTS = $(RTS_OBJS:%.o=$(LINTS_DIR)/%.ln)
ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE)
+ROOTLINK = $(ROOT_SOCK_DIR)/$(MODULE)
CONF_SRCDIR = $(UTSBASE)/common/inet/ip
#
@@ -69,9 +68,9 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOTLINK) $(ROOT_CONFFILE)
CFLAGS += $(CCVERBOSE)
#
-# depends on tun
+# depends on ip and sockfs
#
-LDFLAGS += -dy -Ndrv/ip
+LDFLAGS += -dy -Ndrv/ip -Nfs/sockfs
#
# For now, disable these lint checks; maintainers should endeavor
@@ -103,7 +102,7 @@ clean.lint: $(CLEAN_LINT_DEPS)
install: $(INSTALL_DEPS) $(SISCHECK_DEPS)
-$(ROOTLINK): $(ROOT_STRMOD_DIR) $(ROOTMODULE)
+$(ROOTLINK): $(ROOT_STRMOD_DIR) $(ROOT_SOCK_DIR) $(ROOTMODULE)
-$(RM) $@; ln $(ROOTMODULE) $@
#
diff --git a/usr/src/uts/sparc/rts/rts.global-objs.debug64 b/usr/src/uts/sparc/rts/rts.global-objs.debug64
index 4c699f6410..75b422acf6 100644
--- a/usr/src/uts/sparc/rts/rts.global-objs.debug64
+++ b/usr/src/uts/sparc/rts/rts.global-objs.debug64
@@ -19,14 +19,15 @@
# CDDL HEADER END
#
#
-# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-# ident "%Z%%M% %I% %E% SMI"
cb_inet_devops
inet_dev_info
inet_devops
modldrv
modlinkage
+modlsockmod
netdev_privs
+smodreg
diff --git a/usr/src/uts/sparc/smbsrv/Makefile b/usr/src/uts/sparc/smbsrv/Makefile
index 71c4cc5398..023d1c1cd5 100644
--- a/usr/src/uts/sparc/smbsrv/Makefile
+++ b/usr/src/uts/sparc/smbsrv/Makefile
@@ -19,11 +19,8 @@
# CDDL HEADER END
#
#
-#
-# Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
-#
-#ident "%Z%%M% %I% %E% SMI"
#
# This makefile drives the production of the cifs server file system
@@ -52,7 +49,8 @@ include $(UTSBASE)/sparc/Makefile.sparc
#
# Module dependencies
#
-LDFLAGS += -dy -Nfs/sockfs -Ndrv/ip -Nstrmod/rpcmod -Nsys/doorfs -Nmisc/kcf
+LDFLAGS += -dy -Nfs/sockfs -Nmisc/ksocket -Ndrv/ip -Nstrmod/rpcmod -Nsys/doorfs
+LDFLAGS += -Nmisc/kcf
#
# Define targets
diff --git a/usr/src/uts/sparc/socksctp/Makefile b/usr/src/uts/sparc/socksctp/Makefile
new file mode 100644
index 0000000000..5acab4cfb1
--- /dev/null
+++ b/usr/src/uts/sparc/socksctp/Makefile
@@ -0,0 +1,96 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+
+#
+# This makefile drives the production of the socksctp socket
+# kernel module.
+#
+# sparc architecture dependent
+#
+
+#
+# Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE = ../..
+
+#
+# Define the module and object file sets.
+#
+MODULE = socksctp
+OBJECTS = $(SCTP_SOCK_MOD_OBJS:%=$(OBJS_DIR)/%)
+LINTS = $(SCTP_SOCK_MOD_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE = $(ROOT_SOCK_DIR)/$(MODULE)
+
+#
+# Include common rules.
+#
+include $(UTSBASE)/sparc/Makefile.sparc
+
+#
+# Define targets
+#
+ALL_TARGET = $(BINARY)
+LINT_TARGET = $(MODULE).lint
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
+
+#
+# lint pass one enforcement and OS version
+#
+CFLAGS += $(CCVERBOSE)
+
+LDFLAGS += -dy -Nfs/sockfs -Ndrv/ip
+
+#
+# For now, disable these lint checks; maintainers should endeavor
+# to investigate and remove these for maximum lint coverage.
+# Please do not carry these forward to new Makefiles.
+#
+LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN
+LINTTAGS += -erroff=E_PTRDIFF_OVERFLOW
+
+#
+# Default build targets.
+#
+.KEEP_STATE:
+
+def: $(DEF_DEPS)
+
+all: $(ALL_DEPS)
+
+clean: $(CLEAN_DEPS)
+
+clobber: $(CLOBBER_DEPS)
+
+lint: $(LINT_DEPS)
+
+modlintlib: $(MODLINTLIB_DEPS)
+
+clean.lint: $(CLEAN_LINT_DEPS)
+
+install: $(INSTALL_DEPS)
+
+#
+# Include common targets.
+#
+include $(UTSBASE)/sparc/Makefile.targ
diff --git a/usr/src/uts/sparc/socksdp/Makefile b/usr/src/uts/sparc/socksdp/Makefile
new file mode 100644
index 0000000000..6970c44faf
--- /dev/null
+++ b/usr/src/uts/sparc/socksdp/Makefile
@@ -0,0 +1,88 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+
+#
+# This makefile drives the production of the socksdp socket
+# kernel module.
+#
+# sparc architecture dependent
+#
+
+#
+# Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE = ../..
+
+#
+# Define the module and object file sets.
+#
+MODULE = socksdp
+OBJECTS = $(SDP_SOCK_MOD_OBJS:%=$(OBJS_DIR)/%)
+LINTS = $(SDP_SOCK_MOD_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE = $(ROOT_SOCK_DIR)/$(MODULE)
+
+#
+# Include common rules.
+#
+include $(UTSBASE)/sparc/Makefile.sparc
+
+#
+# Define targets
+#
+ALL_TARGET = $(BINARY)
+LINT_TARGET = $(MODULE).lint
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE)
+
+#
+# lint pass one enforcement and OS version
+#
+CFLAGS += $(CCVERBOSE)
+
+LDFLAGS += -dy -Nfs/sockfs -Ndrv/ip
+
+#
+# Default build targets.
+#
+.KEEP_STATE:
+
+def: $(DEF_DEPS)
+
+all: $(ALL_DEPS)
+
+clean: $(CLEAN_DEPS)
+
+clobber: $(CLOBBER_DEPS)
+
+lint: $(LINT_DEPS)
+
+modlintlib: $(MODLINTLIB_DEPS)
+
+clean.lint: $(CLEAN_LINT_DEPS)
+
+install: $(INSTALL_DEPS)
+
+#
+# Include common targets.
+#
+include $(UTSBASE)/sparc/Makefile.targ
diff --git a/usr/src/uts/sparc/tcp/Makefile b/usr/src/uts/sparc/tcp/Makefile
index 192fda758f..7276ecfaeb 100644
--- a/usr/src/uts/sparc/tcp/Makefile
+++ b/usr/src/uts/sparc/tcp/Makefile
@@ -2,9 +2,8 @@
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License"). You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
@@ -21,11 +20,9 @@
#
#
# uts/sparc/tcp/Makefile
-# Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-#ident "%Z%%M% %I% %E% SMI"
-#
# This makefile drives the production of the tcp driver kernel module.
#
# sparc architecture dependent
@@ -43,7 +40,7 @@ MODULE = tcp
OBJECTS = $(TCP_OBJS:%=$(OBJS_DIR)/%)
LINTS = $(TCP_OBJS:%.o=$(LINTS_DIR)/%.ln)
ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE)
-ROOTLINK = $(ROOT_STRMOD_DIR)/$(MODULE)
+ROOTLINK = $(ROOT_STRMOD_DIR)/$(MODULE) $(ROOT_SOCK_DIR)/$(MODULE)
CONF_SRCDIR = $(UTSBASE)/common/inet/tcp
#
@@ -77,9 +74,9 @@ CFLAGS += $(CCVERBOSE)
CFLAGS += -xinline=tcp_set_ws_value,tcp_fill_header
#
-# depends on ip and md5
+# depends on ip, md5 and sockfs
#
-LDFLAGS += -dy -Ndrv/ip -Ncrypto/md5
+LDFLAGS += -dy -Ndrv/ip -Ncrypto/md5 -Nfs/sockfs
#
# Default build targets.
@@ -102,7 +99,7 @@ clean.lint: $(CLEAN_LINT_DEPS)
install: $(INSTALL_DEPS)
-$(ROOTLINK): $(ROOT_STRMOD_DIR) $(ROOTMODULE)
+$(ROOTLINK): $(ROOT_STRMOD_DIR) $(ROOT_SOCK_DIR) $(ROOTMODULE)
-$(RM) $@; ln $(ROOTMODULE) $@
#
diff --git a/usr/src/uts/sparc/udp/Makefile b/usr/src/uts/sparc/udp/Makefile
index c0deb87087..07a4435112 100644
--- a/usr/src/uts/sparc/udp/Makefile
+++ b/usr/src/uts/sparc/udp/Makefile
@@ -2,9 +2,8 @@
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License"). You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
@@ -21,11 +20,9 @@
#
#
# uts/sparc/udp/Makefile
-# Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
-#ident "%Z%%M% %I% %E% SMI"
-#
# This makefile drives the production of the udp driver kernel module.
#
# sparc architecture dependent
@@ -43,7 +40,7 @@ MODULE = udp
OBJECTS = $(UDP_OBJS:%=$(OBJS_DIR)/%)
LINTS = $(UDP_OBJS:%.o=$(LINTS_DIR)/%.ln)
ROOTMODULE = $(ROOT_DRV_DIR)/$(MODULE)
-ROOTLINK = $(ROOT_STRMOD_DIR)/$(MODULE)
+ROOTLINK = $(ROOT_STRMOD_DIR)/$(MODULE) $(ROOT_SOCK_DIR)/$(MODULE)
CONF_SRCDIR = $(UTSBASE)/common/inet/udp
#
@@ -71,9 +68,9 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOTLINK) $(ROOT_CONFFILE)
CFLAGS += $(CCVERBOSE)
#
-# depends on ip
+# depends on ip and sockfs
#
-LDFLAGS += -dy -Ndrv/ip
+LDFLAGS += -dy -Ndrv/ip -Nfs/sockfs
#
# Default build targets.
@@ -96,7 +93,7 @@ clean.lint: $(CLEAN_LINT_DEPS)
install: $(INSTALL_DEPS)
-$(ROOTLINK): $(ROOT_STRMOD_DIR) $(ROOTMODULE)
+$(ROOTLINK): $(ROOT_STRMOD_DIR) $(ROOT_SOCK_DIR) $(ROOTMODULE)
-$(RM) $@; ln $(ROOTMODULE) $@
#