Diffstat (limited to 'usr/src')
-rw-r--r--  usr/src/cmd/cmd-inet/usr.bin/pppd/sys-solaris.c | 22
-rw-r--r--  usr/src/cmd/cmd-inet/usr.lib/ncaconfd/ncaconfd.c | 14
-rw-r--r--  usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c | 22
-rw-r--r--  usr/src/cmd/mdb/common/modules/genunix/genunix.c | 3
-rw-r--r--  usr/src/cmd/mdb/common/modules/genunix/net.c | 81
-rw-r--r--  usr/src/cmd/mdb/common/modules/genunix/net.h | 3
-rw-r--r--  usr/src/cmd/rcm_daemon/common/ip_rcm.c | 6
-rw-r--r--  usr/src/pkgdefs/etc/exception_list_i386 | 2
-rw-r--r--  usr/src/pkgdefs/etc/exception_list_sparc | 2
-rw-r--r--  usr/src/tools/scripts/bfu.sh | 7
-rw-r--r--  usr/src/uts/common/Makefile.files | 22
-rw-r--r--  usr/src/uts/common/fs/sockfs/sockstr.c | 20
-rw-r--r--  usr/src/uts/common/fs/sockfs/socktpi.c | 321
-rw-r--r--  usr/src/uts/common/fs/sockfs/sockvnops.c | 93
-rw-r--r--  usr/src/uts/common/inet/Makefile | 2
-rw-r--r--  usr/src/uts/common/inet/arp/arp.c | 24
-rw-r--r--  usr/src/uts/common/inet/common.h | 6
-rw-r--r--  usr/src/uts/common/inet/inet_common.c | 20
-rw-r--r--  usr/src/uts/common/inet/ip.h | 101
-rw-r--r--  usr/src/uts/common/inet/ip/igmp.c | 2
-rw-r--r--  usr/src/uts/common/inet/ip/ip.c | 2070
-rw-r--r--  usr/src/uts/common/inet/ip/ip6.c | 796
-rw-r--r--  usr/src/uts/common/inet/ip/ip_if.c | 40
-rw-r--r--  usr/src/uts/common/inet/ip/ip_multi.c | 38
-rw-r--r--  usr/src/uts/common/inet/ip/ip_ndp.c | 21
-rw-r--r--  usr/src/uts/common/inet/ip/ipclassifier.c | 69
-rw-r--r--  usr/src/uts/common/inet/ip/tun.c | 6
-rw-r--r--  usr/src/uts/common/inet/ip6.h | 3
-rw-r--r--  usr/src/uts/common/inet/ip_impl.h | 493
-rw-r--r--  usr/src/uts/common/inet/ipclassifier.h | 35
-rw-r--r--  usr/src/uts/common/inet/ipp_common.h | 4
-rw-r--r--  usr/src/uts/common/inet/led.h | 6
-rw-r--r--  usr/src/uts/common/inet/optcom.c | 23
-rw-r--r--  usr/src/uts/common/inet/optcom.h | 14
-rw-r--r--  usr/src/uts/common/inet/snmpcom.c | 33
-rw-r--r--  usr/src/uts/common/inet/squeue.c | 11
-rw-r--r--  usr/src/uts/common/inet/tcp.h | 43
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp.c | 1626
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp6ddi.c | 10
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_fusion.c | 1087
-rw-r--r--  usr/src/uts/common/inet/tcp/tcpddi.c | 10
-rw-r--r--  usr/src/uts/common/inet/tcp_impl.h | 332
-rw-r--r--  usr/src/uts/common/inet/udp/udp.c | 3018
-rw-r--r--  usr/src/uts/common/inet/udp/udp6ddi.c | 10
-rw-r--r--  usr/src/uts/common/inet/udp/udpddi.c | 15
-rw-r--r--  usr/src/uts/common/inet/udp_impl.h | 159
-rw-r--r--  usr/src/uts/common/io/gld.c | 2
-rw-r--r--  usr/src/uts/common/io/stream.c | 26
-rw-r--r--  usr/src/uts/common/io/strsun.c | 62
-rw-r--r--  usr/src/uts/common/os/streamio.c | 84
-rw-r--r--  usr/src/uts/common/os/strsubr.c | 38
-rw-r--r--  usr/src/uts/common/sys/conf.h | 4
-rw-r--r--  usr/src/uts/common/sys/dlpi.h | 2
-rw-r--r--  usr/src/uts/common/sys/gld.h | 9
-rw-r--r--  usr/src/uts/common/sys/multidata.h | 33
-rw-r--r--  usr/src/uts/common/sys/multidata_impl.h | 17
-rw-r--r--  usr/src/uts/common/sys/socketvar.h | 7
-rw-r--r--  usr/src/uts/common/sys/sockio.h | 4
-rw-r--r--  usr/src/uts/common/sys/stream.h | 2
-rw-r--r--  usr/src/uts/common/sys/strsubr.h | 19
-rw-r--r--  usr/src/uts/common/syscall/sendfile.c | 85
-rw-r--r--  usr/src/uts/intel/ia32/ml/modstubs.s | 7
-rw-r--r--  usr/src/uts/sparc/ml/modstubs.s | 7
63 files changed, 7606 insertions, 3547 deletions
diff --git a/usr/src/cmd/cmd-inet/usr.bin/pppd/sys-solaris.c b/usr/src/cmd/cmd-inet/usr.bin/pppd/sys-solaris.c
index 9d11ac72d8..0b1ce1eafd 100644
--- a/usr/src/cmd/cmd-inet/usr.bin/pppd/sys-solaris.c
+++ b/usr/src/cmd/cmd-inet/usr.bin/pppd/sys-solaris.c
@@ -76,6 +76,7 @@
#include <netinet/in.h>
#include <sys/tihdr.h>
#include <inet/mib2.h>
+#include <inet/ip.h>
#include <sys/ethernet.h>
#include <sys/ser_sync.h>
@@ -92,27 +93,6 @@
static const char rcsid[] = RCSID;
#endif
-/* Need to use UDP for ifconfig compatibility */
-#if !defined(UDP_DEV_NAME)
-#define UDP_DEV_NAME "/dev/udp"
-#endif /* UDP_DEV_NAME */
-
-#if !defined(IP_DEV_NAME)
-#define IP_DEV_NAME "/dev/ip"
-#endif /* IP_DEV_NAME */
-
-#if !defined(UDP6_DEV_NAME)
-#define UDP6_DEV_NAME "/dev/udp6"
-#endif /* UDP6_DEV_NAME */
-
-#if !defined(IP6_DEV_NAME)
-#define IP6_DEV_NAME "/dev/ip6"
-#endif /* IP6_DEV_NAME */
-
-#if !defined(IP_MOD_NAME)
-#define IP_MOD_NAME "ip"
-#endif /* IP_MOD_NAME */
-
#define PPPSTRTIMOUT 1 /* Timeout in seconds for ioctl */
#define MAX_POLLFDS 32
#define NMODULES 32
diff --git a/usr/src/cmd/cmd-inet/usr.lib/ncaconfd/ncaconfd.c b/usr/src/cmd/cmd-inet/usr.lib/ncaconfd/ncaconfd.c
index be2461b276..06972f53dc 100644
--- a/usr/src/cmd/cmd-inet/usr.lib/ncaconfd/ncaconfd.c
+++ b/usr/src/cmd/cmd-inet/usr.lib/ncaconfd/ncaconfd.c
@@ -20,7 +20,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -55,18 +55,6 @@
#include "ncaconf.h"
/* NCA does not support IPv6... */
-#ifndef IP_DEV_NAME
-#define IP_DEV_NAME "/dev/ip"
-#endif
-
-#ifndef IP_MOD_NAME
-#define IP_MOD_NAME "ip"
-#endif
-
-#ifndef UDP_DEV_NAME
-#define UDP_DEV_NAME "/dev/udp"
-#endif
-
#ifndef NCA_MOD_NAME
#define NCA_MOD_NAME "nca"
#endif
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c
index aa2107f3f4..ee577669b0 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c
+++ b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c
@@ -18,6 +18,8 @@
#include <sys/dlpi.h>
#include <libdlpi.h>
+#include <inet/ip.h>
+
#define LOOPBACK_IF "lo0"
#define NONE_STR "none"
@@ -26,26 +28,6 @@
#define ARP_MOD_NAME "arp"
#endif
-#ifndef IP_DEV_NAME
-#define IP_DEV_NAME "/dev/ip"
-#endif
-
-#ifndef IP_MOD_NAME
-#define IP_MOD_NAME "ip"
-#endif
-
-#ifndef IP6_DEV_NAME
-#define IP6_DEV_NAME "/dev/ip6"
-#endif
-
-#ifndef UDP_DEV_NAME
-#define UDP_DEV_NAME "/dev/udp"
-#endif
-
-#ifndef UDP6_DEV_NAME
-#define UDP6_DEV_NAME "/dev/udp6"
-#endif
-
#define ADDRBITS_V4 32 /* number of bits in IPv4 address */
#define ADDRBITS_V6 128 /* number of bits in IPv6 address */
diff --git a/usr/src/cmd/mdb/common/modules/genunix/genunix.c b/usr/src/cmd/mdb/common/modules/genunix/genunix.c
index a36dfc8334..2a3b26ea8c 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/genunix.c
+++ b/usr/src/cmd/mdb/common/modules/genunix/genunix.c
@@ -3883,9 +3883,6 @@ static const mdb_walker_t walkers[] = {
mi_walk_init, mi_walk_step, mi_walk_fini, NULL },
{ "sonode", "given a sonode, walk its children",
sonode_walk_init, sonode_walk_step, sonode_walk_fini, NULL },
- { "udp", "walk UDP connections using MI",
- mi_payload_walk_init, mi_payload_walk_step,
- mi_payload_walk_fini, &mi_udp_arg },
/* from nvpair.c */
{ NVPAIR_WALKER_NAME, NVPAIR_WALKER_DESCR,
diff --git a/usr/src/cmd/mdb/common/modules/genunix/net.c b/usr/src/cmd/mdb/common/modules/genunix/net.c
index 0b6d826491..209b207bd3 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/net.c
+++ b/usr/src/cmd/mdb/common/modules/genunix/net.c
@@ -107,7 +107,8 @@ net_tcp_ipv6(const tcp_t *tcp)
static int
net_udp_active(const udp_t *udp)
{
- return ((udp->udp_state != TS_UNBND) && (udp->udp_state != TS_IDLE));
+ return ((udp->udp_state == TS_IDLE) ||
+ (udp->udp_state == TS_DATA_XFER));
}
static int
@@ -355,11 +356,6 @@ mi_payload_walk_fini(mdb_walk_state_t *wsp)
delete_mi_payload_walk_data(wsp->walk_data, arg->mi_pwa_size);
}
-const mi_payload_walk_arg_t mi_udp_arg = {
- "udp", "udp_g_head", sizeof (udp_t),
- MI_PAYLOAD_DEVICE | MI_PAYLOAD_MODULE
-};
-
const mi_payload_walk_arg_t mi_ar_arg = {
"arp", "ar_g_head", sizeof (ar_t),
MI_PAYLOAD_DEVICE | MI_PAYLOAD_MODULE
@@ -595,7 +591,7 @@ netstat_tcp_cb(uintptr_t kaddr, const void *walk_data, void *cb_data, int af)
tcp = (tcp_t *)((uintptr_t)connp + (tcp_kaddr - kaddr));
if ((uintptr_t)tcp < (uintptr_t)connp ||
- (uintptr_t)&tcp->tcp_connp > (uintptr_t)connp + itc_size ||
+ (uintptr_t)(tcp + 1) > (uintptr_t)connp + itc_size ||
(uintptr_t)tcp->tcp_connp != kaddr) {
mdb_warn("conn_tcp %p is invalid", tcp_kaddr);
return (WALK_NEXT);
@@ -603,7 +599,7 @@ netstat_tcp_cb(uintptr_t kaddr, const void *walk_data, void *cb_data, int af)
connp->conn_tcp = tcp;
tcp->tcp_connp = connp;
- if (!(opts & NETSTAT_ALL || net_tcp_active(tcp)) ||
+ if (!((opts & NETSTAT_ALL) || net_tcp_active(tcp)) ||
(af == AF_INET && !net_tcp_ipv4(tcp)) ||
(af == AF_INET6 && !net_tcp_ipv6(tcp))) {
return (WALK_NEXT);
@@ -639,45 +635,57 @@ netstat_tcpv6_cb(uintptr_t kaddr, const void *walk_data, void *cb_data)
return (netstat_tcp_cb(kaddr, walk_data, cb_data, AF_INET6));
}
+/*ARGSUSED*/
static int
-netstat_udpv4_cb(uintptr_t kaddr, const void *walk_data, void *cb_data)
+netstat_udp_cb(uintptr_t kaddr, const void *walk_data, void *cb_data, int af)
{
- const udp_t *udp = walk_data;
const uintptr_t opts = (uintptr_t)cb_data;
+ udp_t udp;
+ conn_t connp;
+
+ if (mdb_vread(&udp, sizeof (udp_t), kaddr) == -1) {
+ mdb_warn("failed to read udp at %p", kaddr);
+ return (WALK_ERR);
+ }
- if (!((opts & NETSTAT_ALL || net_udp_active(udp)) && net_udp_ipv4(udp)))
+ if (mdb_vread(&connp, sizeof (conn_t),
+ (uintptr_t)udp.udp_connp) == -1) {
+ mdb_warn("failed to read udp_connp at %p",
+ (uintptr_t)udp.udp_connp);
+ return (WALK_ERR);
+ }
+
+ if (!((opts & NETSTAT_ALL) || net_udp_active(&udp)) ||
+ (af == AF_INET && !net_udp_ipv4(&udp)) ||
+ (af == AF_INET6 && !net_udp_ipv6(&udp))) {
return (WALK_NEXT);
+ }
- mdb_printf("%0?p %2i ", kaddr, udp->udp_state);
- net_ipv4addrport_pr(&udp->udp_v6src, udp->udp_port);
- mdb_printf(" ");
- net_ipv4addrport_pr(&udp->udp_v6dst, udp->udp_dstport);
- mdb_printf(" %4i\n", udp->udp_zoneid);
+ mdb_printf("%0?p %2i ", kaddr, udp.udp_state);
+ if (af == AF_INET) {
+ net_ipv4addrport_pr(&udp.udp_v6src, udp.udp_port);
+ mdb_printf(" ");
+ net_ipv4addrport_pr(&udp.udp_v6dst, udp.udp_dstport);
+ } else if (af == AF_INET6) {
+ net_ipv6addrport_pr(&udp.udp_v6src, udp.udp_port);
+ mdb_printf(" ");
+ net_ipv6addrport_pr(&udp.udp_v6dst, udp.udp_dstport);
+ }
+ mdb_printf(" %4i\n", connp.conn_zoneid);
return (WALK_NEXT);
}
static int
-netstat_udpv6_cb(uintptr_t kaddr, const void *walk_data, void *cb_data)
+netstat_udpv4_cb(uintptr_t kaddr, const void *walk_data, void *cb_data)
{
- const udp_t *udp = walk_data;
- const uintptr_t opts = (uintptr_t)cb_data;
-
- if (!((opts & NETSTAT_ALL || net_udp_active(udp)) && net_udp_ipv6(udp)))
- return (WALK_NEXT);
-
- mdb_printf("%0?p %2i ", kaddr, udp->udp_state);
- net_ipv6addrport_pr(&udp->udp_v6src, udp->udp_port);
- mdb_printf(" ");
-
- /* Remote */
- if (udp->udp_state == TS_DATA_XFER)
- net_ipv6addrport_pr(&udp->udp_v6dst, udp->udp_dstport);
- else
- mdb_printf("%*s.0 ", ADDR_V6_WIDTH, "0:0:0:0:0:0:0:0");
- mdb_printf(" %4i\n", udp->udp_zoneid);
+ return (netstat_udp_cb(kaddr, walk_data, cb_data, AF_INET));
+}
- return (WALK_NEXT);
+static int
+netstat_udpv6_cb(uintptr_t kaddr, const void *walk_data, void *cb_data)
+{
+ return (netstat_udp_cb(kaddr, walk_data, cb_data, AF_INET6));
}
/*
@@ -855,7 +863,7 @@ netstat(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
"UDPv4", ADDR_V4_WIDTH, "Local Address",
ADDR_V4_WIDTH, "Remote Address", "Zone");
- if (mdb_walk("genunix`udp", netstat_udpv4_cb,
+ if (mdb_walk("udp_cache", netstat_udpv4_cb,
(void *)(uintptr_t)opts) == -1) {
mdb_warn("failed to walk genunix`udp");
return (DCMD_ERR);
@@ -870,12 +878,11 @@ netstat(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
"UDPv6", ADDR_V6_WIDTH, "Local Address",
ADDR_V6_WIDTH, "Remote Address", "Zone");
- if (mdb_walk("genunix`udp", netstat_udpv6_cb,
+ if (mdb_walk("udp_cache", netstat_udpv6_cb,
(void *)(uintptr_t)opts) == -1) {
mdb_warn("failed to walk genunix`udp");
return (DCMD_ERR);
}
-
}
}
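
With the MI-based genunix`udp walker removed, UDP endpoints are now enumerated through the udp_cache kmem-cache walk that the ::netstat dcmd uses above. A minimal live-kernel session to exercise it (hypothetical commands; only the walker and dcmd names come from this diff):

	# mdb -k
	> ::walk udp_cache
	> ::netstat

::walk udp_cache prints raw udp_t addresses from the conn cache, and ::netstat renders the same set as the UDPv4/UDPv6 tables produced by netstat_udpv4_cb()/netstat_udpv6_cb().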
diff --git a/usr/src/cmd/mdb/common/modules/genunix/net.h b/usr/src/cmd/mdb/common/modules/genunix/net.h
index 04bf278638..45e03a5352 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/net.h
+++ b/usr/src/cmd/mdb/common/modules/genunix/net.h
@@ -20,7 +20,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2000, 2003 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -33,7 +33,6 @@
extern "C" {
#endif
-extern struct mi_payload_walk_arg_s mi_udp_arg;
extern struct mi_payload_walk_arg_s mi_ar_arg;
extern struct mi_payload_walk_arg_s mi_icmp_arg;
extern struct mi_payload_walk_arg_s mi_ill_arg;
diff --git a/usr/src/cmd/rcm_daemon/common/ip_rcm.c b/usr/src/cmd/rcm_daemon/common/ip_rcm.c
index 95ed823a74..38de5ef96c 100644
--- a/usr/src/cmd/rcm_daemon/common/ip_rcm.c
+++ b/usr/src/cmd/rcm_daemon/common/ip_rcm.c
@@ -54,6 +54,7 @@
#include <libdevinfo.h>
#include <sys/systeminfo.h>
#include <netdb.h>
+#include <inet/ip.h>
#include <ipmp_mpathd.h>
#include "rcm_module.h"
@@ -70,12 +71,7 @@
/* Some generic well-knowns and defaults used in this module */
#define SLASH_DEV "/dev" /* /dev directory */
-#define IP_DEV_NAME "/dev/ip" /* IPV4 ip device */
-#define IP6_DEV_NAME "/dev/ip6" /* IPV6 ip device */
-#define IP_MOD_NAME "ip" /* ip module */
#define ARP_MOD_NAME "arp" /* arp module */
-#define UDP_DEV_NAME "/dev/udp" /* IPv4 udp device */
-#define UDP6_DEV_NAME "/dev/udp6" /* IPv6 udp device */
#define IP_MAX_MODS 9 /* max modules pushed on intr */
#define MAX_RECONFIG_SIZE 1024 /* Max. reconfig string size */
diff --git a/usr/src/pkgdefs/etc/exception_list_i386 b/usr/src/pkgdefs/etc/exception_list_i386
index 5d6fa5d32e..b8c682c9ed 100644
--- a/usr/src/pkgdefs/etc/exception_list_i386
+++ b/usr/src/pkgdefs/etc/exception_list_i386
@@ -347,6 +347,8 @@ usr/lib/llib-lipsecutil.ln i386
usr/include/inet/arp_impl.h i386
usr/include/inet/rawip_impl.h i386
usr/include/inet/udp_impl.h i386
+usr/include/inet/tcp_impl.h i386
+usr/include/inet/ip_impl.h i386
usr/include/inet/ip_ndp.h i386
usr/include/inet/ipdrop.h i386
usr/include/inet/tun.h i386
diff --git a/usr/src/pkgdefs/etc/exception_list_sparc b/usr/src/pkgdefs/etc/exception_list_sparc
index 74e0ee0273..8f1f4a40e3 100644
--- a/usr/src/pkgdefs/etc/exception_list_sparc
+++ b/usr/src/pkgdefs/etc/exception_list_sparc
@@ -336,6 +336,8 @@ usr/share/lib/locale/com/sun/dhcpmgr/cli/pntadm/ResourceBundle.properties sparc
usr/include/inet/arp_impl.h sparc
usr/include/inet/rawip_impl.h sparc
usr/include/inet/udp_impl.h sparc
+usr/include/inet/tcp_impl.h sparc
+usr/include/inet/ip_impl.h sparc
usr/include/inet/ip_ndp.h sparc
usr/include/inet/ipdrop.h sparc
usr/include/inet/tun.h sparc
diff --git a/usr/src/tools/scripts/bfu.sh b/usr/src/tools/scripts/bfu.sh
index c4aa388b40..5a102f0bea 100644
--- a/usr/src/tools/scripts/bfu.sh
+++ b/usr/src/tools/scripts/bfu.sh
@@ -2002,11 +2002,10 @@ if [[ ! -f $usr/lib/dns/libdns.so ]] && ! $ZCAT $cpiodir/generic.usr$ZFIX | \
fi
update_script="/ws/onnv-gate/public/bin/update_ce"
-if [ "$plat" = "SUNW,Sun-Fire-15000" ] && ifconfig -a | egrep '^ce' \
- >/dev/null 2>/dev/null; then
- # Sun Fire 12K/15K/20K/25K requires CE version 1.146 or later.
+if ifconfig -a | egrep '^ce' >/dev/null 2>/dev/null; then
+ # CE version 1.148 or later is required
cever=`modinfo | grep 'CE Ethernet' | sed 's/.*v1\.//' | tr -d ')' | \
- nawk '{ if ($1 < 146) print "BAD"; else print $1 }'`
+ nawk '{ if ($1 < 148) print "BAD"; else print $1 }'`
if [ "$cever" = "BAD" ]; then
fail "You must run $update_script to upgrade your ce driver."
fi
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index 69a29625d4..8daf858a7c 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -416,13 +416,9 @@ ICMP6_OBJS += icmp6ddi.o
RTS_OBJS += rtsddi.o rts.o rts_opt_data.o
-IP_OBJS += igmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o ip6_rts.o \
- ip_cksum.o ip_if.o ip_ire.o ip_listutils.o ip_mroute.o \
- ip_multi.o ip_ndp.o ip_opt_data.o ip_rts.o ip_srcid.o \
- ipddi.o ipdrop.o mi.o nd.o optcom.o snmpcom.o ipsec_loader.o \
- spd.o ipclassifier.o inet_common.o ip_squeue.o tcp.o \
- tcp_trace.o tcp_opt_data.o tcp_sack.o squeue.o ip_sadb.o \
- sctp_crc32.o sctp.o sctp_opt_data.o sctp_output.o \
+IP_TCP_OBJS = tcp.o tcp_trace.o tcp_opt_data.o tcp_sack.o tcp_fusion.o
+IP_UDP_OBJS = udp.o udp_opt_data.o
+IP_SCTP_OBJS = sctp_crc32.o sctp.o sctp_opt_data.o sctp_output.o \
sctp_init.o sctp_input.o sctp_cookie.o \
sctp_conn.o sctp_error.o sctp_snmp.o \
sctp_param.o sctp_shutdown.o sctp_common.o \
@@ -430,6 +426,16 @@ IP_OBJS += igmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o ip6_rts.o \
sctp_ioc.o sctp_bind.o sctp_notify.o sctp_asconf.o \
sctp_addr.o
+IP_OBJS += igmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o ip6_rts.o \
+ ip_cksum.o ip_if.o ip_ire.o ip_listutils.o ip_mroute.o \
+ ip_multi.o ip_ndp.o ip_opt_data.o ip_rts.o ip_srcid.o \
+ ipddi.o ipdrop.o mi.o nd.o optcom.o snmpcom.o ipsec_loader.o \
+ spd.o ipclassifier.o inet_common.o ip_squeue.o squeue.o \
+ ip_sadb.o \
+ $(IP_TCP_OBJS) \
+ $(IP_UDP_OBJS) \
+ $(IP_SCTP_OBJS)
+
IP6_OBJS += ip6ddi.o
KEYSOCK_OBJS += keysockddi.o keysock.o keysock_opt_data.o
@@ -467,7 +473,7 @@ ATUN_OBJS += atun.o
6TO4TUN_OBJS += 6to4tun.o
-UDP_OBJS += udpddi.o udp.o udp_opt_data.o
+UDP_OBJS += udpddi.o
UDP6_OBJS += udp6ddi.o
diff --git a/usr/src/uts/common/fs/sockfs/sockstr.c b/usr/src/uts/common/fs/sockfs/sockstr.c
index 6b934d9f0a..6c148d71b6 100644
--- a/usr/src/uts/common/fs/sockfs/sockstr.c
+++ b/usr/src/uts/common/fs/sockfs/sockstr.c
@@ -137,21 +137,23 @@ so_sock2stream(struct sonode *so)
ASSERT(so->so_version != SOV_STREAM);
- /* tell the transport below that sockmod is being popped */
- if ((so->so_state & SS_TCP_FAST_ACCEPT) != 0) {
- int rval;
- mblk_t **mpp;
+ if (so->so_state & SS_DIRECT) {
+ mblk_t **mpp;
+ int rval;
+ /*
+ * Tell the transport below that sockmod is being popped
+ */
mutex_exit(&so->so_lock);
- error = strioctl(vp, SIOCPOPSOCKFS, NULL, 0, K_TO_K, CRED(),
+ error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K, CRED(),
&rval);
mutex_enter(&so->so_lock);
if (error != 0) {
- dprintso(so, 0,
- ("so_sock2stream(%p): SIOCPOPSOCKFS failed\n", so));
+ dprintso(so, 0, ("so_sock2stream(%p): "
+ "_SIOCSOCKFALLBACK failed\n", so));
goto exit;
}
- so->so_state &= ~SS_TCP_FAST_ACCEPT;
+ so->so_state &= ~SS_DIRECT;
for (mpp = &so->so_conn_ind_head; (mp = *mpp) != NULL;
mpp = &mp->b_next) {
@@ -412,7 +414,7 @@ so_strinit(struct sonode *so, struct sonode *tso)
/* the following do_tcapability may update so->so_mode */
if ((tso->so_serv_type != T_CLTS) &&
- ((so->so_state & SS_TCP_FAST_ACCEPT) == 0)) {
+ !(so->so_state & SS_DIRECT)) {
error = do_tcapability(so, TC1_ACCEPTOR_ID);
if (error)
return (error);
diff --git a/usr/src/uts/common/fs/sockfs/socktpi.c b/usr/src/uts/common/fs/sockfs/socktpi.c
index 7b895f99a7..6a5e48464e 100644
--- a/usr/src/uts/common/fs/sockfs/socktpi.c
+++ b/usr/src/uts/common/fs/sockfs/socktpi.c
@@ -57,6 +57,7 @@
#include <sys/socket.h>
#include <sys/socketvar.h>
+#include <sys/sockio.h>
#include <netinet/in.h>
#include <sys/un.h>
#include <sys/strsun.h>
@@ -72,6 +73,7 @@
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/tcp.h>
+#include <inet/udp_impl.h>
#include <fs/sockfs/nl7c.h>
#include <sys/zone.h>
@@ -185,6 +187,10 @@ static int sotpi_sendmsg(struct sonode *, struct nmsghdr *,
struct uio *);
static int sotpi_shutdown(struct sonode *, int);
static int sotpi_getsockname(struct sonode *);
+static int sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t,
+ struct uio *, void *, t_uscalar_t, int);
+static int sodgram_direct(struct sonode *, struct sockaddr *,
+ socklen_t, struct uio *, int);
sonodeops_t sotpi_sonodeops = {
sotpi_accept, /* sop_accept */
@@ -222,16 +228,40 @@ sotpi_create(vnode_t *accessvp, int domain, int type, int protocol, int version,
so = VTOSO(vp);
flags = FREAD|FWRITE;
- if (tso != NULL) {
- if ((tso->so_state & (SS_TCP_FAST_ACCEPT)) != 0) {
- flags |= SO_ACCEPTOR|SO_SOCKSTR;
- so->so_state |= SS_TCP_FAST_ACCEPT;
- }
- } else {
- if ((so->so_type == SOCK_STREAM) &&
- (so->so_family == AF_INET || so->so_family == AF_INET6)) {
- flags |= SO_SOCKSTR;
- so->so_state |= SS_TCP_FAST_ACCEPT;
+
+ if ((type == SOCK_STREAM || type == SOCK_DGRAM) &&
+ (domain == AF_INET || domain == AF_INET6) &&
+ (protocol == IPPROTO_TCP || protocol == IPPROTO_UDP ||
+ protocol == IPPROTO_IP)) {
+ /* Tell tcp or udp that it's talking to sockets */
+ flags |= SO_SOCKSTR;
+
+ /*
+ * Here we indicate to socktpi_open() our attempt to
+ * make direct calls between sockfs and transport.
+ * The final decision is left to socktpi_open().
+ */
+ so->so_state |= SS_DIRECT;
+
+ ASSERT(so->so_type != SOCK_DGRAM || tso == NULL);
+ if (so->so_type == SOCK_STREAM && tso != NULL) {
+ if (tso->so_state & SS_DIRECT) {
+ /*
+ * Inherit SS_DIRECT from listener and pass
+ * SO_ACCEPTOR open flag to tcp, indicating
+ * that this is an accept fast-path instance.
+ */
+ flags |= SO_ACCEPTOR;
+ } else {
+ /*
+ * SS_DIRECT is not set on listener, meaning
+ * that the listener has been converted from
+ * a socket to a stream. Ensure that the
+ * acceptor inherits these settings.
+ */
+ so->so_state &= ~SS_DIRECT;
+ flags &= ~SO_SOCKSTR;
+ }
}
}
@@ -1052,7 +1082,7 @@ done:
}
/* bind the socket */
-int
+static int
sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
int flags)
{
@@ -1372,7 +1402,7 @@ again:
case AF_INET:
case AF_INET6:
if ((optlen == sizeof (intptr_t)) &&
- ((so->so_state & SS_TCP_FAST_ACCEPT) != 0)) {
+ ((so->so_state & SS_DIRECT) != 0)) {
bcopy(mp->b_rptr + conn_ind->OPT_offset,
&opt, conn_ind->OPT_length);
} else {
@@ -1385,7 +1415,19 @@ again:
* problems when sockfs sends a normal T_CONN_RES
* message down the new stream.
*/
- so->so_state &= ~SS_TCP_FAST_ACCEPT;
+ if (so->so_state & SS_DIRECT) {
+ int rval;
+ /*
+ * For consistency we inform tcp to disable
+ * direct interface on the listener, though
+ * we can certainly live without doing this
+ * because no data will ever travel upstream
+ * on the listening socket.
+ */
+ so->so_state &= ~SS_DIRECT;
+ (void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK,
+ 0, 0, K_TO_K, CRED(), &rval);
+ }
opt = NULL;
optlen = 0;
}
@@ -1554,9 +1596,10 @@ again:
if (nso->so_options & SO_LINGER)
nso->so_linger = so->so_linger;
- if ((so->so_state & SS_TCP_FAST_ACCEPT) != 0) {
+ if ((so->so_state & SS_DIRECT) != 0) {
mblk_t *ack_mp;
+ ASSERT(nso->so_state & SS_DIRECT);
ASSERT(opt != NULL);
conn_res->OPT_length = optlen;
@@ -3308,13 +3351,8 @@ err:
* Assumes caller has verified that SS_ISBOUND etc. are set.
*/
static int
-sosend_dgramcmsg(struct sonode *so,
- struct sockaddr *name,
- t_uscalar_t namelen,
- struct uio *uiop,
- void *control,
- t_uscalar_t controllen,
- int flags)
+sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
+ struct uio *uiop, void *control, t_uscalar_t controllen, int flags)
{
struct T_unitdata_req tudr;
mblk_t *mp;
@@ -3636,11 +3674,8 @@ sosend_svccmsg(struct sonode *so,
* name and the source address is passed as an option.
*/
int
-sosend_dgram(struct sonode *so,
- struct sockaddr *name,
- socklen_t namelen,
- struct uio *uiop,
- int flags)
+sosend_dgram(struct sonode *so, struct sockaddr *name, socklen_t namelen,
+ struct uio *uiop, int flags)
{
struct T_unitdata_req tudr;
mblk_t *mp;
@@ -3651,7 +3686,7 @@ sosend_dgram(struct sonode *so,
socklen_t srclen;
ssize_t len;
- ASSERT(name && namelen);
+ ASSERT(name != NULL && namelen != 0);
len = uiop->uio_resid;
if (len > so->so_tidu_size) {
@@ -3659,14 +3694,14 @@ sosend_dgram(struct sonode *so,
goto done;
}
- /*
- * Length and family checks.
- */
+ /* Length and family checks */
error = so_addr_verify(so, name, namelen);
- if (error) {
- eprintsoline(so, error);
+ if (error != 0)
goto done;
- }
+
+ if (so->so_state & SS_DIRECT)
+ return (sodgram_direct(so, name, namelen, uiop, flags));
+
if (so->so_family == AF_UNIX) {
if (so->so_state & SS_FADDR_NOXLATE) {
/*
@@ -4061,8 +4096,7 @@ sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
if (msg->msg_controllen != 0) {
if (!(so_mode & SM_CONNREQUIRED)) {
error = sosend_dgramcmsg(so, name, namelen, uiop,
- msg->msg_control, msg->msg_controllen,
- flags);
+ msg->msg_control, msg->msg_controllen, flags);
} else {
if (flags & MSG_OOB) {
/* Can't generate T_EXDATA_REQ with options */
@@ -4080,7 +4114,7 @@ sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
if (!(so_mode & SM_CONNREQUIRED)) {
/*
* If there is no SO_DONTROUTE to turn off return immediately
- * from sosend_dgram. This can allow tail-call optimizations.
+ * from send_dgram. This can allow tail-call optimizations.
*/
if (!dontroute) {
return (sosend_dgram(so, name, namelen, uiop, flags));
@@ -4104,13 +4138,16 @@ sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
dprintso(so, 1, ("sotpi_sendmsg: write\n"));
/*
- * If there is no SO_DONTROUTE to turn off
- * return immediately from strwrite. This can
- * allow tail-call optimizations.
+ * If there is no SO_DONTROUTE to turn off,
+ * SS_DIRECT is on, and there is no flow
+ * control, we can take the fast path.
*/
- if (!dontroute)
- return (strwrite(SOTOV(so), uiop,
- CRED()));
+ if (!dontroute &&
+ (so_state & SS_DIRECT) &&
+ canputnext(SOTOV(so)->v_stream->sd_wrq)) {
+ return (sostream_direct(so, uiop,
+ NULL, CRED()));
+ }
error = strwrite(SOTOV(so), uiop, CRED());
goto done;
}
@@ -4140,6 +4177,206 @@ done:
}
/*
+ * Sending data on a datagram socket.
+ * Assumes caller has verified that SS_ISBOUND etc. are set.
+ */
+/* ARGSUSED */
+static int
+sodgram_direct(struct sonode *so, struct sockaddr *name,
+ socklen_t namelen, struct uio *uiop, int flags)
+{
+ struct T_unitdata_req tudr;
+ mblk_t *mp;
+ int error = 0;
+ void *addr;
+ socklen_t addrlen;
+ ssize_t len;
+ struct stdata *stp = SOTOV(so)->v_stream;
+ int so_state;
+ queue_t *udp_wq;
+
+ ASSERT(name != NULL && namelen != 0);
+ ASSERT(!(so->so_mode & SM_CONNREQUIRED));
+ ASSERT(!(so->so_mode & SM_EXDATA));
+ ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
+ ASSERT(SOTOV(so)->v_type == VSOCK);
+
+ /* Caller checked for proper length */
+ len = uiop->uio_resid;
+ ASSERT(len <= so->so_tidu_size);
+
+ /* Length and family checks have been done by caller */
+ ASSERT(name->sa_family == so->so_family);
+ ASSERT(so->so_family == AF_INET ||
+ (namelen == (socklen_t)sizeof (struct sockaddr_in6)));
+ ASSERT(so->so_family == AF_INET6 ||
+ (namelen == (socklen_t)sizeof (struct sockaddr_in)));
+
+ addr = name;
+ addrlen = namelen;
+
+ if (stp->sd_sidp != NULL &&
+ (error = straccess(stp, JCWRITE)) != 0)
+ goto done;
+
+ so_state = so->so_state;
+
+ /*
+ * For UDP we don't break up the copyin into smaller pieces
+ * as in the TCP case. That means if ENOMEM is returned by
+ * mcopyinuio() then the uio vector has not been modified at
+ * all and we fallback to either strwrite() or kstrputmsg()
+ * below. Note also that we never generate priority messages
+ * from here.
+ */
+ udp_wq = stp->sd_wrq->q_next;
+ if (canput(udp_wq) &&
+ (mp = mcopyinuio(stp, uiop, -1, -1, &error)) != NULL) {
+ ASSERT(DB_TYPE(mp) == M_DATA);
+ ASSERT(uiop->uio_resid == 0);
+#ifdef C2_AUDIT
+ if (audit_active)
+ audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
+#endif /* C2_AUDIT */
+ udp_wput_data(udp_wq, mp, addr, addrlen);
+ return (0);
+ }
+ if (error != 0 && error != ENOMEM)
+ return (error);
+
+ /*
+ * For connected, let strwrite() handle the blocking case.
+ * Otherwise we fall thru and use kstrputmsg().
+ */
+ if (so_state & SS_ISCONNECTED)
+ return (strwrite(SOTOV(so), uiop, CRED()));
+
+ tudr.PRIM_type = T_UNITDATA_REQ;
+ tudr.DEST_length = addrlen;
+ tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
+ tudr.OPT_length = 0;
+ tudr.OPT_offset = 0;
+
+ mp = soallocproto2(&tudr, sizeof (tudr), addr, addrlen, 0, _ALLOC_INTR);
+ if (mp == NULL) {
+ /*
+ * Caught a signal waiting for memory.
+ * Let send* return EINTR.
+ */
+ error = EINTR;
+ goto done;
+ }
+
+#ifdef C2_AUDIT
+ if (audit_active)
+ audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
+#endif /* C2_AUDIT */
+
+ error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
+done:
+#ifdef SOCK_DEBUG
+ if (error != 0) {
+ eprintsoline(so, error);
+ }
+#endif /* SOCK_DEBUG */
+ return (error);
+}
+
+int
+sostream_direct(struct sonode *so, struct uio *uiop, mblk_t *mp, cred_t *cr)
+{
+ struct stdata *stp = SOTOV(so)->v_stream;
+ ssize_t iosize, rmax, maxblk;
+ queue_t *tcp_wq = stp->sd_wrq->q_next;
+ int error = 0, wflag = 0;
+
+ ASSERT(so->so_mode & SM_BYTESTREAM);
+ ASSERT(SOTOV(so)->v_type == VSOCK);
+
+ if (stp->sd_sidp != NULL &&
+ (error = straccess(stp, JCWRITE)) != 0)
+ return (error);
+
+ if (uiop == NULL) {
+ /*
+ * kstrwritemp() should have checked sd_flag and
+ * flow-control before coming here. If we end up
+ * here it means that we can simply pass down the
+ * data to tcp.
+ */
+ ASSERT(mp != NULL);
+ tcp_wput(tcp_wq, mp);
+ return (0);
+ }
+
+ /* Fallback to strwrite() to do proper error handling */
+ if (stp->sd_flag & (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))
+ return (strwrite(SOTOV(so), uiop, cr));
+
+ rmax = stp->sd_qn_maxpsz;
+ ASSERT(rmax >= 0 || rmax == INFPSZ);
+ if (rmax == 0 || uiop->uio_resid <= 0)
+ return (0);
+
+ if (rmax == INFPSZ)
+ rmax = uiop->uio_resid;
+
+ maxblk = stp->sd_maxblk;
+
+ for (;;) {
+ iosize = MIN(uiop->uio_resid, rmax);
+
+ mp = mcopyinuio(stp, uiop, iosize, maxblk, &error);
+ if (mp == NULL) {
+ /*
+ * Fallback to strwrite() for ENOMEM; if this
+ * is our first time in this routine and the uio
+ * vector has not been modified, we will end up
+ * calling strwrite() without any flag set.
+ */
+ if (error == ENOMEM)
+ goto slow_send;
+ else
+ return (error);
+ }
+ ASSERT(uiop->uio_resid >= 0);
+ /*
+ * If mp is non-NULL and ENOMEM is set, it means that
+ * mcopyinuio() was able to break down some of the user
+ * data into one or more mblks. Send the partial data
+ * to tcp and let the rest be handled in strwrite().
+ */
+ ASSERT(error == 0 || error == ENOMEM);
+ tcp_wput(tcp_wq, mp);
+
+ wflag |= NOINTR;
+
+ if (uiop->uio_resid == 0) { /* No more data; we're done */
+ ASSERT(error == 0);
+ break;
+ } else if (error == ENOMEM || !canput(tcp_wq) || (stp->sd_flag &
+ (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))) {
+slow_send:
+ /*
+ * We were able to send down partial data using
+ * the direct call interface, but are now relying
+ * on strwrite() to handle the non-fastpath cases.
+ * If the socket is blocking we will sleep in
+ * strwaitq() until write is permitted, otherwise,
+ * we will need to return the amount of bytes
+ * written so far back to the app. This is the
+ * reason why we pass NOINTR flag to strwrite()
+ * for non-blocking socket, because we don't want
+ * to return EAGAIN when portion of the user data
+ * has actually been sent down.
+ */
+ return (strwrite_common(SOTOV(so), uiop, cr, wflag));
+ }
+ }
+ return (0);
+}
+
+/*
* Update so_faddr by asking the transport (unless AF_UNIX).
*/
int
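Taken together, sosend_dgram() and sodgram_direct() implement a small dispatch. A compilable sketch that models only the decision (the enum and function are invented for illustration; the predicate names mirror the checks in the diff):

	/* Models the UDP send dispatch in socktpi.c; illustration only. */
	typedef enum {
		PATH_UDP_WPUT_DATA,	/* direct call below the stream head */
		PATH_STRWRITE,		/* blocking stream write */
		PATH_KSTRPUTMSG		/* classic T_UNITDATA_REQ message */
	} send_path_t;

	static send_path_t
	dgram_send_path(int ss_direct, int canput_ok, int copyin_ok,
	    int connected)
	{
		if (ss_direct && canput_ok && copyin_ok)
			return (PATH_UDP_WPUT_DATA);	/* fast path */
		if (ss_direct && connected)
			return (PATH_STRWRITE);		/* strwrite may block */
		return (PATH_KSTRPUTMSG);		/* fallback */
	}

In words: with SS_DIRECT set, a successful canput()/mcopyinuio() pair sends the data straight to udp_wput_data(); an ENOMEM copyin on a connected socket falls back to strwrite(), and everything else goes out as a T_UNITDATA_REQ via kstrputmsg().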
diff --git a/usr/src/uts/common/fs/sockfs/sockvnops.c b/usr/src/uts/common/fs/sockfs/sockvnops.c
index b783dc77ac..817e9b5968 100644
--- a/usr/src/uts/common/fs/sockfs/sockvnops.c
+++ b/usr/src/uts/common/fs/sockfs/sockvnops.c
@@ -53,6 +53,7 @@
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
+#include <sys/strsun.h>
#include <sys/suntpi.h>
#include <sys/ioctl.h>
#include <sys/sockio.h>
@@ -87,6 +88,9 @@
#include <fs/sockfs/nl7c.h>
+#include <inet/udp_impl.h>
+#include <inet/tcp_impl.h>
+
static int socktpi_close(struct vnode *, int, int, offset_t, struct cred *);
static int socktpi_read(struct vnode *, struct uio *, int, struct cred *,
struct caller_context *);
@@ -140,6 +144,15 @@ const fs_operation_def_t socknca_vnodeops_template[] = {
};
/*
+ * Do direct function call to the transport layer below; this would
+ * also allow the transport to utilize read-side synchronous stream
+ * interface if necessary. This is a /etc/system tunable that must
+ * not be modified on a running system. By default this is enabled
+ * for performance reasons and may be disabled for debugging purposes.
+ */
+boolean_t socktpi_direct = B_TRUE;
+
+/*
* Open routine used by socket() call. Note that vn_open checks for
* VSOCK and fails the open (and VOP_OPEN is fs_nosys). The VSOCK check is
* needed since VSOCK type vnodes exist in various underlying filesystems as
@@ -205,6 +218,56 @@ socktpi_open(struct vnode **vpp, int flag, struct cred *cr)
ASSERT(stp->sd_wrq != NULL);
so->so_provinfo = tpi_findprov(stp->sd_wrq);
+
+ /*
+ * If caller is interested in doing direct function call
+ * interface to/from transport module, probe the module
+ * directly beneath the streamhead to see if it qualifies.
+ *
+ * We turn off direct interface when qualifications fail;
+ * note that we do these checks for everything other than
+ * the tcp acceptor case, because the acceptor inherits
+ * the capabilities of the listener and we've already done
+ * the checks against the listening socket.
+ */
+ if (!(flag & SO_ACCEPTOR) && (so->so_state & SS_DIRECT)) {
+ queue_t *tq = stp->sd_wrq->q_next;
+
+ /*
+ * SS_DIRECT is currently supported and tested
+ * only for tcp/udp; this is the main reason to
+ * have the following assertions.
+ */
+ ASSERT(so->so_family == AF_INET ||
+ so->so_family == AF_INET6);
+ ASSERT(so->so_protocol == IPPROTO_UDP ||
+ so->so_protocol == IPPROTO_TCP ||
+ so->so_protocol == IPPROTO_IP);
+ ASSERT(so->so_type == SOCK_DGRAM ||
+ so->so_type == SOCK_STREAM);
+
+ /*
+ * Abort direct call interface if the module directly
+ * underneath the stream head is not defined with the
+ * _D_DIRECT flag. This could happen in the tcp or
+ * udp case, when some other module is autopushed
+ * above it, or for some reasons the expected module
+ * isn't purely D_MP (which is the main requirement).
+ */
+ if (!socktpi_direct || !(tq->q_flag & _QDIRECT) ||
+ !(_OTHERQ(tq)->q_flag & _QDIRECT)) {
+ int rval;
+
+ /* Continue on without direct calls */
+ so->so_state &= ~SS_DIRECT;
+ if ((error = strioctl(vp, _SIOCSOCKFALLBACK,
+ 0, 0, K_TO_K, CRED(), &rval)) != 0) {
+ (void) socktpi_close(vp, flag, 1,
+ (offset_t)0, cr);
+ return (error);
+ }
+ }
+ }
} else {
/*
* While the same socket can not be reopened (unlike specfs)
@@ -436,6 +499,11 @@ socktpi_write(
/* Give NL7C some data */
nl7c_data(so, uiop);
}
+
+ if ((so_state & SS_DIRECT) &&
+ canputnext(vp->v_stream->sd_wrq)) {
+ return (sostream_direct(so, uiop, NULL, cr));
+ }
return (strwrite(vp, uiop, cr));
} else {
/* Send T_DATA_REQ messages without MORE_flag set */
@@ -631,7 +699,7 @@ socktpi_ioctl(struct vnode *vp, int cmd, intptr_t arg, int mode,
case I_SENDFD:
case I_RECVFD:
case I_ATMARK:
- case SIOCPOPSOCKFS:
+ case _SIOCSOCKFALLBACK:
/*
* These ioctls do not apply to sockets. I_FDINSERT can be
* used to send M_PROTO messages without modifying the socket
@@ -639,8 +707,9 @@ socktpi_ioctl(struct vnode *vp, int cmd, intptr_t arg, int mode,
* descriptor passing since they assume a twisted stream.
* SIOCATMARK must be used instead of I_ATMARK.
*
- * SIOCPOPSOCKFS from an application should never be
- * processed. It is always generated in response to I_POP.
+ * _SIOCSOCKFALLBACK from an application should never be
+ * processed. It is only generated by socktpi_open() or
+ * in response to I_POP or I_PUSH.
*/
#ifdef DEBUG
cmn_err(CE_WARN, "Unsupported STREAMS ioctl 0x%x on socket. "
@@ -724,6 +793,24 @@ socktpi_plumbioctl(struct vnode *vp, int cmd, intptr_t arg, int mode,
switch (cmd) {
case I_PUSH:
+ if (so->so_state & SS_DIRECT) {
+ mutex_enter(&so->so_lock);
+ so_lock_single(so);
+ mutex_exit(&so->so_lock);
+
+ error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
+ CRED(), rvalp);
+
+ mutex_enter(&so->so_lock);
+ if (error == 0)
+ so->so_state &= ~SS_DIRECT;
+ so_unlock_single(so, SOLOCKED);
+ mutex_exit(&so->so_lock);
+
+ if (error != 0)
+ return (error);
+ }
+
error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
if (error == 0)
so->so_pushcnt++;
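
The I_PUSH handling above is reachable from ordinary user code: pushing any STREAMS module onto a TCP or UDP socket forces sockfs to revert the endpoint to STREAMS mode first. A small demonstration (a sketch; bufmod is just a convenient existing module, and error handling is minimal):

	#include <sys/types.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <stropts.h>
	#include <stdio.h>

	int
	main(void)
	{
		int fd = socket(AF_INET, SOCK_DGRAM, 0);

		/*
		 * I_PUSH reaches socktpi_plumbioctl(): sockfs issues
		 * _SIOCSOCKFALLBACK to the transport and clears SS_DIRECT
		 * before the module is actually pushed.
		 */
		if (fd == -1 || ioctl(fd, I_PUSH, "bufmod") == -1)
			perror("I_PUSH");
		return (0);
	}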
diff --git a/usr/src/uts/common/inet/Makefile b/usr/src/uts/common/inet/Makefile
index f43759686a..88afebe947 100644
--- a/usr/src/uts/common/inet/Makefile
+++ b/usr/src/uts/common/inet/Makefile
@@ -34,7 +34,7 @@ HDRS= arp.h common.h ipclassifier.h ip.h ip6.h ipdrop.h ipsecah.h ipsecesp.h \
ipsec_info.h ip6_asp.h ip_if.h ip_ire.h ip_multi.h ip_ndp.h ip_rts.h \
ipsec_impl.h keysock.h led.h mi.h mib2.h nd.h optcom.h sadb.h \
sctp_itf.h snmpcom.h tcp.h tcp_sack.h tun.h udp_impl.h arp_impl.h \
- rawip_impl.h ipp_common.h
+ rawip_impl.h ipp_common.h ip_impl.h tcp_impl.h
ROOTDIRS= $(ROOT)/usr/include/inet
diff --git a/usr/src/uts/common/inet/arp/arp.c b/usr/src/uts/common/inet/arp/arp.c
index 59dbbd6808..d4889559e2 100644
--- a/usr/src/uts/common/inet/arp/arp.c
+++ b/usr/src/uts/common/inet/arp/arp.c
@@ -194,7 +194,6 @@ static int ar_entry_add(queue_t *q, mblk_t *mp);
static int ar_entry_delete(queue_t *q, mblk_t *mp);
static int ar_entry_query(queue_t *q, mblk_t *mp);
static int ar_entry_squery(queue_t *q, mblk_t *mp);
-static void ar_freemsg(mblk_t *mp);
static int ar_interface_up(queue_t *q, mblk_t *mp);
static int ar_interface_down(queue_t *q, mblk_t *mp);
static int ar_interface_on(queue_t *q, mblk_t *mp);
@@ -1231,7 +1230,7 @@ ar_cmd_done(arl_t *arl)
ar_ip->ar_arl_ip_assoc = ar_arl;
}
}
- ar_freemsg(mp);
+ inet_freemsg(mp);
}
/*
@@ -1745,19 +1744,6 @@ ar_entry_squery(queue_t *q, mblk_t *mp_orig)
return (0);
}
-/* Make sure b_next and b_prev are null and then free the message */
-static void
-ar_freemsg(mblk_t *mp)
-{
- mblk_t *mp1;
-
- for (mp1 = mp; mp1; mp1 = mp1->b_cont) {
- mp1->b_prev = mp1->b_next = NULL;
- mp1->b_queue = NULL;
- }
- freemsg(mp);
-}
-
/* Process an interface down causing us to detach and unbind. */
/* ARGSUSED */
static int
@@ -1936,7 +1922,7 @@ ar_ll_cleanup_arl_queue(queue_t *q)
BUMP_IRE_STATS(ire_stats_v4,
ire_stats_freed);
}
- ar_freemsg(mp);
+ inet_freemsg(mp);
} else {
prev = mp;
}
@@ -2587,7 +2573,7 @@ ar_query_delete(ace_t *ace, uchar_t *ar)
*(uint32_t *)mp->b_rptr == AR_ENTRY_QUERY) {
BUMP_IRE_STATS(ire_stats_v4, ire_stats_freed);
}
- ar_freemsg(mp);
+ inet_freemsg(mp);
} else {
mpp = &mp->b_next;
}
@@ -2657,7 +2643,7 @@ ar_query_reply(ace_t *ace, int ret_val, uchar_t *proto_addr,
} else {
if (ret_val != 0) {
/* TODO: find some way to let the guy know? */
- ar_freemsg(mp);
+ inet_freemsg(mp);
BUMP_IRE_STATS(ire_stats_v4, ire_stats_freed);
continue;
}
@@ -2849,7 +2835,7 @@ ar_rput(queue_t *q, mblk_t *mp)
"arp_rput_end: q %p (%S)", q, "proto");
return;
default:
- ar_freemsg(mp);
+ inet_freemsg(mp);
return;
}
if ((mp->b_wptr - mp->b_rptr) < sizeof (dl_unitdata_ind_t) ||
diff --git a/usr/src/uts/common/inet/common.h b/usr/src/uts/common/inet/common.h
index 63c630718b..5ac15b3c4e 100644
--- a/usr/src/uts/common/inet/common.h
+++ b/usr/src/uts/common/inet/common.h
@@ -20,7 +20,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 1992-2001, 2003 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -97,13 +97,13 @@ typedef void (*pfv_t)();
#define INET_MAXMINOR MAXMIN /* maximum device minor number */
#ifdef _KERNEL
+#include <sys/stream.h>
-extern void inet_init(void);
-extern void inet_destroy(void);
extern void *inet_minor_create(char *, dev_t, int);
extern void inet_minor_destroy(void *);
extern dev_t inet_minor_alloc(void *);
extern void inet_minor_free(void *, dev_t);
+extern void inet_freemsg(mblk_t *);
#endif /* _KERNEL */
diff --git a/usr/src/uts/common/inet/inet_common.c b/usr/src/uts/common/inet/inet_common.c
index 0900852a64..e55abc6c01 100644
--- a/usr/src/uts/common/inet/inet_common.c
+++ b/usr/src/uts/common/inet/inet_common.c
@@ -20,7 +20,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -103,3 +103,21 @@ inet_minor_free(void *a, dev_t dev)
ASSERT((dev != OPENFAIL) && (dev != 0) && (dev <= inet_maxminor));
vmem_free(((inet_arena_t *)a)->ineta_arena, (void *)dev, 1);
}
+
+/*
+ * This function is used to free a message that has gone through
+ * mi_copyin processing which modifies the M_IOCTL mblk's b_next
+ * and b_prev pointers. We use this function to set b_next/b_prev
+ * to NULL and free them.
+ */
+void
+inet_freemsg(mblk_t *mp)
+{
+ mblk_t *bp = mp;
+
+ for (; bp != NULL; bp = bp->b_cont) {
+ bp->b_prev = NULL;
+ bp->b_next = NULL;
+ }
+ freemsg(mp);
+}
diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h
index 9caf225c41..23e3069934 100644
--- a/usr/src/uts/common/inet/ip.h
+++ b/usr/src/uts/common/inet/ip.h
@@ -52,6 +52,7 @@ extern "C" {
#include <sys/vmem.h>
#include <sys/squeue.h>
#include <sys/systm.h>
+#include <sys/multidata.h>
#ifdef DEBUG
#define ILL_DEBUG
@@ -67,7 +68,19 @@ extern "C" {
* of flags.
*/
#define IP_DEVMTFLAGS D_MP
-#endif
+#endif /* _KERNEL */
+
+#define IP_MOD_NAME "ip"
+#define IP_DEV_NAME "/dev/ip"
+#define IP6_DEV_NAME "/dev/ip6"
+
+#define UDP_MOD_NAME "udp"
+#define UDP_DEV_NAME "/dev/udp"
+#define UDP6_DEV_NAME "/dev/udp6"
+
+#define TCP_MOD_NAME "tcp"
+#define TCP_DEV_NAME "/dev/tcp"
+#define TCP6_DEV_NAME "/dev/tcp6"
/* Minor numbers */
#define IPV4_MINOR 0
@@ -101,8 +114,6 @@ typedef uint32_t ipaddr_t;
#define ILL_FRAG_HASH_TBL_COUNT ((unsigned int)64)
#define ILL_FRAG_HASH_TBL_SIZE (ILL_FRAG_HASH_TBL_COUNT * sizeof (ipfb_t))
-#define IP_DEV_NAME "/dev/ip"
-#define IP_MOD_NAME "ip"
#define IPV4_ADDR_LEN 4
#define IP_ADDR_LEN IPV4_ADDR_LEN
#define IP_ARP_PROTO_TYPE 0x0800
@@ -236,6 +247,7 @@ typedef struct ipoptp_s
#define Q_TO_CONN(q) ((conn_t *)(q)->q_ptr)
#define Q_TO_TCP(q) (Q_TO_CONN((q))->conn_tcp)
+#define Q_TO_UDP(q) (Q_TO_CONN((q))->conn_udp)
/*
* The following two macros are used by IP to get the appropriate
@@ -244,13 +256,10 @@ typedef struct ipoptp_s
* from a conn directly if it knows that the conn is not TCP.
*/
#define CONNP_TO_WQ(connp) \
- (((connp)->conn_tcp == NULL) ? (connp)->conn_wq : \
- (connp)->conn_tcp->tcp_wq)
+ (IPCL_IS_TCP(connp) ? (connp)->conn_tcp->tcp_wq : (connp)->conn_wq)
#define CONNP_TO_RQ(connp) RD(CONNP_TO_WQ(connp))
-#define IS_TCP_CONN(connp) (((connp)->conn_flags & IPCL_TCP) != 0)
-
#define GRAB_CONN_LOCK(q) { \
if (q != NULL && CONN_Q(q)) \
mutex_enter(&(Q_TO_CONN(q))->conn_lock); \
@@ -302,9 +311,8 @@ typedef struct ipoptp_s
*/
#define IP6_NO_IPPOLICY 0x800 /* Don't do IPQoS processing */
#define IP6_IN_LLMCAST 0x1000 /* Multicast */
-#define IP6_IN_NOCKSUM 0x2000 /* Don't compute checksum */
-#define IP_FF_LOOPBACK 0x4000 /* Loopback fanout */
+#define IP_FF_LOOPBACK 0x2000 /* Loopback fanout */
#ifndef IRE_DB_TYPE
#define IRE_DB_TYPE M_SIG
@@ -357,6 +365,8 @@ typedef struct ipf_s {
uint_t ipf_prev_nexthdr_offset; /* Offset for nexthdr value */
uint8_t ipf_ecn; /* ECN info for the fragments */
uint8_t ipf_num_dups; /* Number of times dup frags recvd */
+ uint16_t ipf_checksum_flags; /* Hardware checksum flags */
+ uint32_t ipf_checksum; /* Partial checksum of fragment data */
} ipf_t;
#define ipf_src V4_PART_OF_V6(ipf_v6src)
@@ -623,9 +633,10 @@ typedef struct ip_m_s {
* depends on the atomic 32 bit access to that field.
*/
#define CONN_CLOSING 0x01 /* ip_close waiting for ip_wsrv */
-#define CONN_IPSEC_LOAD_WAIT 0x10 /* waiting for load */
-#define CONN_CONDEMNED 0x40 /* conn is closing, no more refs */
-#define CONN_INCIPIENT 0x80 /* conn not yet visible, no refs */
+#define CONN_IPSEC_LOAD_WAIT 0x02 /* waiting for load */
+#define CONN_CONDEMNED 0x04 /* conn is closing, no more refs */
+#define CONN_INCIPIENT 0x08 /* conn not yet visible, no refs */
+#define CONN_QUIESCED 0x10 /* conn is now quiescent */
/*
* Parameter to ip_output giving the identity of the caller.
@@ -2593,6 +2604,7 @@ extern ipparam_t *ip_param_arr;
extern int ip_g_forward;
extern int ipv6_forward;
+extern vmem_t *ip_minor_arena;
#define ip_respond_to_address_mask_broadcast ip_param_arr[0].ip_param_value
#define ip_g_send_redirects ip_param_arr[5].ip_param_value
@@ -2697,18 +2709,11 @@ extern uint32_t ipsechw_debug;
#define ip1dbg(a) if (ip_debug > 2) printf a
#define ip2dbg(a) if (ip_debug > 3) printf a
#define ip3dbg(a) if (ip_debug > 4) printf a
-
-#define ipcsumdbg(a, b) \
- if (ip_debug == 1) \
- prom_printf(a); \
- else if (ip_debug > 1) \
- { prom_printf("%smp=%p\n", a, (void *)b); }
#else
#define ip0dbg(a) /* */
#define ip1dbg(a) /* */
#define ip2dbg(a) /* */
#define ip3dbg(a) /* */
-#define ipcsumdbg(a, b) /* */
#endif /* IP_DEBUG */
extern const char *dlpi_prim_str(int);
@@ -2717,7 +2722,6 @@ extern void ill_frag_timer(void *);
extern ill_t *ill_first(int, int, ill_walk_context_t *);
extern ill_t *ill_next(ill_walk_context_t *, ill_t *);
extern void ill_frag_timer_start(ill_t *);
-extern void ip_ioctl_freemsg(mblk_t *);
extern mblk_t *ip_carve_mp(mblk_t **, ssize_t);
extern mblk_t *ip_dlpi_alloc(size_t, t_uscalar_t);
extern char *ip_dot_addr(ipaddr_t, char *);
@@ -2749,6 +2753,9 @@ extern void ip_input(ill_t *, ill_rx_ring_t *, mblk_t *, size_t);
extern void ip_rput_dlpi(queue_t *, mblk_t *);
extern void ip_rput_forward(ire_t *, ipha_t *, mblk_t *, ill_t *);
extern void ip_rput_forward_multicast(ipaddr_t, mblk_t *, ipif_t *);
+
+extern int ip_snmpmod_close(queue_t *);
+extern void ip_snmpmod_wput(queue_t *, mblk_t *);
extern void ip_udp_input(queue_t *, mblk_t *, ipha_t *, ire_t *, ill_t *);
extern void ip_proto_input(queue_t *, mblk_t *, ipha_t *, ire_t *, ill_t *);
extern void ip_rput_other(ipsq_t *, queue_t *, mblk_t *, void *);
@@ -2821,6 +2828,7 @@ extern int ipsec_req_from_conn(conn_t *, ipsec_req_t *, int);
extern int ip_snmp_get(queue_t *q, mblk_t *mctl);
extern int ip_snmp_set(queue_t *q, int, int, uchar_t *, int);
extern void ip_process_ioctl(ipsq_t *, queue_t *, mblk_t *, void *);
+extern void ip_quiesce_conn(conn_t *);
extern void ip_reprocess_ioctl(ipsq_t *, queue_t *, mblk_t *, void *);
extern void ip_restart_optmgmt(ipsq_t *, queue_t *, mblk_t *, void *);
extern void ip_ioctl_finish(queue_t *, mblk_t *, int, int, ipif_t *,
@@ -2842,6 +2850,7 @@ extern boolean_t ip_md_hcksum_attr(struct multidata_s *, struct pdesc_s *,
uint32_t, uint32_t, uint32_t, uint32_t);
extern boolean_t ip_md_zcopy_attr(struct multidata_s *, struct pdesc_s *,
uint_t);
+extern mblk_t *ip_unbind(queue_t *, mblk_t *);
/* Hooks for CGTP (multirt routes) filtering module */
#define CGTP_FILTER_REV_1 1
@@ -2925,17 +2934,6 @@ struct ill_mdt_capab_s {
uint_t ill_mdt_span_limit; /* maximum payload span per packet */
};
-/*
- * ioctl identifier and structure for Multidata Transmit update
- * private M_CTL communication from IP to ULP.
- */
-#define MDT_IOC_INFO_UPDATE (('M' << 8) + 1020)
-
-typedef struct ip_mdt_info_s {
- uint_t mdt_info_id; /* MDT_IOC_INFO_UPDATE */
- ill_mdt_capab_t mdt_capab; /* ILL MDT capabilities */
-} ip_mdt_info_t;
-
struct ill_hcksum_capab_s {
uint_t ill_hcksum_version; /* interface version */
uint_t ill_hcksum_txflags; /* capabilities on transmit */
@@ -2991,35 +2989,6 @@ struct ill_poll_capab_s {
};
/*
- * Macro that determines whether or not a given ILL is allowed for MDT.
- */
-#define ILL_MDT_USABLE(ill) \
- ((ill->ill_capabilities & ILL_CAPAB_MDT) != 0 && \
- ill->ill_mdt_capab != NULL && \
- ill->ill_mdt_capab->ill_mdt_version == MDT_VERSION_2 && \
- ill->ill_mdt_capab->ill_mdt_on != 0)
-
-/*
- * Macro that determines whether or not a given CONN may be considered
- * for fast path prior to proceeding further with Multidata.
- */
-#define CONN_IS_MD_FASTPATH(connp) \
- ((connp)->conn_dontroute == 0 && /* SO_DONTROUTE */ \
- (connp)->conn_nofailover_ill == NULL && /* IPIF_NOFAILOVER */ \
- (connp)->conn_xmit_if_ill == NULL && /* IP_XMIT_IF */ \
- (connp)->conn_outgoing_pill == NULL && /* IP{V6}_BOUND_PIF */ \
- (connp)->conn_outgoing_ill == NULL) /* IP{V6}_BOUND_IF */
-
-/*
- * Macro that determines whether or not a given IPC requires
- * outbound IPSEC processing.
- */
-#define CONN_IPSEC_OUT_ENCAPSULATED(connp) \
- ((connp)->conn_out_enforce_policy || \
- ((connp)->conn_latch != NULL && \
- (connp)->conn_latch->ipl_out_policy != NULL))
-
-/*
* IP squeues exports
*/
extern int ip_squeue_profile;
@@ -3049,12 +3018,15 @@ extern void ip_squeue_get_pkts(squeue_t *);
extern int ip_squeue_bind_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *);
extern int ip_squeue_bind_get(queue_t *, mblk_t *, caddr_t, cred_t *);
extern void ip_squeue_clean(void *, mblk_t *, void *);
+extern void ip_resume_tcp_bind(void *, mblk_t *, void *);
+
+extern void tcp_wput(queue_t *, mblk_t *);
-extern void ip_resume_tcp_bind(void *, mblk_t *mp, void *);
extern int ip_fill_mtuinfo(struct in6_addr *, in_port_t,
struct ip6_mtuinfo *);
+extern ipif_t *conn_get_held_ipif(conn_t *, ipif_t **, int *);
-typedef void (*ipsq_func_t)(ipsq_t *, queue_t *, mblk_t *, void *);
+typedef void (*ipsq_func_t)(ipsq_t *, queue_t *, mblk_t *, void *);
/*
* Squeue tags. Tags only need to be unique when the callback function is the
@@ -3091,6 +3063,11 @@ typedef void (*ipsq_func_t)(ipsq_t *, queue_t *, mblk_t *, void *);
#define SQTAG_TCP_WPUT_OTHER 28
#define SQTAG_TCP_CONN_REQ_UNBOUND 29
#define SQTAG_TCP_SEND_PENDING 30
+#define SQTAG_BIND_RETRY 31
+#define SQTAG_UDP_FANOUT 32
+#define SQTAG_UDP_INPUT 33
+#define SQTAG_UDP_WPUT 34
+#define SQTAG_UDP_OUTPUT 35
#endif /* _KERNEL */
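
The device/module names above now live outside the _KERNEL guard, which is what lets pppd, ncaconfd, ifconfig, and ip_rcm earlier in this diff drop their private fallback #defines. A consumer just includes the header, as in this sketch (prerequisite headers and ordering follow what ifconfig.c itself does; the program is illustrative, not from the changeset):

	#include <sys/types.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <inet/ip.h>	/* IP_MOD_NAME, UDP_DEV_NAME, ... */

	int
	main(void)
	{
		/* Open the UDP driver by its canonical name ("/dev/udp"). */
		int fd = open(UDP_DEV_NAME, O_RDWR);

		if (fd == -1)
			perror(UDP_DEV_NAME);
		return (0);
	}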
diff --git a/usr/src/uts/common/inet/ip/igmp.c b/usr/src/uts/common/inet/ip/igmp.c
index e09aed5736..435f085d24 100644
--- a/usr/src/uts/common/inet/ip/igmp.c
+++ b/usr/src/uts/common/inet/ip/igmp.c
@@ -1925,6 +1925,8 @@ igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr)
igmpa->igmpa_group = ilm->ilm_addr;
igmpa->igmpa_cksum = 0;
igmpa->igmpa_cksum = IP_CSUM(mp, hdrlen, 0);
+ if (igmpa->igmpa_cksum == 0)
+ igmpa->igmpa_cksum = 0xffff;
rtralert[0] = IPOPT_COPY & IPOPT_RTRALERT;
rtralert[1] = RTRALERT_LEN;
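
Both this igmp.c hunk and the matching guards added to icmp_inbound() and icmp_pkt() in ip.c below prevent a checksum field of zero from going out. The guard is sound because one's-complement arithmetic has two encodings of zero: 0x0000 and 0xFFFF are the same value to a verifier, so folding a computed 0 to 0xFFFF keeps the field nonzero without changing its arithmetic meaning. A standalone illustration (not from this changeset):

	#include <stdio.h>
	#include <stdint.h>

	/* One's-complement sum over 16-bit words, RFC 1071 style. */
	static uint16_t
	cksum16(const uint16_t *p, int nwords)
	{
		uint32_t sum = 0;

		while (nwords-- > 0)
			sum += *p++;
		sum = (sum >> 16) + (sum & 0xffff);	/* fold the carries */
		sum += (sum >> 16);
		return ((uint16_t)~sum);
	}

	int
	main(void)
	{
		/* Words summing to 0xffff: the complemented checksum is 0. */
		uint16_t buf[2] = { 0xffff, 0x0000 };
		uint16_t c = cksum16(buf, 2);

		/* 0x0000 and 0xffff encode the same one's-complement value. */
		printf("computed %#x, transmitted %#x\n",
		    c, c == 0 ? 0xffff : c);
		return (0);
	}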
diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c
index 89a5fdfaf8..a988b67cbb 100644
--- a/usr/src/uts/common/inet/ip/ip.c
+++ b/usr/src/uts/common/inet/ip/ip.c
@@ -75,9 +75,11 @@
#include <netinet/sctp.h>
#include <inet/ip.h>
+#include <inet/ip_impl.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/tcp.h>
+#include <inet/tcp_impl.h>
#include <inet/ip_multi.h>
#include <inet/ip_if.h>
#include <inet/ip_ire.h>
@@ -110,6 +112,7 @@
#include <inet/ipclassifier.h>
#include <inet/sctp_ip.h>
+#include <inet/udp_impl.h>
/*
* Values for squeue switch:
@@ -122,7 +125,8 @@ squeue_func_t ip_input_proc;
/*
* IP statistics.
*/
-#define IP_STAT(x) (ip_statistics.x.value.ui64++)
+#define IP_STAT(x) (ip_statistics.x.value.ui64++)
+#define IP_STAT_UPDATE(x, n) (ip_statistics.x.value.ui64 += (n))
typedef struct ip_stat {
kstat_named_t ipsec_fanout_proto;
@@ -158,42 +162,68 @@ typedef struct ip_stat {
kstat_named_t ip_ire_redirect_timer_expired;
kstat_named_t ip_ire_pmtu_timer_expired;
kstat_named_t ip_input_multi_squeue;
+ kstat_named_t ip_tcp_in_full_hw_cksum_err;
+ kstat_named_t ip_tcp_in_part_hw_cksum_err;
+ kstat_named_t ip_tcp_in_sw_cksum_err;
+ kstat_named_t ip_tcp_out_sw_cksum_bytes;
+ kstat_named_t ip_udp_in_full_hw_cksum_err;
+ kstat_named_t ip_udp_in_part_hw_cksum_err;
+ kstat_named_t ip_udp_in_sw_cksum_err;
+ kstat_named_t ip_udp_out_sw_cksum_bytes;
+ kstat_named_t ip_frag_mdt_pkt_out;
+ kstat_named_t ip_frag_mdt_discarded;
+ kstat_named_t ip_frag_mdt_allocfail;
+ kstat_named_t ip_frag_mdt_addpdescfail;
+ kstat_named_t ip_frag_mdt_allocd;
} ip_stat_t;
static ip_stat_t ip_statistics = {
- { "ipsec_fanout_proto", KSTAT_DATA_UINT64 },
- { "ip_udp_fannorm", KSTAT_DATA_UINT64 },
- { "ip_udp_fanmb", KSTAT_DATA_UINT64 },
- { "ip_udp_fanothers", KSTAT_DATA_UINT64 },
- { "ip_udp_fast_path", KSTAT_DATA_UINT64 },
- { "ip_udp_slow_path", KSTAT_DATA_UINT64 },
- { "ip_udp_input_err", KSTAT_DATA_UINT64 },
- { "ip_tcppullup", KSTAT_DATA_UINT64 },
- { "ip_tcpoptions", KSTAT_DATA_UINT64 },
- { "ip_multipkttcp", KSTAT_DATA_UINT64 },
- { "ip_tcp_fast_path", KSTAT_DATA_UINT64 },
- { "ip_tcp_slow_path", KSTAT_DATA_UINT64 },
- { "ip_tcp_input_error", KSTAT_DATA_UINT64 },
- { "ip_db_ref", KSTAT_DATA_UINT64 },
- { "ip_notaligned1", KSTAT_DATA_UINT64 },
- { "ip_notaligned2", KSTAT_DATA_UINT64 },
- { "ip_multimblk3", KSTAT_DATA_UINT64 },
- { "ip_multimblk4", KSTAT_DATA_UINT64 },
- { "ip_ipoptions", KSTAT_DATA_UINT64 },
- { "ip_classify_fail", KSTAT_DATA_UINT64 },
- { "ip_opt", KSTAT_DATA_UINT64 },
- { "ip_udp_rput_local", KSTAT_DATA_UINT64 },
- { "ipsec_proto_ahesp", KSTAT_DATA_UINT64 },
- { "ip_conn_flputbq", KSTAT_DATA_UINT64 },
- { "ip_conn_walk_drain", KSTAT_DATA_UINT64 },
- { "ip_out_sw_cksum", KSTAT_DATA_UINT64 },
- { "ip_in_sw_cksum", KSTAT_DATA_UINT64 },
- { "ip_trash_ire_reclaim_calls", KSTAT_DATA_UINT64 },
+ { "ipsec_fanout_proto", KSTAT_DATA_UINT64 },
+ { "ip_udp_fannorm", KSTAT_DATA_UINT64 },
+ { "ip_udp_fanmb", KSTAT_DATA_UINT64 },
+ { "ip_udp_fanothers", KSTAT_DATA_UINT64 },
+ { "ip_udp_fast_path", KSTAT_DATA_UINT64 },
+ { "ip_udp_slow_path", KSTAT_DATA_UINT64 },
+ { "ip_udp_input_err", KSTAT_DATA_UINT64 },
+ { "ip_tcppullup", KSTAT_DATA_UINT64 },
+ { "ip_tcpoptions", KSTAT_DATA_UINT64 },
+ { "ip_multipkttcp", KSTAT_DATA_UINT64 },
+ { "ip_tcp_fast_path", KSTAT_DATA_UINT64 },
+ { "ip_tcp_slow_path", KSTAT_DATA_UINT64 },
+ { "ip_tcp_input_error", KSTAT_DATA_UINT64 },
+ { "ip_db_ref", KSTAT_DATA_UINT64 },
+ { "ip_notaligned1", KSTAT_DATA_UINT64 },
+ { "ip_notaligned2", KSTAT_DATA_UINT64 },
+ { "ip_multimblk3", KSTAT_DATA_UINT64 },
+ { "ip_multimblk4", KSTAT_DATA_UINT64 },
+ { "ip_ipoptions", KSTAT_DATA_UINT64 },
+ { "ip_classify_fail", KSTAT_DATA_UINT64 },
+ { "ip_opt", KSTAT_DATA_UINT64 },
+ { "ip_udp_rput_local", KSTAT_DATA_UINT64 },
+ { "ipsec_proto_ahesp", KSTAT_DATA_UINT64 },
+ { "ip_conn_flputbq", KSTAT_DATA_UINT64 },
+ { "ip_conn_walk_drain", KSTAT_DATA_UINT64 },
+ { "ip_out_sw_cksum", KSTAT_DATA_UINT64 },
+ { "ip_in_sw_cksum", KSTAT_DATA_UINT64 },
+ { "ip_trash_ire_reclaim_calls", KSTAT_DATA_UINT64 },
{ "ip_trash_ire_reclaim_success", KSTAT_DATA_UINT64 },
- { "ip_ire_arp_timer_expired", KSTAT_DATA_UINT64 },
+ { "ip_ire_arp_timer_expired", KSTAT_DATA_UINT64 },
{ "ip_ire_redirect_timer_expired", KSTAT_DATA_UINT64 },
- { "ip_ire_pmtu_timer_expired", KSTAT_DATA_UINT64 },
- { "ip_input_multi_squeue", KSTAT_DATA_UINT64 },
+ { "ip_ire_pmtu_timer_expired", KSTAT_DATA_UINT64 },
+ { "ip_input_multi_squeue", KSTAT_DATA_UINT64 },
+ { "ip_tcp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 },
+ { "ip_tcp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 },
+ { "ip_tcp_in_sw_cksum_err", KSTAT_DATA_UINT64 },
+ { "ip_tcp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 },
+ { "ip_udp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 },
+ { "ip_udp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 },
+ { "ip_udp_in_sw_cksum_err", KSTAT_DATA_UINT64 },
+ { "ip_udp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 },
+ { "ip_frag_mdt_pkt_out", KSTAT_DATA_UINT64 },
+ { "ip_frag_mdt_discarded", KSTAT_DATA_UINT64 },
+ { "ip_frag_mdt_allocfail", KSTAT_DATA_UINT64 },
+ { "ip_frag_mdt_addpdescfail", KSTAT_DATA_UINT64 },
+ { "ip_frag_mdt_allocd", KSTAT_DATA_UINT64 },
};
static kstat_t *ip_kstat;
@@ -591,28 +621,12 @@ uint_t ip_max_frag_dups = 10;
/* RFC1122 Conformance */
#define IP_FORWARD_DEFAULT IP_FORWARD_NEVER
-#ifdef _BIG_ENDIAN
-#define IP_HDR_CSUM_TTL_ADJUST 256
-#define IP_TCP_CSUM_COMP IPPROTO_TCP
-#define IP_UDP_CSUM_COMP IPPROTO_UDP
-#else
-#define IP_HDR_CSUM_TTL_ADJUST 1
-#define IP_TCP_CSUM_COMP (IPPROTO_TCP << 8)
-#define IP_UDP_CSUM_COMP (IPPROTO_UDP << 8)
-#endif
-
-#define TCP_CHECKSUM_OFFSET 16
-#define UDP_CHECKSUM_OFFSET 6
-
#define ILL_MAX_NAMELEN LIFNAMSIZ
-#define UDPH_SIZE 8
-
/* Leave room for ip_newroute to tack on the src and target addresses */
#define OK_RESOLVER_MP(mp) \
((mp) && ((mp)->b_wptr - (mp)->b_rptr) >= (2 * IP_ADDR_LEN))
-static ipif_t *conn_get_held_ipif(conn_t *, ipif_t **, int *);
static int conn_set_held_ipif(conn_t *, ipif_t **, ipif_t *);
static mblk_t *ip_wput_attach_llhdr(mblk_t *, ire_t *, ip_proc_t, uint32_t);
@@ -668,6 +682,8 @@ static int ip_rput_forward_options(mblk_t *, ipha_t *, ire_t *);
static boolean_t ip_rput_local_options(queue_t *, mblk_t *, ipha_t *,
ire_t *);
static int ip_rput_options(queue_t *, mblk_t *, ipha_t *, ipaddr_t *);
+static boolean_t ip_rput_fragment(queue_t *, mblk_t **, ipha_t *, uint32_t *,
+ uint16_t *);
int ip_snmp_get(queue_t *, mblk_t *);
static mblk_t *ip_snmp_get_mib2_ip(queue_t *, mblk_t *);
static mblk_t *ip_snmp_get_mib2_ip6(queue_t *, mblk_t *);
@@ -692,7 +708,6 @@ int ip_snmp_set(queue_t *, int, int, uchar_t *, int);
static boolean_t ip_source_routed(ipha_t *);
static boolean_t ip_source_route_included(ipha_t *);
-static void ip_unbind(queue_t *, mblk_t *);
static void ip_wput_frag(ire_t *, mblk_t *, ip_pkt_t, uint32_t, uint32_t);
static mblk_t *ip_wput_frag_copyhdr(uchar_t *, int, int);
static void ip_wput_local_options(ipha_t *);
@@ -767,6 +782,15 @@ uint_t icmp_pkt_err_sent = 0; /* Number of packets sent in burst */
time_t ip_g_frag_timeout = IP_FRAG_TIMEOUT;
clock_t ip_g_frag_timo_ms = IP_FRAG_TIMEOUT * 1000;
+/*
+ * Threshold which determines whether MDT should be used when
+ * generating IP fragments; payload size must be greater than
+ * this threshold for MDT to take place.
+ */
+#define IP_WPUT_FRAG_MDT_MIN 32768
+
+int ip_wput_frag_mdt_min = IP_WPUT_FRAG_MDT_MIN;
+
/* Protected by ip_mi_lock */
static void *ip_g_head; /* Instance Data List Head */
kmutex_t ip_mi_lock; /* Lock for list of instances */
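/*
 * Editorial sketch (not part of this change): how a fragmentation path
 * might consult ip_wput_frag_mdt_min above before electing Multidata
 * Transmit.  The names "pkt_len" and "hdr_len" are illustrative.
 */
static boolean_t
frag_mdt_eligible(uint32_t pkt_len, uint32_t hdr_len)
{
	/* MDT only pays off when the IP payload exceeds the threshold */
	return (pkt_len - hdr_len > ip_wput_frag_mdt_min);
}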
@@ -1431,7 +1455,7 @@ static ipha_t icmp_ipha = {
};
struct module_info ip_mod_info = {
- 5701, "ip", 1, INFPSZ, 65536, 1024
+ IP_MOD_ID, IP_MOD_NAME, 1, INFPSZ, 65536, 1024
};
static struct qinit rinit = {
@@ -1930,6 +1954,8 @@ icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill,
/* Send out an ICMP packet */
icmph->icmph_checksum = 0;
icmph->icmph_checksum = IP_CSUM(mp, iph_hdr_length, 0);
+ if (icmph->icmph_checksum == 0)
+ icmph->icmph_checksum = 0xFFFF;
if (broadcast || CLASSD(ipha->ipha_dst)) {
ipif_t *ipif_chosen;
/*
@@ -3204,6 +3230,8 @@ icmp_pkt(queue_t *q, mblk_t *mp, void *stuff, size_t len,
bcopy(stuff, icmph, len);
icmph->icmph_checksum = 0;
icmph->icmph_checksum = IP_CSUM(mp, (int32_t)sizeof (ipha_t), 0);
+ if (icmph->icmph_checksum == 0)
+ icmph->icmph_checksum = 0xFFFF;
BUMP_MIB(&icmp_mib, icmpOutMsgs);
put(q, ipsec_mp);
}
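/*
 * Editorial note on the two hunks above: in one's-complement arithmetic
 * 0x0000 and 0xFFFF both denote zero, so a computed ICMP checksum of
 * zero is transmitted as all-ones to avoid ambiguity.  A minimal sketch
 * of the fold-and-invert step behind a macro like IP_CSUM (illustrative,
 * not the actual macro body):
 */
static uint16_t
cksum_finish(uint32_t sum)
{
	sum = (sum & 0xFFFF) + (sum >> 16);	/* fold the carries... */
	sum = (sum & 0xFFFF) + (sum >> 16);	/* ...and any new carry */
	return ((uint16_t)(~sum & 0xFFFF));
}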
@@ -3704,7 +3732,7 @@ ip_bind_v4(queue_t *q, mblk_t *mp, conn_t *connp)
ASSERT(!connp->conn_af_isv6);
connp->conn_pkt_isv6 = B_FALSE;
- len = mp->b_wptr - mp->b_rptr;
+ len = MBLKL(mp);
if (len < (sizeof (*tbr) + 1)) {
(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
"ip_bind: bogus msg, len %ld", len);
@@ -3716,7 +3744,7 @@ ip_bind_v4(queue_t *q, mblk_t *mp, conn_t *connp)
protocol = *mp->b_wptr & 0xFF;
tbr = (struct T_bind_req *)mp->b_rptr;
/* Reset the message type in preparation for shipping it back. */
- mp->b_datap->db_type = M_PCPROTO;
+ DB_TYPE(mp) = M_PCPROTO;
connp->conn_ulp = (uint8_t)protocol;
@@ -3762,8 +3790,8 @@ ip_bind_v4(queue_t *q, mblk_t *mp, conn_t *connp)
*/
mp1 = mp->b_cont;
- ire_requested = (mp1 && mp1->b_datap->db_type == IRE_DB_REQ_TYPE);
- ipsec_policy_set = (mp1 && mp1->b_datap->db_type == IPSEC_POLICY_SET);
+ ire_requested = (mp1 != NULL && DB_TYPE(mp1) == IRE_DB_REQ_TYPE);
+ ipsec_policy_set = (mp1 != NULL && DB_TYPE(mp1) == IPSEC_POLICY_SET);
switch (tbr->ADDR_length) {
default:
@@ -4169,7 +4197,7 @@ ip_bind_connected(conn_t *connp, mblk_t *mp, ipaddr_t *src_addrp,
if (ip_multidata_outbound && !ipsec_policy_set && dst_ire != NULL &&
!(dst_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST)) &&
(md_ill = ire_to_ill(dst_ire), md_ill != NULL) &&
- (md_ill->ill_capabilities & ILL_CAPAB_MDT)) {
+ ILL_MDT_CAPABLE(md_ill)) {
md_dst_ire = dst_ire;
IRE_REFHOLD(md_dst_ire);
}
@@ -4689,43 +4717,19 @@ ip_modclose(ill_t *ill)
}
/*
- * IP has been configured as _D_QNEXTLESS for the client side i.e the driver
- * instance. This implies that
- * 1. IP cannot access the read side q_next pointer directly - it must
- * use routines like putnext and canputnext.
- * 2. ip_close must ensure that all sources of messages being putnext upstream
- * are gone before qprocsoff is called.
- *
- * #2 is handled by having ip_close do the ipcl_hash_remove and wait for
- * conn_ref to drop to zero before calling qprocsoff.
+ * This is called as part of close() for both IP and UDP
+ * in order to quiesce the conn.
*/
-
-/* ARGSUSED */
-int
-ip_close(queue_t *q, int flags)
+void
+ip_quiesce_conn(conn_t *connp)
{
- conn_t *connp;
boolean_t drain_cleanup_reqd = B_FALSE;
boolean_t conn_ioctl_cleanup_reqd = B_FALSE;
boolean_t ilg_cleanup_reqd = B_FALSE;
- TRACE_1(TR_FAC_IP, TR_IP_CLOSE, "ip_close: q %p", q);
+ ASSERT(!IPCL_IS_TCP(connp));
/*
- * Call the appropriate delete routine depending on whether this is
- * a module or device.
- */
- if (WR(q)->q_next != NULL) {
- /* This is a module close */
- return (ip_modclose((ill_t *)q->q_ptr));
- }
-
- connp = Q_TO_CONN(q);
- ASSERT(connp->conn_tcp == NULL);
-
- /*
- * We are being closed as /dev/ip or /dev/ip6.
- *
* Mark the conn as closing, and this conn must not be
* inserted in future into any list. Eg. conn_drain_insert(),
* won't insert this conn into the conn_drain_list.
@@ -4736,6 +4740,7 @@ ip_close(queue_t *q, int flags)
* cannot get set henceforth.
*/
mutex_enter(&connp->conn_lock);
+ ASSERT(!(connp->conn_state_flags & CONN_QUIESCED));
connp->conn_state_flags |= CONN_CLOSING;
if (connp->conn_idl != NULL)
drain_cleanup_reqd = B_TRUE;
@@ -4745,17 +4750,17 @@ ip_close(queue_t *q, int flags)
ilg_cleanup_reqd = B_TRUE;
mutex_exit(&connp->conn_lock);
+ if (IPCL_IS_UDP(connp))
+ udp_quiesce_conn(connp);
+
if (conn_ioctl_cleanup_reqd)
conn_ioctl_cleanup(connp);
/*
* Remove this conn from any fanout list it is on.
- * Then wait until the number of pending putnexts from
- * the fanout code drops to zero, before calling qprocsoff.
- * This is the guarantee a QNEXTLESS driver provides to
- * STREAMS, and is mentioned at the top of this function.
+	 * Then wait for any threads currently operating on this
+	 * endpoint to finish.
*/
-
ipcl_hash_remove(connp);
/*
@@ -4776,7 +4781,6 @@ ip_close(queue_t *q, int flags)
conn_delete_ire(connp, NULL);
-
/*
* Now conn refcnt can increase only thru CONN_INC_REF_LOCKED.
* callers from write side can't be there now because close
@@ -4787,7 +4791,29 @@ ip_close(queue_t *q, int flags)
connp->conn_state_flags |= CONN_CONDEMNED;
while (connp->conn_ref != 1)
cv_wait(&connp->conn_cv, &connp->conn_lock);
+ connp->conn_state_flags |= CONN_QUIESCED;
mutex_exit(&connp->conn_lock);
+}
+
+/* ARGSUSED */
+int
+ip_close(queue_t *q, int flags)
+{
+ conn_t *connp;
+
+ TRACE_1(TR_FAC_IP, TR_IP_CLOSE, "ip_close: q %p", q);
+
+ /*
+ * Call the appropriate delete routine depending on whether this is
+ * a module or device.
+ */
+ if (WR(q)->q_next != NULL) {
+ /* This is a module close */
+ return (ip_modclose((ill_t *)q->q_ptr));
+ }
+
+ connp = q->q_ptr;
+ ip_quiesce_conn(connp);
qprocsoff(q);
@@ -4801,6 +4827,15 @@ ip_close(queue_t *q, int flags)
* has completed, and service has completed or won't run in
* future.
*/
+ ASSERT(connp->conn_ref == 1);
+
+ /*
+ * A conn which was previously marked as IPCL_UDP cannot
+ * retain the flag because it would have been cleared by
+ * udp_close().
+ */
+ ASSERT(!IPCL_IS_UDP(connp));
+
if (connp->conn_latch != NULL) {
IPLATCH_REFRELE(connp->conn_latch);
connp->conn_latch = NULL;
@@ -4827,6 +4862,83 @@ ip_close(queue_t *q, int flags)
return (0);
}
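/*
 * Editorial sketch of the close ordering described in the comments above
 * (illustrative, conn-based endpoints only):
 *
 *	ip_quiesce_conn(connp)		called from ip_close()/udp_close()
 *	    ipcl_hash_remove(connp)	no new packets can find the conn
 *	    wait for conn_ref == 1	in-flight threads drain out
 *	qprocsoff(q)			now safe: no remaining putnext sources
 */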
+int
+ip_snmpmod_close(queue_t *q)
+{
+ conn_t *connp = Q_TO_CONN(q);
+ ASSERT(connp->conn_flags & (IPCL_TCPMOD | IPCL_UDPMOD));
+
+ qprocsoff(q);
+
+ if (connp->conn_flags & IPCL_UDPMOD)
+ udp_close_free(connp);
+
+ if (connp->conn_cred != NULL) {
+ crfree(connp->conn_cred);
+ connp->conn_cred = NULL;
+ }
+ CONN_DEC_REF(connp);
+ q->q_ptr = WR(q)->q_ptr = NULL;
+ return (0);
+}
+
+/*
+ * Write side put procedure for TCP module or UDP module instance. TCP/UDP
+ * as a module is only used for MIB browsers that push TCP/UDP over IP or ARP.
+ * The only supported primitives are T_SVR4_OPTMGMT_REQ and T_OPTMGMT_REQ.
+ * M_FLUSH messages and ioctls are only passed downstream; we don't flush our
+ * queues as we never enqueue messages there and we don't handle any ioctls.
+ * Everything else is freed.
+ */
+void
+ip_snmpmod_wput(queue_t *q, mblk_t *mp)
+{
+ conn_t *connp = q->q_ptr;
+ pfi_t setfn;
+ pfi_t getfn;
+
+ ASSERT(connp->conn_flags & (IPCL_TCPMOD | IPCL_UDPMOD));
+
+ switch (DB_TYPE(mp)) {
+ case M_PROTO:
+ case M_PCPROTO:
+ if ((MBLKL(mp) >= sizeof (t_scalar_t)) &&
+ ((((union T_primitives *)mp->b_rptr)->type ==
+ T_SVR4_OPTMGMT_REQ) ||
+ (((union T_primitives *)mp->b_rptr)->type ==
+ T_OPTMGMT_REQ))) {
+ /*
+ * This is the only TPI primitive supported. Its
+ * handling does not require tcp_t, but it does require
+ * conn_t to check permissions.
+ */
+ cred_t *cr = DB_CREDDEF(mp, connp->conn_cred);
+
+ if (connp->conn_flags & IPCL_TCPMOD) {
+ setfn = tcp_snmp_set;
+ getfn = tcp_snmp_get;
+ } else {
+ setfn = udp_snmp_set;
+ getfn = udp_snmp_get;
+ }
+ if (!snmpcom_req(q, mp, setfn, getfn, cr)) {
+ freemsg(mp);
+ return;
+ }
+ } else if ((mp = mi_tpi_err_ack_alloc(mp, TPROTO, ENOTSUP))
+ != NULL)
+ qreply(q, mp);
+ break;
+ case M_FLUSH:
+ case M_IOCTL:
+ putnext(q, mp);
+ break;
+ default:
+ freemsg(mp);
+ break;
+ }
+}
+
/* Return the IP checksum for the IP header at "ipha". */
uint16_t
ip_csum_hdr(ipha_t *ipha)
@@ -5081,7 +5193,7 @@ ip_dot_saddr(uchar_t *addr, char *buf)
* Send an ICMP error after patching up the packet appropriately. Returns
 * B_TRUE if the appropriate MIB should be bumped; B_FALSE otherwise.
*/
-static int
+static boolean_t
ip_fanout_send_icmp(queue_t *q, mblk_t *mp, uint_t flags,
uint_t icmp_type, uint_t icmp_code, boolean_t mctl_present, zoneid_t zoneid)
{
@@ -5103,8 +5215,8 @@ ip_fanout_send_icmp(queue_t *q, mblk_t *mp, uint_t flags,
* ipsec_check_global_policy() assumes M_DATA as clear
* and M_CTL as secure.
*/
- db_type = mp->b_datap->db_type;
- mp->b_datap->db_type = M_DATA;
+ db_type = DB_TYPE(mp);
+ DB_TYPE(mp) = M_DATA;
secure = B_FALSE;
}
/*
@@ -5119,17 +5231,17 @@ ip_fanout_send_icmp(queue_t *q, mblk_t *mp, uint_t flags,
first_mp = ipsec_check_global_policy(first_mp, NULL,
ipha, NULL, mctl_present);
if (first_mp == NULL)
- return (0);
+ return (B_FALSE);
}
if (!mctl_present)
- mp->b_datap->db_type = db_type;
+ DB_TYPE(mp) = db_type;
if (flags & IP_FF_SEND_ICMP) {
if (flags & IP_FF_HDR_COMPLETE) {
if (ip_hdr_complete(ipha, zoneid)) {
freemsg(first_mp);
- return (1);
+ return (B_TRUE);
}
}
if (flags & IP_FF_CKSUM) {
@@ -5152,10 +5264,10 @@ ip_fanout_send_icmp(queue_t *q, mblk_t *mp, uint_t flags,
}
} else {
freemsg(first_mp);
- return (0);
+ return (B_FALSE);
}
- return (1);
+ return (B_TRUE);
}
#ifdef DEBUG
@@ -5592,7 +5704,7 @@ ip_fanout_tcp(queue_t *q, mblk_t *mp, ill_t *recv_ill, ipha_t *ipha,
}
mp->b_datap->db_struioflag |= STRUIO_EAGER;
- mp->b_datap->db_cksumstart = (intptr_t)sqp;
+ DB_CKSUMSTART(mp) = (intptr_t)sqp;
syn_present = B_TRUE;
}
}
@@ -5720,7 +5832,6 @@ ip_fanout_udp_conn(conn_t *connp, mblk_t *first_mp, mblk_t *mp,
boolean_t secure, ipha_t *ipha, uint_t flags, ill_t *recv_ill,
boolean_t ip_policy)
{
- queue_t *rq = connp->conn_rq;
boolean_t mctl_present = (first_mp != NULL);
uint32_t in_flags = 0; /* set to IP_RECVSLLA and/or IP_RECVIF */
uint32_t ill_index;
@@ -5730,7 +5841,7 @@ ip_fanout_udp_conn(conn_t *connp, mblk_t *first_mp, mblk_t *mp,
else
first_mp = mp;
- if (!canputnext(rq)) {
+ if (CONN_UDP_FLOWCTLD(connp)) {
BUMP_MIB(&ip_mib, udpInOverflows);
freemsg(first_mp);
return;
@@ -5776,7 +5887,9 @@ ip_fanout_udp_conn(conn_t *connp, mblk_t *first_mp, mblk_t *mp,
mp = ip_add_info(mp, recv_ill, in_flags);
}
BUMP_MIB(&ip_mib, ipInDelivers);
- putnext(rq, mp);
+
+ /* Send it upstream */
+ CONN_UDP_RECV(connp, mp);
}
/*
@@ -8454,7 +8567,6 @@ ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
return (ip_modopen(q, devp, flag, sflag, credp));
}
-
/*
* We are opening as a device. This is an IP client stream, and we
* allocate an conn_t as the instance data.
@@ -8463,6 +8575,9 @@ ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
connp->conn_upq = q;
q->q_ptr = WR(q)->q_ptr = connp;
+ if (flag & SO_SOCKSTR)
+ connp->conn_flags |= IPCL_SOCKET;
+
/* Minor tells us which /dev entry was opened */
if (geteminor(*devp) == IPV6_MINOR) {
connp->conn_flags |= IPCL_ISV6;
@@ -8474,9 +8589,7 @@ ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
connp->conn_pkt_isv6 = B_FALSE;
}
-
- if ((connp->conn_dev =
- inet_minor_alloc(ip_minor_arena)) == 0) {
+ if ((connp->conn_dev = inet_minor_alloc(ip_minor_arena)) == 0) {
q->q_ptr = WR(q)->q_ptr = NULL;
CONN_DEC_REF(connp);
return (EBUSY);
@@ -10734,381 +10847,455 @@ ip_udp_check(queue_t *q, conn_t *connp, ill_t *ill, ipha_t *ipha,
}
/*
- * Do fragmentation reassembly.
- * returns B_TRUE if successful else B_FALSE.
+ * Fragmentation reassembly. Each ILL has a hash table for
+ * queuing packets undergoing reassembly for all IPIFs
+ * associated with the ILL. The hash is based on the packet
+ * IP ident field. The ILL frag hash table was allocated
+ * as a timer block at the time the ILL was created. Whenever
+ * there is anything on the reassembly queue, the timer will
+ * be running. Returns B_TRUE if successful, B_FALSE otherwise;
* frees mp on failure.
*/
static boolean_t
-ip_rput_fragment(queue_t *q, mblk_t **mpp, ipha_t *ipha)
+ip_rput_fragment(queue_t *q, mblk_t **mpp, ipha_t *ipha,
+ uint32_t *cksum_val, uint16_t *cksum_flags)
{
uint32_t frag_offset_flags;
- ill_t *ill = (ill_t *)q->q_ptr;
- mblk_t *mp = *mpp;
- mblk_t *t_mp;
+ ill_t *ill = (ill_t *)q->q_ptr;
+ mblk_t *mp = *mpp;
+ mblk_t *t_mp;
ipaddr_t dst;
+ uint8_t proto = ipha->ipha_protocol;
+ uint32_t sum_val;
+ uint16_t sum_flags;
+ ipf_t *ipf;
+ ipf_t **ipfp;
+ ipfb_t *ipfb;
+ uint16_t ident;
+ uint32_t offset;
+ ipaddr_t src;
+ uint_t hdr_length;
+ uint32_t end;
+ mblk_t *mp1;
+ mblk_t *tail_mp;
+ size_t count;
+ size_t msg_len;
+ uint8_t ecn_info = 0;
+ uint32_t packet_size;
+ boolean_t pruned = B_FALSE;
+
+ if (cksum_val != NULL)
+ *cksum_val = 0;
+ if (cksum_flags != NULL)
+ *cksum_flags = 0;
/*
	 * Drop the fragment as early as possible if we don't
	 * have the resources to reassemble it.
*/
-
if (ip_reass_queue_bytes == 0) {
freemsg(mp);
return (B_FALSE);
}
- dst = ipha->ipha_dst;
-
- /* Clear hardware checksumming flag if set */
- mp->b_datap->db_struioun.cksum.flags = 0;
+ /* Check for fragmentation offset; return if there's none */
+ if ((frag_offset_flags = ntohs(ipha->ipha_fragment_offset_and_flags) &
+ (IPH_MF | IPH_OFFSET)) == 0)
+ return (B_TRUE);
- /* Check for fragmentation offset. */
- frag_offset_flags = ntohs(ipha->ipha_fragment_offset_and_flags) &
- (IPH_MF | IPH_OFFSET);
- if (frag_offset_flags) {
- ipf_t *ipf;
- ipf_t **ipfp;
- ipfb_t *ipfb;
- uint16_t ident;
- uint32_t offset;
- ipaddr_t src;
- uint_t hdr_length;
- uint32_t end;
- uint8_t proto;
- mblk_t *mp1;
- mblk_t *tail_mp;
- size_t count;
- size_t msg_len;
- uint8_t ecn_info = 0;
- uint32_t packet_size;
- boolean_t pruned = B_FALSE;
-
- ident = ipha->ipha_ident;
- offset = (frag_offset_flags << 3) & 0xFFFF;
- src = ipha->ipha_src;
- hdr_length = IPH_HDR_LENGTH(ipha);
- end = ntohs(ipha->ipha_length) - hdr_length;
+ /*
+	 * We utilize hardware-computed checksum info only for UDP, since
+	 * IP fragmentation is a normal occurrence for the protocol. In
+ * addition, checksum offload support for IP fragments carrying
+ * UDP payload is commonly implemented across network adapters.
+ */
+ ASSERT(ill != NULL);
+ if (proto == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(ill) &&
+ (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) {
+ mblk_t *mp1 = mp->b_cont;
+ int32_t len;
+
+ /* Record checksum information from the packet */
+ sum_val = (uint32_t)DB_CKSUM16(mp);
+ sum_flags = DB_CKSUMFLAGS(mp);
+
+ /* IP payload offset from beginning of mblk */
+ offset = ((uchar_t *)ipha + IPH_HDR_LENGTH(ipha)) - mp->b_rptr;
+
+ if ((sum_flags & HCK_PARTIALCKSUM) &&
+ (mp1 == NULL || mp1->b_cont == NULL) &&
+ offset >= DB_CKSUMSTART(mp) &&
+ ((len = offset - DB_CKSUMSTART(mp)) & 1) == 0) {
+ uint32_t adj;
+ /*
+ * Partial checksum has been calculated by hardware
+ * and attached to the packet; in addition, any
+ * prepended extraneous data is even byte aligned.
+ * If any such data exists, we adjust the checksum;
+ * this would also handle any postpended data.
+ */
+ IP_ADJCKSUM_PARTIAL(mp->b_rptr + DB_CKSUMSTART(mp),
+ mp, mp1, len, adj);
- /*
- * if end == 0 then we have a packet with no data, so just
- * free it.
- */
- if (end == 0) {
- freemsg(mp);
- return (B_FALSE);
+ /* One's complement subtract extraneous checksum */
+ if (adj >= sum_val)
+ sum_val = ~(adj - sum_val) & 0xFFFF;
+ else
+ sum_val -= adj;
}
- proto = ipha->ipha_protocol;
+ } else {
+ sum_val = 0;
+ sum_flags = 0;
+ }
- /*
- * Fragmentation reassembly. Each ILL has a hash table for
- * queuing packets undergoing reassembly for all IPIFs
- * associated with the ILL. The hash is based on the packet
- * IP ident field. The ILL frag hash table was allocated
- * as a timer block at the time the ILL was created. Whenever
- * there is anything on the reassembly queue, the timer will
- * be running.
- */
- ASSERT(ill != NULL);
+ /* Clear hardware checksumming flag */
+ DB_CKSUMFLAGS(mp) = 0;
- /* Record the ECN field info. */
- ecn_info = (ipha->ipha_type_of_service & 0x3);
- if (offset != 0) {
- /*
- * If this isn't the first piece, strip the header, and
- * add the offset to the end value.
- */
- mp->b_rptr += hdr_length;
- end += offset;
- }
+ ident = ipha->ipha_ident;
+ offset = (frag_offset_flags << 3) & 0xFFFF;
+ src = ipha->ipha_src;
+ dst = ipha->ipha_dst;
+ hdr_length = IPH_HDR_LENGTH(ipha);
+ end = ntohs(ipha->ipha_length) - hdr_length;
- msg_len = mp->b_datap->db_lim - mp->b_datap->db_base;
- tail_mp = mp;
- while (tail_mp->b_cont != NULL) {
- tail_mp = tail_mp->b_cont;
- msg_len += tail_mp->b_datap->db_lim -
- tail_mp->b_datap->db_base;
- }
+ /* If end == 0 then we have a packet with no data, so just free it */
+ if (end == 0) {
+ freemsg(mp);
+ return (B_FALSE);
+ }
+ /* Record the ECN field info. */
+ ecn_info = (ipha->ipha_type_of_service & 0x3);
+ if (offset != 0) {
/*
- * If the reassembly list for this ILL will get too big
- * prune it.
+ * If this isn't the first piece, strip the header, and
+ * add the offset to the end value.
*/
- if ((msg_len + sizeof (*ipf) + ill->ill_frag_count) >=
- ip_reass_queue_bytes) {
- ill_frag_prune(ill,
- (ip_reass_queue_bytes < msg_len) ? 0 :
- (ip_reass_queue_bytes - msg_len));
- pruned = B_TRUE;
- }
+ mp->b_rptr += hdr_length;
+ end += offset;
+ }
- ipfb = &ill->ill_frag_hash_tbl[ILL_FRAG_HASH(src, ident)];
- mutex_enter(&ipfb->ipfb_lock);
+ msg_len = MBLKSIZE(mp);
+ tail_mp = mp;
+ while (tail_mp->b_cont != NULL) {
+ tail_mp = tail_mp->b_cont;
+ msg_len += MBLKSIZE(tail_mp);
+ }
- ipfp = &ipfb->ipfb_ipf;
- /* Try to find an existing fragment queue for this packet. */
- for (;;) {
- ipf = ipfp[0];
- if (ipf != NULL) {
- /*
- * It has to match on ident and src/dst address.
- */
- if (ipf->ipf_ident == ident &&
- ipf->ipf_src == src &&
- ipf->ipf_dst == dst &&
- ipf->ipf_protocol == proto) {
- /*
- * If we have received too many
- * duplicate fragments for this packet
- * free it.
- */
- if (ipf->ipf_num_dups >
- ip_max_frag_dups) {
- ill_frag_free_pkts(ill, ipfb,
- ipf, 1);
- freemsg(mp);
- mutex_exit(&ipfb->ipfb_lock);
- return (B_FALSE);
- }
- /* Found it. */
- break;
- }
- ipfp = &ipf->ipf_hash_next;
- continue;
- }
+ /* If the reassembly list for this ILL will get too big, prune it */
+ if ((msg_len + sizeof (*ipf) + ill->ill_frag_count) >=
+ ip_reass_queue_bytes) {
+ ill_frag_prune(ill,
+ (ip_reass_queue_bytes < msg_len) ? 0 :
+ (ip_reass_queue_bytes - msg_len));
+ pruned = B_TRUE;
+ }
+
+ ipfb = &ill->ill_frag_hash_tbl[ILL_FRAG_HASH(src, ident)];
+ mutex_enter(&ipfb->ipfb_lock);
+ ipfp = &ipfb->ipfb_ipf;
+ /* Try to find an existing fragment queue for this packet. */
+ for (;;) {
+ ipf = ipfp[0];
+ if (ipf != NULL) {
/*
- * If we pruned the list, do we want to store this new
- * fragment?. We apply an optimization here based on the
- * fact that most fragments will be received in order.
- * So if the offset of this incoming fragment is zero,
- * it is the first fragment of a new packet. We will
- * keep it. Otherwise drop the fragment, as we have
- * probably pruned the packet already (since the
- * packet cannot be found).
+ * It has to match on ident and src/dst address.
*/
- if (pruned && offset != 0) {
- mutex_exit(&ipfb->ipfb_lock);
- freemsg(mp);
- return (B_FALSE);
- }
-
- if (ipfb->ipfb_frag_pkts >= MAX_FRAG_PKTS) {
+ if (ipf->ipf_ident == ident &&
+ ipf->ipf_src == src &&
+ ipf->ipf_dst == dst &&
+ ipf->ipf_protocol == proto) {
/*
- * Too many fragmented packets in this hash
- * bucket. Free the oldest.
+ * If we have received too many
+ * duplicate fragments for this packet
+ * free it.
*/
- ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf,
- 1);
- }
-
- /* New guy. Allocate a frag message. */
- mp1 = allocb(sizeof (*ipf), BPRI_MED);
- if (mp1 == NULL) {
- BUMP_MIB(&ip_mib, ipInDiscards);
- freemsg(mp);
-reass_done:
- mutex_exit(&ipfb->ipfb_lock);
- return (B_FALSE);
+ if (ipf->ipf_num_dups > ip_max_frag_dups) {
+ ill_frag_free_pkts(ill, ipfb, ipf, 1);
+ freemsg(mp);
+ mutex_exit(&ipfb->ipfb_lock);
+ return (B_FALSE);
+ }
+ /* Found it. */
+ break;
}
+ ipfp = &ipf->ipf_hash_next;
+ continue;
+ }
+ /*
+ * If we pruned the list, do we want to store this new
+	 * fragment? We apply an optimization here based on the
+ * fact that most fragments will be received in order.
+ * So if the offset of this incoming fragment is zero,
+ * it is the first fragment of a new packet. We will
+ * keep it. Otherwise drop the fragment, as we have
+ * probably pruned the packet already (since the
+ * packet cannot be found).
+ */
+ if (pruned && offset != 0) {
+ mutex_exit(&ipfb->ipfb_lock);
+ freemsg(mp);
+ return (B_FALSE);
+ }
- BUMP_MIB(&ip_mib, ipReasmReqds);
- mp1->b_cont = mp;
-
- /* Initialize the fragment header. */
- ipf = (ipf_t *)mp1->b_rptr;
- ipf->ipf_mp = mp1;
- ipf->ipf_ptphn = ipfp;
- ipfp[0] = ipf;
- ipf->ipf_hash_next = NULL;
- ipf->ipf_ident = ident;
- ipf->ipf_protocol = proto;
- ipf->ipf_src = src;
- ipf->ipf_dst = dst;
- ipf->ipf_nf_hdr_len = 0;
- /* Record reassembly start time. */
- ipf->ipf_timestamp = gethrestime_sec();
- /* Record ipf generation and account for frag header */
- ipf->ipf_gen = ill->ill_ipf_gen++;
- ipf->ipf_count = mp1->b_datap->db_lim -
- mp1->b_datap->db_base;
- ipf->ipf_last_frag_seen = B_FALSE;
- ipf->ipf_ecn = ecn_info;
- ipf->ipf_num_dups = 0;
- ipfb->ipfb_frag_pkts++;
-
+ if (ipfb->ipfb_frag_pkts >= MAX_FRAG_PKTS) {
/*
- * We handle reassembly two ways. In the easy case,
- * where all the fragments show up in order, we do
- * minimal bookkeeping, and just clip new pieces on
- * the end. If we ever see a hole, then we go off
- * to ip_reassemble which has to mark the pieces and
- * keep track of the number of holes, etc. Obviously,
- * the point of having both mechanisms is so we can
- * handle the easy case as efficiently as possible.
+ * Too many fragmented packets in this hash
+ * bucket. Free the oldest.
*/
- if (offset == 0) {
- /* Easy case, in-order reassembly so far. */
- ipf->ipf_count += msg_len;
- ipf->ipf_tail_mp = tail_mp;
- /*
- * Keep track of next expected offset in
- * ipf_end.
- */
- ipf->ipf_end = end;
- ipf->ipf_nf_hdr_len = hdr_length;
- } else {
- /* Hard case, hole at the beginning. */
- ipf->ipf_tail_mp = NULL;
- /*
- * ipf_end == 0 means that we have given up
- * on easy reassembly.
- */
- ipf->ipf_end = 0;
- /*
- * ipf_hole_cnt is set by ip_reassemble.
- * ipf_count is updated by ip_reassemble.
- * No need to check for return value here
- * as we don't expect reassembly to complete
- * or fail for the first fragment itself.
- */
- (void) ip_reassemble(mp, ipf,
- (frag_offset_flags & IPH_OFFSET) << 3,
- (frag_offset_flags & IPH_MF), ill, msg_len);
- }
- /* Update per ipfb and ill byte counts */
- ipfb->ipfb_count += ipf->ipf_count;
- ASSERT(ipfb->ipfb_count > 0); /* Wraparound */
- ill->ill_frag_count += ipf->ipf_count;
- ASSERT(ill->ill_frag_count > 0); /* Wraparound */
- /* If the frag timer wasn't already going, start it. */
- mutex_enter(&ill->ill_lock);
- ill_frag_timer_start(ill);
- mutex_exit(&ill->ill_lock);
- goto reass_done;
+ ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 1);
}
- /*
- * We have a new piece of a datagram which is already being
- * reassembled. Update the ECN info if all IP fragments
- * are ECN capable. If there is one which is not, clear
- * all the info. If there is at least one which has CE
- * code point, IP needs to report that up to transport.
- */
- if (ecn_info != IPH_ECN_NECT && ipf->ipf_ecn != IPH_ECN_NECT) {
- if (ecn_info == IPH_ECN_CE)
- ipf->ipf_ecn = IPH_ECN_CE;
- } else {
- ipf->ipf_ecn = IPH_ECN_NECT;
+ /* New guy. Allocate a frag message. */
+ mp1 = allocb(sizeof (*ipf), BPRI_MED);
+ if (mp1 == NULL) {
+ BUMP_MIB(&ip_mib, ipInDiscards);
+ freemsg(mp);
+reass_done:
+ mutex_exit(&ipfb->ipfb_lock);
+ return (B_FALSE);
}
- if (offset && ipf->ipf_end == offset) {
- /* The new fragment fits at the end */
- ipf->ipf_tail_mp->b_cont = mp;
- /* Update the byte count */
+
+ BUMP_MIB(&ip_mib, ipReasmReqds);
+ mp1->b_cont = mp;
+
+ /* Initialize the fragment header. */
+ ipf = (ipf_t *)mp1->b_rptr;
+ ipf->ipf_mp = mp1;
+ ipf->ipf_ptphn = ipfp;
+ ipfp[0] = ipf;
+ ipf->ipf_hash_next = NULL;
+ ipf->ipf_ident = ident;
+ ipf->ipf_protocol = proto;
+ ipf->ipf_src = src;
+ ipf->ipf_dst = dst;
+ ipf->ipf_nf_hdr_len = 0;
+ /* Record reassembly start time. */
+ ipf->ipf_timestamp = gethrestime_sec();
+ /* Record ipf generation and account for frag header */
+ ipf->ipf_gen = ill->ill_ipf_gen++;
+ ipf->ipf_count = MBLKSIZE(mp1);
+ ipf->ipf_last_frag_seen = B_FALSE;
+ ipf->ipf_ecn = ecn_info;
+ ipf->ipf_num_dups = 0;
+ ipfb->ipfb_frag_pkts++;
+ ipf->ipf_checksum = 0;
+ ipf->ipf_checksum_flags = 0;
+
+ /* Store checksum value in fragment header */
+ if (sum_flags != 0) {
+ sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
+ sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
+ ipf->ipf_checksum = sum_val;
+ ipf->ipf_checksum_flags = sum_flags;
+ }
+
+ /*
+ * We handle reassembly two ways. In the easy case,
+ * where all the fragments show up in order, we do
+ * minimal bookkeeping, and just clip new pieces on
+ * the end. If we ever see a hole, then we go off
+ * to ip_reassemble which has to mark the pieces and
+ * keep track of the number of holes, etc. Obviously,
+ * the point of having both mechanisms is so we can
+ * handle the easy case as efficiently as possible.
+ */
+ if (offset == 0) {
+ /* Easy case, in-order reassembly so far. */
ipf->ipf_count += msg_len;
- /* Update per ipfb and ill byte counts */
- ipfb->ipfb_count += msg_len;
- ASSERT(ipfb->ipfb_count > 0); /* Wraparound */
- ill->ill_frag_count += msg_len;
- ASSERT(ill->ill_frag_count > 0); /* Wraparound */
- if (frag_offset_flags & IPH_MF) {
- /* More to come. */
- ipf->ipf_end = end;
- ipf->ipf_tail_mp = tail_mp;
- goto reass_done;
- }
+ ipf->ipf_tail_mp = tail_mp;
+ /*
+ * Keep track of next expected offset in
+ * ipf_end.
+ */
+ ipf->ipf_end = end;
+ ipf->ipf_nf_hdr_len = hdr_length;
} else {
- /* Go do the hard cases. */
- int ret;
+ /* Hard case, hole at the beginning. */
+ ipf->ipf_tail_mp = NULL;
+ /*
+ * ipf_end == 0 means that we have given up
+ * on easy reassembly.
+ */
+ ipf->ipf_end = 0;
- if (offset == 0)
- ipf->ipf_nf_hdr_len = hdr_length;
+ /* Forget checksum offload from now on */
+ ipf->ipf_checksum_flags = 0;
- /* Save current byte count */
- count = ipf->ipf_count;
- ret = ip_reassemble(mp, ipf,
+ /*
+ * ipf_hole_cnt is set by ip_reassemble.
+ * ipf_count is updated by ip_reassemble.
+ * No need to check for return value here
+ * as we don't expect reassembly to complete
+ * or fail for the first fragment itself.
+ */
+ (void) ip_reassemble(mp, ipf,
(frag_offset_flags & IPH_OFFSET) << 3,
(frag_offset_flags & IPH_MF), ill, msg_len);
- /* Count of bytes added and subtracted (freeb()ed) */
- count = ipf->ipf_count - count;
- if (count) {
- /* Update per ipfb and ill byte counts */
- ipfb->ipfb_count += count;
- ASSERT(ipfb->ipfb_count > 0); /* Wraparound */
- ill->ill_frag_count += count;
- ASSERT(ill->ill_frag_count > 0);
- }
- if (ret == IP_REASS_PARTIAL) {
- goto reass_done;
- } else if (ret == IP_REASS_FAILED) {
- /* Reassembly failed. Free up all resources */
- ill_frag_free_pkts(ill, ipfb, ipf, 1);
- for (t_mp = mp; t_mp != NULL;
- t_mp = t_mp->b_cont) {
- IP_REASS_SET_START(t_mp, 0);
- IP_REASS_SET_END(t_mp, 0);
- }
- freemsg(mp);
- goto reass_done;
- }
- /* We will reach here iff 'ret' is IP_REASS_COMPLETE */
}
- /*
- * We have completed reassembly. Unhook the frag header from
- * the reassembly list.
- *
- * Before we free the frag header, record the ECN info
- * to report back to the transport.
- */
- ecn_info = ipf->ipf_ecn;
- BUMP_MIB(&ip_mib, ipReasmOKs);
- ipfp = ipf->ipf_ptphn;
- mp1 = ipf->ipf_mp;
- count = ipf->ipf_count;
- ipf = ipf->ipf_hash_next;
- if (ipf)
- ipf->ipf_ptphn = ipfp;
- ipfp[0] = ipf;
- ill->ill_frag_count -= count;
- ASSERT(ipfb->ipfb_count >= count);
- ipfb->ipfb_count -= count;
- ipfb->ipfb_frag_pkts--;
- mutex_exit(&ipfb->ipfb_lock);
- /* Ditch the frag header. */
- mp = mp1->b_cont;
+ /* Update per ipfb and ill byte counts */
+ ipfb->ipfb_count += ipf->ipf_count;
+ ASSERT(ipfb->ipfb_count > 0); /* Wraparound */
+ ill->ill_frag_count += ipf->ipf_count;
+ ASSERT(ill->ill_frag_count > 0); /* Wraparound */
+ /* If the frag timer wasn't already going, start it. */
+ mutex_enter(&ill->ill_lock);
+ ill_frag_timer_start(ill);
+ mutex_exit(&ill->ill_lock);
+ goto reass_done;
+ }
- freeb(mp1);
+ /*
+	 * If the packet's checksum flags have changed (it could be coming
+	 * up from an interface different from the previous one, and thus
+	 * with different checksum capabilities), then forget any stored
+	 * checksum state. Otherwise add the value to the existing one
+	 * stored in the fragment header.
+ */
+ if (sum_flags != 0 && sum_flags == ipf->ipf_checksum_flags) {
+ sum_val += ipf->ipf_checksum;
+ sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
+ sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
+ ipf->ipf_checksum = sum_val;
+ } else if (ipf->ipf_checksum_flags != 0) {
+ /* Forget checksum offload from now on */
+ ipf->ipf_checksum_flags = 0;
+ }
- /* Restore original IP length in header. */
- packet_size = (uint32_t)msgdsize(mp);
- if (packet_size > IP_MAXPACKET) {
- freemsg(mp);
- BUMP_MIB(&ip_mib, ipInHdrErrors);
- return (B_FALSE);
+ /*
+ * We have a new piece of a datagram which is already being
+ * reassembled. Update the ECN info if all IP fragments
+ * are ECN capable. If there is one which is not, clear
+ * all the info. If there is at least one which has CE
+ * code point, IP needs to report that up to transport.
+ */
+ if (ecn_info != IPH_ECN_NECT && ipf->ipf_ecn != IPH_ECN_NECT) {
+ if (ecn_info == IPH_ECN_CE)
+ ipf->ipf_ecn = IPH_ECN_CE;
+ } else {
+ ipf->ipf_ecn = IPH_ECN_NECT;
+ }
+ if (offset && ipf->ipf_end == offset) {
+ /* The new fragment fits at the end */
+ ipf->ipf_tail_mp->b_cont = mp;
+ /* Update the byte count */
+ ipf->ipf_count += msg_len;
+ /* Update per ipfb and ill byte counts */
+ ipfb->ipfb_count += msg_len;
+ ASSERT(ipfb->ipfb_count > 0); /* Wraparound */
+ ill->ill_frag_count += msg_len;
+ ASSERT(ill->ill_frag_count > 0); /* Wraparound */
+ if (frag_offset_flags & IPH_MF) {
+ /* More to come. */
+ ipf->ipf_end = end;
+ ipf->ipf_tail_mp = tail_mp;
+ goto reass_done;
}
+ } else {
+ /* Go do the hard cases. */
+ int ret;
- if (mp->b_datap->db_ref > 1) {
- mblk_t *mp2;
+ if (offset == 0)
+ ipf->ipf_nf_hdr_len = hdr_length;
- mp2 = copymsg(mp);
- freemsg(mp);
- if (!mp2) {
- BUMP_MIB(&ip_mib, ipInDiscards);
- return (B_FALSE);
+ /* Save current byte count */
+ count = ipf->ipf_count;
+ ret = ip_reassemble(mp, ipf,
+ (frag_offset_flags & IPH_OFFSET) << 3,
+ (frag_offset_flags & IPH_MF), ill, msg_len);
+ /* Count of bytes added and subtracted (freeb()ed) */
+ count = ipf->ipf_count - count;
+ if (count) {
+ /* Update per ipfb and ill byte counts */
+ ipfb->ipfb_count += count;
+ ASSERT(ipfb->ipfb_count > 0); /* Wraparound */
+ ill->ill_frag_count += count;
+ ASSERT(ill->ill_frag_count > 0);
+ }
+ if (ret == IP_REASS_PARTIAL) {
+ goto reass_done;
+ } else if (ret == IP_REASS_FAILED) {
+ /* Reassembly failed. Free up all resources */
+ ill_frag_free_pkts(ill, ipfb, ipf, 1);
+ for (t_mp = mp; t_mp != NULL; t_mp = t_mp->b_cont) {
+ IP_REASS_SET_START(t_mp, 0);
+ IP_REASS_SET_END(t_mp, 0);
}
- mp = mp2;
+ freemsg(mp);
+ goto reass_done;
}
- ipha = (ipha_t *)mp->b_rptr;
+ /* We will reach here iff 'ret' is IP_REASS_COMPLETE */
+ }
+ /*
+ * We have completed reassembly. Unhook the frag header from
+ * the reassembly list.
+ *
+ * Before we free the frag header, record the ECN info
+ * to report back to the transport.
+ */
+ ecn_info = ipf->ipf_ecn;
+ BUMP_MIB(&ip_mib, ipReasmOKs);
+ ipfp = ipf->ipf_ptphn;
- ipha->ipha_length = htons((uint16_t)packet_size);
- /* We're now complete, zip the frag state */
- ipha->ipha_fragment_offset_and_flags = 0;
- /* Record the ECN info. */
- ipha->ipha_type_of_service &= 0xFC;
- ipha->ipha_type_of_service |= ecn_info;
- *mpp = mp;
+ /* We need to supply these to caller */
+ if ((sum_flags = ipf->ipf_checksum_flags) != 0)
+ sum_val = ipf->ipf_checksum;
+ else
+ sum_val = 0;
+
+ mp1 = ipf->ipf_mp;
+ count = ipf->ipf_count;
+ ipf = ipf->ipf_hash_next;
+ if (ipf != NULL)
+ ipf->ipf_ptphn = ipfp;
+ ipfp[0] = ipf;
+ ill->ill_frag_count -= count;
+ ASSERT(ipfb->ipfb_count >= count);
+ ipfb->ipfb_count -= count;
+ ipfb->ipfb_frag_pkts--;
+ mutex_exit(&ipfb->ipfb_lock);
+ /* Ditch the frag header. */
+ mp = mp1->b_cont;
+
+ freeb(mp1);
+
+ /* Restore original IP length in header. */
+ packet_size = (uint32_t)msgdsize(mp);
+ if (packet_size > IP_MAXPACKET) {
+ freemsg(mp);
+ BUMP_MIB(&ip_mib, ipInHdrErrors);
+ return (B_FALSE);
+ }
+ if (DB_REF(mp) > 1) {
+ mblk_t *mp2 = copymsg(mp);
+
+ freemsg(mp);
+ if (mp2 == NULL) {
+ BUMP_MIB(&ip_mib, ipInDiscards);
+ return (B_FALSE);
+ }
+ mp = mp2;
}
+ ipha = (ipha_t *)mp->b_rptr;
+
+ ipha->ipha_length = htons((uint16_t)packet_size);
+ /* We're now complete, zip the frag state */
+ ipha->ipha_fragment_offset_and_flags = 0;
+ /* Record the ECN info. */
+ ipha->ipha_type_of_service &= 0xFC;
+ ipha->ipha_type_of_service |= ecn_info;
+ *mpp = mp;
+
+ /* Reassembly is successful; return checksum information if needed */
+ if (cksum_val != NULL)
+ *cksum_val = sum_val;
+ if (cksum_flags != NULL)
+ *cksum_flags = sum_flags;
+
return (B_TRUE);
}
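/*
 * Editorial sketch (illustrative): the ipf_checksum bookkeeping above
 * exploits the fact that one's-complement addition is associative and
 * commutative, so per-fragment hardware sums may be merged as fragments
 * arrive, in any order:
 */
static uint32_t
merge_frag_sum(uint32_t acc, uint32_t frag_sum)
{
	uint32_t sum = acc + frag_sum;

	/* Fold twice; the first fold can itself produce a carry */
	sum = (sum & 0xFFFF) + (sum >> 16);
	sum = (sum & 0xFFFF) + (sum >> 16);
	return (sum);
}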
@@ -11156,16 +11343,12 @@ ip_udp_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire,
{
uint32_t sum;
uint32_t u1;
- uint32_t u2;
boolean_t mctl_present;
conn_t *connp;
mblk_t *first_mp;
- mblk_t *mp1;
- dblk_t *dp;
uint16_t *up;
ill_t *ill = (ill_t *)q->q_ptr;
- uint32_t ports;
- boolean_t cksum_computed = B_FALSE;
+ uint16_t reass_hck_flags = 0;
#define rptr ((uchar_t *)ipha)
@@ -11182,19 +11365,13 @@ ip_udp_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire,
IP_SIMPLE_HDR_LENGTH_IN_WORDS);
/* IP options present */
- if (u1)
+ if (u1 != 0)
goto ipoptions;
-#define IS_IPHDR_HWCKSUM(mctl_present, mp, ill) \
- ((!mctl_present) && (mp->b_datap->db_struioun.cksum.flags & \
- HCK_IPV4_HDRCKSUM) && (ill->ill_capabilities & \
- ILL_CAPAB_HCKSUM) && dohwcksum)
-
/* Check the IP header checksum. */
- if (IS_IPHDR_HWCKSUM(mctl_present, mp, ill)) {
+ if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) {
/* Clear the IP header h/w cksum flag */
- mp->b_datap->db_struioun.cksum.flags &=
- ~HCK_IPV4_HDRCKSUM;
+ DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM;
} else {
#define uph ((uint16_t *)ipha)
sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + uph[5] +
@@ -11207,7 +11384,7 @@ ip_udp_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire,
* Don't verify header checksum if this packet is coming
* back from AH/ESP as we already did it.
*/
- if (!mctl_present && (sum && sum != 0xFFFF)) {
+ if (!mctl_present && sum != 0 && sum != 0xFFFF) {
BUMP_MIB(&ip_mib, ipInCksumErrs);
freemsg(first_mp);
return;
@@ -11236,133 +11413,52 @@ ip_udp_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire,
/* packet does not contain complete IP & UDP headers */
if ((mp->b_wptr - rptr) < (IP_SIMPLE_HDR_LENGTH + UDPH_SIZE))
goto udppullup;
+
/* up points to UDP header */
up = (uint16_t *)((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH);
#define iphs ((uint16_t *)ipha)
-#define IP_CKSUM_RECV(len, u1, u2, mp, mp1, error, dp) { \
- boolean_t doswcksum = B_TRUE; \
- uint_t hcksumflags = 0; \
- \
- hcksumflags = dp->db_struioun.cksum.flags; \
- \
- /* Clear the hardware checksum flags; they have been consumed */\
- dp->db_struioun.cksum.flags = 0; \
- if (hcksumflags && (ill->ill_capabilities & ILL_CAPAB_HCKSUM) &&\
- dohwcksum) { \
- if (hcksumflags & HCK_FULLCKSUM) { \
- /* \
- * Full checksum has been computed by the \
- * hardware and has been attached. \
- */ \
- doswcksum = B_FALSE; \
- if (!(hcksumflags & HCK_FULLCKSUM_OK) && \
- (dp->db_cksum16 != 0xffff)) { \
- ipcsumdbg("full hwcksumerr\n", mp); \
- goto error; \
- } \
- } else if ((hcksumflags & HCK_PARTIALCKSUM) && \
- (((len = (IP_SIMPLE_HDR_LENGTH - dp->db_cksumstart))\
- & 1) == 0)) { \
- uint32_t tot_len = 0; \
- \
- doswcksum = B_FALSE; \
- /* Partial checksum computed */ \
- u1 += dp->db_cksum16; \
- tot_len = mp->b_wptr - mp->b_rptr; \
- if (!mp1) \
- mp1 = mp; \
- else \
- tot_len += mp1->b_wptr - mp1->b_rptr; \
- if (len > 0) { \
- /* \
- * Prepended extraneous data. Adjust \
- * checksum. \
- */ \
- u2 = IP_BCSUM_PARTIAL((uchar_t *)(rptr +\
- dp->db_cksumstart), (int32_t)len, \
- 0); \
- } else \
- u2 = 0; \
- if ((len = (dp->db_cksumend - tot_len)) > 0) { \
- /* \
- * Postpended extraneous data. Adjust \
- * checksum. \
- */ \
- uint32_t u3; \
- \
- u3 = IP_BCSUM_PARTIAL(mp1->b_wptr, \
- (int32_t)len, 0); \
- if ((uintptr_t)mp1->b_wptr & 1) \
- /* \
- * Postpended extraneous data \
- * was odd byte aligned, so \
- * swap resulting checksum \
- * bytes. \
- */ \
- u2 += ((u3 << 8) & 0xffff) | \
- (u3 >> 8); \
- else \
- u2 += u3; \
- u2 = (u2 & 0xFFFF) + ((int)(u2) >> 16); \
- } \
- /* \
- * One's complement subtract extraneous checksum\
- */ \
- if (u2 >= u1) \
- u1 = ~(u2 - u1) & 0xFFFF; \
- else \
- u1 -= u2; \
- u1 = (u1 & 0xFFFF) + ((int)u1 >> 16); \
- if (~(u1) & 0xFFFF) { \
- ipcsumdbg("partial hwcksumerr\n", mp); \
- goto error; \
- } \
- } \
- } \
- if (doswcksum) { \
- IP_STAT(ip_in_sw_cksum); \
- if ((IP_CSUM(mp, (int32_t)((uchar_t *)up - \
- (uchar_t *)ipha), u1)) != 0) { \
- ipcsumdbg("swcksumerr\n", mp); \
- goto error; \
- } \
- } \
-}
-
- dp = mp->b_datap;
/* if udp hdr cksum != 0, then need to checksum udp packet */
- if (up[3]) {
- cksum_computed = B_TRUE;
- /* multiple mblks of udp data? */
- if ((mp1 = mp->b_cont) != NULL) {
- /* more than two? */
- if (mp1->b_cont)
- goto multipktudp;
- }
+ if (up[3] != 0) {
+ mblk_t *mp1 = mp->b_cont;
+ boolean_t cksum_err;
+ uint16_t hck_flags = 0;
/* Pseudo-header checksum */
u1 = IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] +
iphs[9] + up[2];
- if (!mctl_present) {
- ssize_t len = 0;
- IP_CKSUM_RECV(len, u1, u2, mp, mp1, udpcksumerr, dp);
- } else {
-multipktudp:
+ /*
+ * Revert to software checksum calculation if the interface
+ * isn't capable of checksum offload or if IPsec is present.
+ */
+ if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum)
+ hck_flags = DB_CKSUMFLAGS(mp);
+
+ if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0)
IP_STAT(ip_in_sw_cksum);
- if ((IP_CSUM(mp, (int32_t)((uchar_t *)up -
- (uchar_t *)ipha), u1)) != 0) {
-udpcksumerr:
- ip1dbg(("ip_udp_input: bad udp checksum\n"));
- BUMP_MIB(&ip_mib, udpInCksumErrs);
- freemsg(first_mp);
- return;
- }
+
+ IP_CKSUM_RECV(hck_flags, u1,
+ (uchar_t *)(rptr + DB_CKSUMSTART(mp)),
+ (int32_t)((uchar_t *)up - rptr),
+ mp, mp1, cksum_err);
+
+ if (cksum_err) {
+ BUMP_MIB(&ip_mib, udpInCksumErrs);
+
+ if (hck_flags & HCK_FULLCKSUM)
+ IP_STAT(ip_udp_in_full_hw_cksum_err);
+ else if (hck_flags & HCK_PARTIALCKSUM)
+ IP_STAT(ip_udp_in_part_hw_cksum_err);
+ else
+ IP_STAT(ip_udp_in_sw_cksum_err);
+
+ freemsg(first_mp);
+ return;
}
}
- /* broadcast IP packet? */
+ /* Non-fragmented broadcast or multicast packet? */
if (ire->ire_type == IRE_BROADCAST)
goto udpslowpath;
@@ -11371,7 +11467,7 @@ udpcksumerr:
ASSERT(connp->conn_upq != NULL);
IP_STAT(ip_udp_fast_path);
- if (!canputnext(connp->conn_upq)) {
+ if (CONN_UDP_FLOWCTLD(connp)) {
freemsg(mp);
BUMP_MIB(&ip_mib, udpInOverflows);
} else {
@@ -11383,7 +11479,8 @@ udpcksumerr:
*/
if (ip_udp_check(q, connp, recv_ill,
ipha, &mp, &first_mp, mctl_present)) {
- putnext(connp->conn_upq, mp);
+ /* Send it upstream */
+ CONN_UDP_RECV(connp, mp);
}
}
/*
@@ -11416,9 +11513,13 @@ ipoptions:
u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
if (u1 & (IPH_MF | IPH_OFFSET)) {
fragmented:
- if (!ip_rput_fragment(q, &mp, ipha)) {
+ /*
+ * "sum" and "reass_hck_flags" are non-zero if the
+		 * reassembled packet has valid hardware-computed
+		 * checksum information associated with it.
+ */
+ if (!ip_rput_fragment(q, &mp, ipha, &sum, &reass_hck_flags))
goto slow_done;
- }
/*
* Make sure that first_mp points back to mp as
* the mp we came in with could have changed in
@@ -11432,7 +11533,7 @@ fragmented:
/* Now we have a complete datagram, destined for this machine. */
u1 = IPH_HDR_LENGTH(ipha);
/* Pull up the UDP header, if necessary. */
- if ((mp->b_wptr - mp->b_rptr) < (u1 + UDPH_SIZE)) {
+ if ((MBLKL(mp)) < (u1 + UDPH_SIZE)) {
udppullup:
if (!pullupmsg(mp, u1 + UDPH_SIZE)) {
BUMP_MIB(&ip_mib, ipInDiscards);
@@ -11441,30 +11542,43 @@ udppullup:
}
ipha = (ipha_t *)mp->b_rptr;
}
+
/*
- * Validate the checksum. This code is a bit funny looking
- * but may help out the compiler in this crucial spot.
+ * Validate the checksum for the reassembled packet; for the
+ * pullup case we calculate the payload checksum in software.
*/
up = (uint16_t *)((uchar_t *)ipha + u1 + UDP_PORTS_OFFSET);
- if (!cksum_computed && up[3]) {
- IP_STAT(ip_in_sw_cksum);
- sum = IP_CSUM(mp, (int32_t)((uchar_t *)up - (uchar_t *)ipha),
- IP_UDP_CSUM_COMP + iphs[6] +
- iphs[7] + iphs[8] +
- iphs[9] + up[2]);
- if (sum != 0) {
- ip1dbg(("ip_udp_input: bad udp checksum\n"));
- BUMP_MIB(&ip_mib, udpInCksumErrs);
- freemsg(first_mp);
- goto slow_done;
+ if (up[3] != 0) {
+ boolean_t cksum_err;
+
+ if ((reass_hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0)
+ IP_STAT(ip_in_sw_cksum);
+
+ IP_CKSUM_RECV_REASS(reass_hck_flags,
+ (int32_t)((uchar_t *)up - (uchar_t *)ipha),
+ IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] +
+ iphs[9] + up[2], sum, cksum_err);
+
+ if (cksum_err) {
+ BUMP_MIB(&ip_mib, udpInCksumErrs);
+
+ if (reass_hck_flags & HCK_FULLCKSUM)
+ IP_STAT(ip_udp_in_full_hw_cksum_err);
+ else if (reass_hck_flags & HCK_PARTIALCKSUM)
+ IP_STAT(ip_udp_in_part_hw_cksum_err);
+ else
+ IP_STAT(ip_udp_in_sw_cksum_err);
+
+ freemsg(first_mp);
+ goto slow_done;
}
}
udpslowpath:
- ports = *(uint32_t *)up;
- /* Clear hardware checksum flag */
- mp->b_datap->db_struioun.cksum.flags = 0;
- ip_fanout_udp(q, first_mp, ill, ipha, ports,
+ /* Clear hardware checksum flag to be safe */
+ DB_CKSUMFLAGS(mp) = 0;
+
+ ip_fanout_udp(q, first_mp, ill, ipha, *(uint32_t *)up,
(ire->ire_type == IRE_BROADCAST),
IP_FF_SEND_ICMP | IP_FF_CKSUM | IP_FF_IP6INFO,
mctl_present, B_TRUE, recv_ill, ire->ire_zoneid);
@@ -11473,6 +11587,7 @@ slow_done:
IP_STAT(ip_udp_slow_path);
return;
+#undef iphs
#undef rptr
}
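/*
 * Editorial sketch: the UDP pseudo-header sum formed above, spelled out.
 * iphs[6..9] are the 16-bit halves of the IPv4 source and destination
 * addresses, up[2] is the UDP length field, and IP_UDP_CSUM_COMP is the
 * byte-order-adjusted protocol constant (removed from ip.c by this
 * change and presumably now defined in a shared checksum header).
 */
static uint32_t
udp_pseudo_sum(const uint16_t *iphs, const uint16_t *up)
{
	return (IP_UDP_CSUM_COMP +
	    iphs[6] + iphs[7] +		/* source address */
	    iphs[8] + iphs[9] +		/* destination address */
	    up[2]);			/* UDP length */
}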
@@ -11485,17 +11600,17 @@ ip_tcp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present,
conn_t *connp;
uint32_t sum;
uint32_t u1;
- uint32_t u2;
uint16_t *up;
int offset;
ssize_t len;
mblk_t *mp1;
- dblk_t *dp;
boolean_t syn_present = B_FALSE;
tcph_t *tcph;
uint_t ip_hdr_len;
ill_t *ill = (ill_t *)q->q_ptr;
zoneid_t zoneid = ire->ire_zoneid;
+ boolean_t cksum_err;
+ uint16_t hck_flags = 0;
#define rptr ((uchar_t *)ipha)
@@ -11514,10 +11629,9 @@ ip_tcp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present,
goto ipoptions;
} else {
/* Check the IP header checksum. */
- if (IS_IPHDR_HWCKSUM(mctl_present, mp, ill)) {
+ if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) {
/* Clear the IP header h/w cksum flag */
- mp->b_datap->db_struioun.cksum.flags &=
- ~HCK_IPV4_HDRCKSUM;
+ DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM;
} else {
#define uph ((uint16_t *)ipha)
sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] +
@@ -11596,30 +11710,32 @@ ip_tcp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present,
#endif
u1 += iphs[6] + iphs[7] + iphs[8] + iphs[9];
-
/*
- * If the packet has gone through AH/ESP, do the checksum here
- * itself.
- *
- * If it has not gone through IPSEC processing and not a duped
- * mblk, then look for driver checksummed mblk. We validate or
- * postpone the checksum to TCP for single copy checksum.
- *
- * Note that we only honor HW cksum in the fastpath.
+ * Revert to software checksum calculation if the interface
+ * isn't capable of checksum offload or if IPsec is present.
*/
- dp = mp->b_datap;
- if (!mctl_present) {
- IP_CKSUM_RECV(len, u1, u2, mp, mp1, tcpcksumerr, dp);
- } else {
+ if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum)
+ hck_flags = DB_CKSUMFLAGS(mp);
+
+ if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0)
IP_STAT(ip_in_sw_cksum);
- if ((IP_CSUM(mp, (int32_t)((uchar_t *)up - rptr),
- u1)) != 0) {
-tcpcksumerr:
- BUMP_MIB(&ip_mib, tcpInErrs);
- ip1dbg(("ip_tcp_input: bad tcp checksum \n"));
- freemsg(first_mp);
- goto slow_done;
- }
+
+ IP_CKSUM_RECV(hck_flags, u1,
+ (uchar_t *)(rptr + DB_CKSUMSTART(mp)),
+ (int32_t)((uchar_t *)up - rptr),
+ mp, mp1, cksum_err);
+
+ if (cksum_err) {
+ BUMP_MIB(&ip_mib, tcpInErrs);
+
+ if (hck_flags & HCK_FULLCKSUM)
+ IP_STAT(ip_tcp_in_full_hw_cksum_err);
+ else if (hck_flags & HCK_PARTIALCKSUM)
+ IP_STAT(ip_tcp_in_part_hw_cksum_err);
+ else
+ IP_STAT(ip_tcp_in_sw_cksum_err);
+
+ goto error;
}
try_again:
@@ -11654,7 +11770,7 @@ try_again:
if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) {
if (IPCL_IS_TCP(connp)) {
mp->b_datap->db_struioflag |= STRUIO_EAGER;
- mp->b_datap->db_cksumstart =
+ DB_CKSUMSTART(mp) =
(intptr_t)ip_squeue_get(ill_ring);
if (IPCL_IS_FULLY_BOUND(connp) && !mctl_present &&
!CONN_INBOUND_POLICY_PRESENT(connp)) {
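/*
 * Editorial note: for an eager SYN the dblk checksum-start field is
 * reused as scratch space to hand the chosen squeue up to TCP.  A
 * hedged sketch of the consuming side (names illustrative; the actual
 * TCP-side code is not part of this hunk):
 */
	squeue_t *sqp;

	ASSERT(mp->b_datap->db_struioflag & STRUIO_EAGER);
	sqp = (squeue_t *)(uintptr_t)DB_CKSUMSTART(mp);	/* stashed by IP */
	DB_CKSUMSTART(mp) = 0;			/* clear before further use */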
@@ -11800,7 +11916,7 @@ ipoptions:
u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
if (u1 & (IPH_MF | IPH_OFFSET)) {
fragmented:
- if (!ip_rput_fragment(q, &mp, ipha)) {
+ if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) {
if (mctl_present)
freeb(first_mp);
goto slow_done;
@@ -11876,9 +11992,10 @@ multipkttcp:
* ICMP's back, then this flag may need to be cleared in
* other places as well.
*/
- mp->b_datap->db_struioun.cksum.flags = 0;
+ DB_CKSUMFLAGS(mp) = 0;
up = (uint16_t *)(rptr + u1 + TCP_PORTS_OFFSET);
+
u1 = (uint32_t)(len - u1); /* TCP datagram length. */
#ifdef _BIG_ENDIAN
u1 += IPPROTO_TCP;
@@ -11890,7 +12007,7 @@ multipkttcp:
* Not M_DATA mblk or its a dup, so do the checksum now.
*/
IP_STAT(ip_in_sw_cksum);
- if (IP_CSUM(mp, (int32_t)((uchar_t *)up - rptr), u1)) {
+ if (IP_CSUM(mp, (int32_t)((uchar_t *)up - rptr), u1) != 0) {
BUMP_MIB(&ip_mib, tcpInErrs);
goto error;
}
@@ -11937,12 +12054,12 @@ ip_sctp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present,
goto ipoptions;
} else {
/* Check the IP header checksum. */
- if (IS_IPHDR_HWCKSUM(mctl_present, mp, ill)) {
+ if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) {
/*
* Since there is no SCTP h/w cksum support yet, just
* clear the flag.
*/
- mp->b_datap->db_struioun.cksum.flags = 0;
+ DB_CKSUMFLAGS(mp) = 0;
} else {
#define uph ((uint16_t *)ipha)
sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] +
@@ -12031,7 +12148,7 @@ no_conn:
return;
ipoptions:
- mp->b_datap->db_struioun.cksum.flags = 0;
+ DB_CKSUMFLAGS(mp) = 0;
if (!ip_options_cksum(q, first_mp, ipha, ire))
goto slow_done;
@@ -12041,7 +12158,7 @@ ipoptions:
u1 = ntohs(ipha->ipha_fragment_offset_and_flags);
if (u1 & (IPH_MF | IPH_OFFSET)) {
fragmented:
- if (!ip_rput_fragment(q, &mp, ipha))
+ if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL))
goto slow_done;
/*
* Make sure that first_mp points back to mp as
@@ -12183,7 +12300,7 @@ ip_rput_noire(queue_t *q, ill_t *in_ill, mblk_t *mp, int ll_multicast,
* Clear the indication that this may have a hardware checksum
* as we are not using it
*/
- mp->b_datap->db_struioun.cksum.flags = 0;
+ DB_CKSUMFLAGS(mp) = 0;
/*
* Now hand the packet to ip_newroute.
@@ -12351,7 +12468,7 @@ ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha,
* Clear the indication that this may have
* hardware checksum as we are not using it.
*/
- mp->b_datap->db_struioun.cksum.flags = 0;
+ DB_CKSUMFLAGS(mp) = 0;
icmp_unreachable(q, mp,
ICMP_SOURCE_ROUTE_FAILED);
ire_refrele(ire);
@@ -12361,7 +12478,7 @@ ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha,
}
/* Packet is being forwarded. Turning off hwcksum flag. */
- mp->b_datap->db_struioun.cksum.flags = 0;
+ DB_CKSUMFLAGS(mp) = 0;
if (ip_g_send_redirects) {
/*
* Check whether the incoming interface and outgoing
@@ -12435,15 +12552,17 @@ ip_rput_process_broadcast(queue_t **qp, mblk_t *mp, ire_t **irep, ipha_t *ipha,
{
queue_t *q;
ire_t *ire;
+ uint16_t hcksumflags;
q = *qp;
ire = *irep;
/*
* Clear the indication that this may have hardware
- * checksum as we are not using it.
+ * checksum as we are not using it for forwarding.
*/
- mp->b_datap->db_struioun.cksum.flags = 0;
+ hcksumflags = DB_CKSUMFLAGS(mp);
+ DB_CKSUMFLAGS(mp) = 0;
/*
* Directed broadcast forwarding: if the packet came in over a
@@ -12613,6 +12732,9 @@ ip_rput_process_broadcast(queue_t **qp, mblk_t *mp, ire_t **irep, ipha_t *ipha,
}
*irep = ire;
+
+ /* Restore any hardware checksum flags */
+ DB_CKSUMFLAGS(mp) = hcksumflags;
return (B_FALSE);
}
@@ -12632,7 +12754,7 @@ ip_rput_process_multicast(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha,
* Clear the indication that this may have hardware
* checksum as we are not using it.
*/
- mp->b_datap->db_struioun.cksum.flags = 0;
+ DB_CKSUMFLAGS(mp) = 0;
retval = ip_mforward(ill, ipha, mp);
/* ip_mforward updates mib variables if needed */
/* clear b_prev - used by ip_mroute_decap */
@@ -12951,7 +13073,7 @@ ip_rput(queue_t *q, mblk_t *mp)
/*
* Also SIOC[GS]TUN* ioctls can come here.
*/
- ip_ioctl_freemsg(mp);
+ inet_freemsg(mp);
TRACE_2(TR_FAC_IP, TR_IP_RPUT_END,
"ip_input_end: q %p (%S)", q, "uninit");
return;
@@ -13300,9 +13422,20 @@ ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain, size_t hdrlen)
continue;
}
- /* broadcast? */
+ /*
+	 * A broadcast IRE may indicate either a broadcast or
+	 * a multicast packet.
+ */
if (ire->ire_type == IRE_BROADCAST) {
- if (ip_rput_process_broadcast(&q, mp, &ire, ipha, ill,
+ /*
+ * Skip broadcast checks if packet is UDP multicast;
+ * we'd rather not enter ip_rput_process_broadcast()
+ * unless the packet is broadcast for real, since
+ * that routine is a no-op for multicast.
+ */
+ if ((ipha->ipha_protocol != IPPROTO_UDP ||
+ !CLASSD(ipha->ipha_dst)) &&
+ ip_rput_process_broadcast(&q, mp, &ire, ipha, ill,
dst, cgtp_flt_pkt, ll_multicast)) {
continue;
}
@@ -13533,24 +13666,6 @@ ip_rput_dlpi(queue_t *q, mblk_t *mp)
}
/*
- * This function is used to free a message that has gone through
- * mi_copyin processing which modifies the M_IOCTL mblk's b_next
- * and b_prev pointers. We use this function to set b_next/b_prev
- * to NULL and free them.
- */
-void
-ip_ioctl_freemsg(mblk_t *mp)
-{
- mblk_t *bp = mp;
-
- for (; bp != NULL; bp = bp->b_cont) {
- bp->b_prev = NULL;
- bp->b_next = NULL;
- }
- freemsg(mp);
-}
-
-/*
* Handling of DLPI messages that require exclusive access to the ipsq.
*
* Need to do ill_pending_mp_release on ioctl completion, which could
@@ -14483,7 +14598,7 @@ ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
mp->b_cont->b_prev =
mp1->b_cont->b_prev;
}
- ip_ioctl_freemsg(mp1);
+ inet_freemsg(mp1);
ASSERT(ipsq->ipsq_current_ipif != NULL);
ASSERT(connp != NULL);
ip_ioctl_finish(CONNP_TO_WQ(connp), mp,
@@ -14515,7 +14630,7 @@ ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
mp->b_cont->b_prev =
mp1->b_cont->b_prev;
}
- ip_ioctl_freemsg(mp1);
+ inet_freemsg(mp1);
if (iocp->ioc_error == 0)
mp->b_datap->db_type = M_IOCDATA;
ASSERT(connp != NULL);
@@ -14596,7 +14711,7 @@ ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
mp->b_cont->b_prev =
mp1->b_cont->b_prev;
}
- ip_ioctl_freemsg(mp1);
+ inet_freemsg(mp1);
if (iocp->ioc_error == 0)
iocp->ioc_error = EINVAL;
ASSERT(connp != NULL);
@@ -15321,7 +15436,7 @@ ip_proto_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire,
*/
ASSERT(!mctl_present);
ASSERT(first_mp == mp);
- if (!ip_rput_fragment(q, &mp, ipha)) {
+ if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) {
return;
}
/*
@@ -15337,7 +15452,7 @@ ip_proto_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire,
* Clear hardware checksumming flag as it is currently only
* used by TCP and UDP.
*/
- mp->b_datap->db_struioun.cksum.flags = 0;
+ DB_CKSUMFLAGS(mp) = 0;
/* Now we have a complete datagram, destined for this machine. */
u1 = IPH_HDR_LENGTH(ipha);
@@ -15839,7 +15954,7 @@ ip_rput_local_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire)
bad_src_route:
q = WR(q);
/* make sure we clear any indication of a hardware checksum */
- mp->b_datap->db_struioun.cksum.flags = 0;
+ DB_CKSUMFLAGS(mp) = 0;
icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED);
return (B_FALSE);
@@ -16022,14 +16137,14 @@ ip_rput_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ipaddr_t *dstp)
param_prob:
q = WR(q);
/* make sure we clear any indication of a hardware checksum */
- mp->b_datap->db_struioun.cksum.flags = 0;
+ DB_CKSUMFLAGS(mp) = 0;
icmp_param_problem(q, mp, (uint8_t)code);
return (-1);
bad_src_route:
q = WR(q);
/* make sure we clear any indication of a hardware checksum */
- mp->b_datap->db_struioun.cksum.flags = 0;
+ DB_CKSUMFLAGS(mp) = 0;
icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED);
return (-1);
}
@@ -17571,7 +17686,7 @@ ip_trash_ire_reclaim(void *args)
* upper level protocol. We remove this conn from any fanout hash list it is
* on, and zero out the bind information. No reply is expected up above.
*/
-static void
+mblk_t *
ip_unbind(queue_t *q, mblk_t *mp)
{
conn_t *connp = Q_TO_CONN(q);
@@ -17591,7 +17706,7 @@ ip_unbind(queue_t *q, mblk_t *mp)
* original message.
*/
if (mp == NULL)
- return;
+ return (NULL);
/*
* Don't bzero the ports if its TCP since TCP still needs the
@@ -17601,7 +17716,7 @@ ip_unbind(queue_t *q, mblk_t *mp)
if (!IPCL_IS_TCP(connp))
bzero(&connp->u_port, sizeof (connp->u_port));
- qreply(q, mp);
+ return (mp);
}
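/*
 * Editorial sketch of the revised contract: ip_unbind() now returns the
 * T_OK_ACK mblk instead of qreply()ing it itself, so a caller
 * (illustrative) completes the reply:
 */
	mblk_t *ackmp;

	if ((ackmp = ip_unbind(q, mp)) != NULL)
		qreply(q, ackmp);	/* caller, not ip_unbind(), sends the ack */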
/*
@@ -17657,7 +17772,9 @@ ip_output(void *arg, mblk_t *mp, void *arg2, int caller)
/* is queue flow controlled? */
if ((q->q_first != NULL || connp->conn_draining) &&
(caller == IP_WPUT)) {
- goto doputq;
+ ASSERT(!need_decref);
+ (void) putq(q, mp);
+ return;
}
/* Multidata transmit? */
@@ -17992,11 +18109,6 @@ standard_path:
CONN_DEC_REF(connp);
return;
-doputq:
- ASSERT(!need_decref);
- (void) putq(q, mp);
- return;
-
qnext:
/*
* Upper Level Protocols pass down complete IP datagrams
@@ -18933,7 +19045,7 @@ ip_wput(queue_t *q, mblk_t *mp)
* the ipif refcnt has gone to zero and holding the ill_g_lock and ill_lock
* the above holds.
*/
-static ipif_t *
+ipif_t *
conn_get_held_ipif(conn_t *connp, ipif_t **ipifp, int *err)
{
ipif_t *ipif;
@@ -19414,7 +19526,6 @@ ip_wput_ire(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, int caller)
boolean_t multirt_send = B_FALSE;
int err;
zoneid_t zoneid;
- boolean_t iphdrhwcksum = B_FALSE;
TRACE_1(TR_FAC_IP, TR_IP_WPUT_IRE_START,
"ip_wput_ire_start: q %p", q);
@@ -19749,102 +19860,6 @@ another:;
/* pseudo checksum (do it in parts for IP header checksum) */
cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF);
-#define FRAGMENT_NEEDED(mtu, size) \
- (((mtu) < (unsigned int)(size)) ? B_TRUE : B_FALSE)
-
-#define IS_FASTPATH(ire, bp) \
- ((ire)->ire_fp_mp != NULL && \
- (MBLKHEAD((bp)) >= (MBLKL((ire)->ire_fp_mp)))) \
-
-#define IPH_UDPH_CHECKSUMP(ipha, hlen) \
- ((uint16_t *)(((uchar_t *)ipha)+(hlen + UDP_CHECKSUM_OFFSET)))
-#define IPH_TCPH_CHECKSUMP(ipha, hlen) \
- ((uint16_t *)(((uchar_t *)ipha)+(hlen+TCP_CHECKSUM_OFFSET)))
-
-#define IP_CKSUM_XMIT(ill, ire, mp, up, proto, hlen, max_frag, \
- ipsec_len) { \
- uint32_t sum; \
- uint32_t xmit_capab = HCKSUM_INET_FULL_V4 | \
- HCKSUM_INET_PARTIAL | HCKSUM_IPHDRCKSUM; \
- boolean_t cksum_offload = B_FALSE; \
- \
- /* \
- * The ire fp mp can change due to the arrival of a \
- * DL_NOTE_FASTPATH_FLUSH in the case of IRE_BROADCAST \
- * and IRE_MIPRTUN. Hence the ire_fp_mp has to be accessed \
- * only under the ire_lock in such cases. \
- */ \
- LOCK_IRE_FP_MP(ire); \
- if ((ill) && (ill->ill_capabilities & ILL_CAPAB_HCKSUM) && \
- (ill->ill_hcksum_capab->ill_hcksum_txflags & \
- xmit_capab) && (!FRAGMENT_NEEDED(max_frag, \
- (LENGTH + ipsec_len))) && (!(ire->ire_flags & \
- RTF_MULTIRT)) && (ipsec_len == 0) && \
- IS_FASTPATH((ire), (mp)) && (dohwcksum)) { \
- /* \
- * Underlying interface supports hardware checksumming. \
- * So postpone the checksum to the interface driver \
- */ \
- \
- if ((hlen) == IP_SIMPLE_HDR_LENGTH) { \
- if (ill->ill_hcksum_capab->ill_hcksum_txflags & \
- HCKSUM_IPHDRCKSUM) { \
- mp->b_datap->db_struioun.cksum.flags |= \
- HCK_IPV4_HDRCKSUM; \
- /* seed the cksum field to 0 */ \
- ipha->ipha_hdr_checksum = 0; \
- iphdrhwcksum = B_TRUE; \
- } \
- /* \
- * If underlying h/w supports full h/w checksumming \
- * and no IP options are present, then offload \
- * full checksumming to the hardware. \
- * \
- * If h/w can do partial checksumming then offload \
- * unless the startpoint offset, including mac-header, \
- * is too big for the interface to some of our \
- * hardware (CE and ERI) which have 6 bit fields. \
- * Sigh. \
- * Unhappily we don't have the mac-header size here \
- * so punt for any options. \
- */ \
- if (ill->ill_hcksum_capab->ill_hcksum_txflags & \
- HCKSUM_INET_FULL_V4) { \
- UNLOCK_IRE_FP_MP(ire); \
- /* Seed the checksum field to 0 */ \
- *up = 0; \
- mp->b_datap->db_struioun.cksum.flags |= \
- HCK_FULLCKSUM; \
- cksum_offload = B_TRUE; \
- } else if (ill->ill_hcksum_capab->ill_hcksum_txflags & \
- HCKSUM_INET_PARTIAL) { \
- UNLOCK_IRE_FP_MP(ire); \
- sum = *up + cksum + proto; \
- sum = (sum & 0xFFFF) + (sum >> 16); \
- *up = (sum & 0xFFFF) + (sum >> 16); \
- /* \
- * All offsets are relative to the beginning \
- * of the IP header. \
- */ \
- mp->b_datap->db_cksumstart = hlen; \
- mp->b_datap->db_cksumstuff = \
- (PROTO == IPPROTO_UDP) ? \
- (hlen) + UDP_CHECKSUM_OFFSET : \
- (hlen) + TCP_CHECKSUM_OFFSET; \
- mp->b_datap->db_cksumend = ipha->ipha_length; \
- mp->b_datap->db_struioun.cksum.flags |= \
- HCK_PARTIALCKSUM; \
- cksum_offload = B_TRUE; \
- } \
- } \
- } \
- if (!cksum_offload) { \
- UNLOCK_IRE_FP_MP(ire); \
- IP_STAT(ip_out_sw_cksum); \
- (sum) = IP_CSUM((mp), (hlen), cksum + proto); \
- *(up) = (uint16_t)((sum) ? (sum) : ~(sum)); \
- } \
-}
if (!IP_FLOW_CONTROLLED_ULP(PROTO)) {
queue_t *dev_q = stq->q_next;
@@ -19856,10 +19871,16 @@ another:;
(ip_hdr_included != IP_HDR_INCLUDED)) {
hlen = (V_HLEN & 0xF) << 2;
up = IPH_UDPH_CHECKSUMP(ipha, hlen);
- if (*up) {
- IP_CKSUM_XMIT(ill, ire, mp, up,
- IP_UDP_CSUM_COMP, hlen, max_frag,
- ipsec_len);
+ if (*up != 0) {
+ IP_CKSUM_XMIT(ill, ire, mp, ipha, up, PROTO,
+ hlen, LENGTH, max_frag, ipsec_len, cksum);
+ /* Software checksum? */
+ if (DB_CKSUMFLAGS(mp) == 0) {
+ IP_STAT(ip_out_sw_cksum);
+ IP_STAT_UPDATE(
+ ip_udp_out_sw_cksum_bytes,
+ LENGTH - hlen);
+ }
}
}
} else if (ip_hdr_included != IP_HDR_INCLUDED) {
@@ -19873,8 +19894,14 @@ another:;
* replicated via several interfaces, and not all of
* them may have this capability.
*/
- IP_CKSUM_XMIT(ill, ire, mp, up,
- IP_TCP_CSUM_COMP, hlen, max_frag, ipsec_len);
+ IP_CKSUM_XMIT(ill, ire, mp, ipha, up, PROTO, hlen,
+ LENGTH, max_frag, ipsec_len, cksum);
+ /* Software checksum? */
+ if (DB_CKSUMFLAGS(mp) == 0) {
+ IP_STAT(ip_out_sw_cksum);
+ IP_STAT_UPDATE(ip_tcp_out_sw_cksum_bytes,
+ LENGTH - hlen);
+ }
} else {
sctp_hdr_t *sctph;
@@ -19904,7 +19931,7 @@ another:;
cksum += ttl_protocol;
/* fragment the packet */
- if (FRAGMENT_NEEDED(max_frag, (LENGTH + ipsec_len)))
+ if (max_frag < (uint_t)(LENGTH + ipsec_len))
goto fragmentit;
/*
* Don't use frag_flag if packet is pre-built or source
@@ -19918,8 +19945,8 @@ another:;
ipha->ipha_fragment_offset_and_flags |=
htons(ire->ire_frag_flag);
- if (!iphdrhwcksum) {
- /* checksum */
+ if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) {
+ /* calculate IP header checksum */
cksum += ipha->ipha_ident;
cksum += (v_hlen_tos_len >> 16)+(v_hlen_tos_len & 0xFFFF);
cksum += ipha->ipha_fragment_offset_and_flags;
@@ -20258,7 +20285,11 @@ broadcast:
hlen = (V_HLEN & 0xF) << 2;
up = IPH_TCPH_CHECKSUMP(ipha, hlen);
IP_STAT(ip_out_sw_cksum);
+ IP_STAT_UPDATE(ip_tcp_out_sw_cksum_bytes,
+ LENGTH - hlen);
*up = IP_CSUM(mp, hlen, cksum + IP_TCP_CSUM_COMP);
+ if (*up == 0)
+ *up = 0xFFFF;
} else if (PROTO == IPPROTO_SCTP &&
(ip_hdr_included != IP_HDR_INCLUDED)) {
sctp_hdr_t *sctph;
@@ -20338,17 +20369,18 @@ broadcast:
*/
hlen = (V_HLEN & 0xF) << 2;
up = IPH_UDPH_CHECKSUMP(ipha, hlen);
- if (*up) {
- uint_t sum;
-
- /*
- * NOTE: watch out for compiler high
- * bits
- */
- IP_STAT(ip_out_sw_cksum);
- sum = IP_CSUM(mp, hlen,
- cksum + IP_UDP_CSUM_COMP);
- *up = (uint16_t)(sum ? sum : ~sum);
+ max_frag = ire->ire_max_frag;
+ if (*up != 0) {
+ IP_CKSUM_XMIT(ire_ill, ire, mp, ipha,
+ up, PROTO, hlen, LENGTH, max_frag,
+ ipsec_len, cksum);
+ /* Software checksum? */
+ if (DB_CKSUMFLAGS(mp) == 0) {
+ IP_STAT(ip_out_sw_cksum);
+ IP_STAT_UPDATE(
+ ip_udp_out_sw_cksum_bytes,
+ LENGTH - hlen);
+ }
}
}
}
@@ -20369,9 +20401,7 @@ broadcast:
conn_multicast_loop));
/* Forget header checksum offload */
- mp->b_datap->db_struioun.cksum.flags &=
- ~HCK_IPV4_HDRCKSUM;
- iphdrhwcksum = B_FALSE;
+ DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM;
/*
* Local loopback of multicasts? Check the
@@ -20459,10 +20489,8 @@ broadcast:
}
max_frag = ire->ire_max_frag;
cksum += ttl_protocol;
- if (!FRAGMENT_NEEDED(max_frag, (LENGTH + ipsec_len))) {
+ if (max_frag >= (uint_t)(LENGTH + ipsec_len)) {
/* No fragmentation required for this one. */
- /* Complete the IP header checksum. */
- cksum += ipha->ipha_ident;
/*
* Don't use frag_flag if packet is pre-built or source
* routed or if multicast (since multicast packets do
@@ -20475,26 +20503,32 @@ broadcast:
ipha->ipha_fragment_offset_and_flags |=
htons(ire->ire_frag_flag);
- cksum += (v_hlen_tos_len >> 16)+
- (v_hlen_tos_len & 0xFFFF);
- cksum += ipha->ipha_fragment_offset_and_flags;
- hlen = (V_HLEN & 0xF) - IP_SIMPLE_HDR_LENGTH_IN_WORDS;
- if (hlen) {
- checksumoptions:
- /*
- * Account for the IP Options in the IP
- * header checksum.
- */
- up = (uint16_t *)(rptr+IP_SIMPLE_HDR_LENGTH);
- do {
- cksum += up[0];
- cksum += up[1];
- up += 2;
- } while (--hlen);
- }
- cksum = ((cksum & 0xFFFF) + (cksum >> 16));
- cksum = ~(cksum + (cksum >> 16));
- ipha->ipha_hdr_checksum = (uint16_t)cksum;
+ if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) {
+ /* Complete the IP header checksum. */
+ cksum += ipha->ipha_ident;
+ cksum += (v_hlen_tos_len >> 16)+
+ (v_hlen_tos_len & 0xFFFF);
+ cksum += ipha->ipha_fragment_offset_and_flags;
+ hlen = (V_HLEN & 0xF) -
+ IP_SIMPLE_HDR_LENGTH_IN_WORDS;
+ if (hlen) {
+ checksumoptions:
+ /*
+ * Account for the IP Options in the IP
+ * header checksum.
+ */
+ up = (uint16_t *)(rptr+
+ IP_SIMPLE_HDR_LENGTH);
+ do {
+ cksum += up[0];
+ cksum += up[1];
+ up += 2;
+ } while (--hlen);
+ }
+ cksum = ((cksum & 0xFFFF) + (cksum >> 16));
+ cksum = ~(cksum + (cksum >> 16));
+ ipha->ipha_hdr_checksum = (uint16_t)cksum;
+ }
if (ipsec_len != 0) {
ipsec_out_process(q, first_mp, ire, ill_index);
if (!next_mp) {
@@ -20991,6 +21025,298 @@ ip_md_zcopy_attr(multidata_t *mmd, pdesc_t *pd, uint_t flags)
}
/*
+ * Check if ip_wput_frag_mdt() and ip_wput_frag_mdt_v6() can handle a message
+ * block chain. We could rewrite to handle arbitrary message block chains but
+ * that would make the code complicated and slow. Right now there three
+ * restrictions:
+ *
+ * 1. The first message block must contain the complete IP header and
+ * at least 1 byte of payload data.
+ * 2. At most MULTIDATA_MAX_PBUFS non-empty message blocks are allowed
+ * so that we can use a single Multidata message.
+ * 3. No frag must be distributed over two or more message blocks so
+ * that we don't need more than two packet descriptors per frag.
+ *
+ * The above restrictions allow us to support userland applications (which
+ * will send down a single message block) and NFS over UDP (which will
+ * send down a chain of at most three message blocks).
+ *
+ * We also don't use MDT for payloads with less than or equal to
+ * ip_wput_frag_mdt_min bytes because it would cause too much overhead.
+ */
+boolean_t
+ip_can_frag_mdt(mblk_t *mp, ssize_t hdr_len, ssize_t len)
+{
+ int blocks;
+ ssize_t total, missing, size;
+
+ ASSERT(mp != NULL);
+ ASSERT(hdr_len > 0);
+
+ size = MBLKL(mp) - hdr_len;
+ if (size <= 0)
+ return (B_FALSE);
+
+ /* The first mblk contains the header and some payload. */
+ blocks = 1;
+ total = size;
+ size %= len;
+ missing = (size == 0) ? 0 : (len - size);
+ mp = mp->b_cont;
+
+ while (mp != NULL) {
+ /*
+ * Give up if we encounter a zero length message block.
+		 * In practice, this should rarely happen and is therefore
+		 * not worth the trouble of freeing and re-linking the
+		 * mblk from the chain to handle such a case.
+ */
+ if ((size = MBLKL(mp)) == 0)
+ return (B_FALSE);
+
+ /* Too many payload buffers for a single Multidata message? */
+ if (++blocks > MULTIDATA_MAX_PBUFS)
+ return (B_FALSE);
+
+ total += size;
+ /* Is a frag distributed over two or more message blocks? */
+ if (missing > size)
+ return (B_FALSE);
+ size -= missing;
+
+ size %= len;
+ missing = (size == 0) ? 0 : (len - size);
+
+ mp = mp->b_cont;
+ }
+
+ return (total > ip_wput_frag_mdt_min);
+}
+
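To make the chain walk above concrete, here is a minimal standalone sketch of the same size bookkeeping (illustrative only; struct buf, MAX_PBUFS and min_total are stand-ins for mblk_t, MULTIDATA_MAX_PBUFS and ip_wput_frag_mdt_min, and frag_len is assumed to be nonzero):

#include <stdbool.h>
#include <stddef.h>

#define	MAX_PBUFS	16	/* stand-in for MULTIDATA_MAX_PBUFS */

struct buf {			/* stand-in for mblk_t */
	size_t		len;
	struct buf	*next;
};

/*
 * Walk the chain exactly as ip_can_frag_mdt() does: track how many
 * bytes ("missing") of the current fragment still have to come out
 * of the next buffer, and fail if a fragment would have to straddle
 * more than one buffer boundary.
 */
static bool
can_frag(const struct buf *bp, size_t hdr_len, size_t frag_len,
    size_t min_total)
{
	size_t total, missing, size;
	int blocks = 1;

	if (bp == NULL || bp->len <= hdr_len)
		return (false);

	size = bp->len - hdr_len;	/* payload in the first buffer */
	total = size;
	size %= frag_len;
	missing = (size == 0) ? 0 : frag_len - size;

	for (bp = bp->next; bp != NULL; bp = bp->next) {
		if (bp->len == 0 || ++blocks > MAX_PBUFS)
			return (false);
		if (missing > bp->len)	/* fragment spans 3+ buffers */
			return (false);
		total += bp->len;
		size = (bp->len - missing) % frag_len;
		missing = (size == 0) ? 0 : frag_len - size;
	}
	return (total > min_total);
}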
+/*
+ * Outbound IPv4 fragmentation routine using MDT.
+ */
+static void
+ip_wput_frag_mdt(ire_t *ire, mblk_t *mp, ip_pkt_t pkt_type, int len,
+ uint32_t frag_flag, int offset)
+{
+ ipha_t *ipha_orig;
+ int i1, ip_data_end;
+ uint_t pkts, wroff, hdr_chunk_len, pbuf_idx;
+ mblk_t *hdr_mp, *md_mp = NULL;
+ unsigned char *hdr_ptr, *pld_ptr;
+ multidata_t *mmd;
+ ip_pdescinfo_t pdi;
+
+ ASSERT(DB_TYPE(mp) == M_DATA);
+ ASSERT(MBLKL(mp) > sizeof (ipha_t));
+
+ ipha_orig = (ipha_t *)mp->b_rptr;
+ mp->b_rptr += sizeof (ipha_t);
+
+ /* Calculate how many packets we will send out */
+ i1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgsize(mp);
+ pkts = (i1 + len - 1) / len;
+ ASSERT(pkts > 1);
+
+ /* Allocate a message block which will hold all the IP Headers. */
+ wroff = ip_wroff_extra;
+ hdr_chunk_len = wroff + IP_SIMPLE_HDR_LENGTH;
+
+ i1 = pkts * hdr_chunk_len;
+ /*
+ * Create the header buffer, Multidata and destination address
+ * and SAP attribute that should be associated with it.
+ */
+ if ((hdr_mp = allocb(i1, BPRI_HI)) == NULL ||
+ ((hdr_mp->b_wptr += i1),
+ (mmd = mmd_alloc(hdr_mp, &md_mp, KM_NOSLEEP)) == NULL) ||
+ !ip_md_addr_attr(mmd, NULL, ire->ire_dlureq_mp)) {
+ freemsg(mp);
+ if (md_mp == NULL) {
+ freemsg(hdr_mp);
+ } else {
+free_mmd: IP_STAT(ip_frag_mdt_discarded);
+ freemsg(md_mp);
+ }
+ IP_STAT(ip_frag_mdt_allocfail);
+ UPDATE_MIB(&ip_mib, ipOutDiscards, pkts);
+ return;
+ }
+ IP_STAT(ip_frag_mdt_allocd);
+
+ /*
+ * Add a payload buffer to the Multidata; this operation must not
+ * fail, or otherwise our logic in this routine is broken. There
+ * is no memory allocation done by the routine, so any returned
+ * failure simply tells us that we've done something wrong.
+ *
+ * A failure tells us that either we're adding the same payload
+ * buffer more than once, or we're trying to add more buffers than
+ * allowed. None of the above cases should happen, and we panic
+ * because there's either horrible heap corruption or a
+ * programming mistake.
+ */
+ if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0)
+ goto pbuf_panic;
+
+ hdr_ptr = hdr_mp->b_rptr;
+ pld_ptr = mp->b_rptr;
+
+ /* Establish the ending byte offset, based on the starting offset. */
+ offset <<= 3;
+ ip_data_end = offset + ntohs(ipha_orig->ipha_length) -
+ IP_SIMPLE_HDR_LENGTH;
+
+ pdi.flags = PDESC_HBUF_REF | PDESC_PBUF_REF;
+
+ while (pld_ptr < mp->b_wptr) {
+ ipha_t *ipha;
+ uint16_t offset_and_flags;
+ uint16_t ip_len;
+ int error;
+
+ ASSERT((hdr_ptr + hdr_chunk_len) <= hdr_mp->b_wptr);
+ ipha = (ipha_t *)(hdr_ptr + wroff);
+ ASSERT(OK_32PTR(ipha));
+ *ipha = *ipha_orig;
+
+ if (ip_data_end - offset > len) {
+ offset_and_flags = IPH_MF;
+ } else {
+ /*
+ * Last frag. Set len to the length of this last piece.
+ */
+ len = ip_data_end - offset;
+ /* A frag of a frag might have IPH_MF non-zero */
+ offset_and_flags =
+ ntohs(ipha->ipha_fragment_offset_and_flags) &
+ IPH_MF;
+ }
+ offset_and_flags |= (uint16_t)(offset >> 3);
+ offset_and_flags |= (uint16_t)frag_flag;
+ /* Store the offset and flags in the IP header. */
+ ipha->ipha_fragment_offset_and_flags = htons(offset_and_flags);
+
+ /* Store the length in the IP header. */
+ ip_len = (uint16_t)(len + IP_SIMPLE_HDR_LENGTH);
+ ipha->ipha_length = htons(ip_len);
+
+ /*
+ * Set the IP header checksum. Note that mp is just
+ * the header, so this is easy to pass to ip_csum.
+ */
+ ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
+
+ /*
+ * Record offset and size of header and data of the next packet
+ * in the multidata message.
+ */
+ PDESC_HDR_ADD(&pdi, hdr_ptr, wroff, IP_SIMPLE_HDR_LENGTH, 0);
+ PDESC_PLD_INIT(&pdi);
+ i1 = MIN(mp->b_wptr - pld_ptr, len);
+ ASSERT(i1 > 0);
+ PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, pld_ptr, i1);
+ if (i1 == len) {
+ pld_ptr += len;
+ } else {
+ i1 = len - i1;
+ mp = mp->b_cont;
+ ASSERT(mp != NULL);
+ ASSERT(MBLKL(mp) >= i1);
+ /*
+ * Attach the next payload message block to the
+ * multidata message.
+ */
+ if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0)
+ goto pbuf_panic;
+ PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, mp->b_rptr, i1);
+ pld_ptr = mp->b_rptr + i1;
+ }
+
+ if ((mmd_addpdesc(mmd, (pdescinfo_t *)&pdi, &error,
+ KM_NOSLEEP)) == NULL) {
+ /*
+ * Any failure other than ENOMEM indicates that we
+ * have passed in invalid pdesc info or parameters
+ * to mmd_addpdesc, which must not happen.
+ *
+ * EINVAL is a result of failure on boundary checks
+ * against the pdesc info contents. It should not
+ * happen, and we panic because either there's
+ * horrible heap corruption, and/or programming
+ * mistake.
+ */
+ if (error != ENOMEM) {
+ cmn_err(CE_PANIC, "ip_wput_frag_mdt: "
+ "pdesc logic error detected for "
+ "mmd %p pinfo %p (%d)\n",
+ (void *)mmd, (void *)&pdi, error);
+ /* NOTREACHED */
+ }
+ IP_STAT(ip_frag_mdt_addpdescfail);
+ /* Free unattached payload message blocks as well */
+ md_mp->b_cont = mp->b_cont;
+ goto free_mmd;
+ }
+
+ /* Advance fragment offset. */
+ offset += len;
+
+ /* Advance to location for next header in the buffer. */
+ hdr_ptr += hdr_chunk_len;
+
+ /* Did we reach the next payload message block? */
+ if (pld_ptr == mp->b_wptr && mp->b_cont != NULL) {
+ mp = mp->b_cont;
+ /*
+ * Attach the next message block with payload
+ * data to the multidata message.
+ */
+ if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0)
+ goto pbuf_panic;
+ pld_ptr = mp->b_rptr;
+ }
+ }
+
+ ASSERT(hdr_mp->b_wptr == hdr_ptr);
+ ASSERT(mp->b_wptr == pld_ptr);
+
+ /* Update IP statistics */
+ UPDATE_MIB(&ip_mib, ipFragCreates, pkts);
+ BUMP_MIB(&ip_mib, ipFragOKs);
+ IP_STAT_UPDATE(ip_frag_mdt_pkt_out, pkts);
+
+ if (pkt_type == OB_PKT) {
+ ire->ire_ob_pkt_count += pkts;
+ if (ire->ire_ipif != NULL)
+ atomic_add_32(&ire->ire_ipif->ipif_ob_pkt_count, pkts);
+ } else {
+ /*
+ * The type is IB_PKT in the forwarding path and in
+ * the mobile IP case when the packet is being reverse-
+ * tunneled to the home agent.
+ */
+ ire->ire_ib_pkt_count += pkts;
+ ASSERT(!IRE_IS_LOCAL(ire));
+ if (ire->ire_type & IRE_BROADCAST)
+ atomic_add_32(&ire->ire_ipif->ipif_ib_pkt_count, pkts);
+ else
+ atomic_add_32(&ire->ire_ipif->ipif_fo_pkt_count, pkts);
+ }
+ ire->ire_last_used_time = lbolt;
+ /* Send it down */
+ putnext(ire->ire_stq, md_mp);
+ return;
+
+pbuf_panic:
+ cmn_err(CE_PANIC, "ip_wput_frag_mdt: payload buffer logic "
+ "error for mmd %p pbuf %p (%d)", (void *)mmd, (void *)mp,
+ pbuf_idx);
+ /* NOTREACHED */
+}
+
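The loop above rebuilds one IPv4 header per fragment, so the only fields that vary per fragment are the length, the checksum, and the offset/flags word. As a standalone illustration of that last encoding (the 0x2000 MF bit mirrors the usual IPH_MF definition; treat the constants here as assumptions, not quotes from the headers):

#include <stdint.h>

#define	MF_FLAG	0x2000u		/* "more fragments", cf. IPH_MF */

/*
 * Encode a byte offset (a multiple of 8) plus the MF bit into the
 * host-order value for ipha_fragment_offset_and_flags; the caller
 * converts with htons(), as the MDT loop above does.
 */
static uint16_t
ipv4_frag_field(uint32_t byte_offset, int more_frags)
{
	uint16_t v = (uint16_t)(byte_offset >> 3);	/* 8-octet units */

	if (more_frags)
		v |= MF_FLAG;
	return (v);
}

For instance, the third fragment of a stream of 1480-byte pieces sits at byte offset 2960, so ipv4_frag_field(2960, 1) yields 0x0172 | 0x2000 == 0x2172.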
+/*
* Outbound IP fragmentation routine.
*
* NOTE : This routine does not ire_refrele the ire that is passed in
@@ -21000,29 +21326,30 @@ static void
ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag,
uint32_t frag_flag)
{
- int i1;
- mblk_t *ll_hdr_mp;
- int ll_hdr_len;
- int hdr_len;
- mblk_t *hdr_mp;
- ipha_t *ipha;
- int ip_data_end;
- int len;
- mblk_t *mp = mp_orig;
- int offset;
- queue_t *q;
+ int i1;
+ mblk_t *ll_hdr_mp;
+ int ll_hdr_len;
+ int hdr_len;
+ mblk_t *hdr_mp;
+ ipha_t *ipha;
+ int ip_data_end;
+ int len;
+ mblk_t *mp = mp_orig;
+ int offset;
+ queue_t *q;
uint32_t v_hlen_tos_len;
- mblk_t *first_mp;
- boolean_t mctl_present;
- mblk_t *xmit_mp;
- mblk_t *carve_mp;
- ire_t *ire1 = NULL;
- ire_t *save_ire = NULL;
- mblk_t *next_mp = NULL;
- boolean_t last_frag = B_FALSE;
- boolean_t multirt_send = B_FALSE;
- ire_t *first_ire = NULL;
- irb_t *irb = NULL;
+ mblk_t *first_mp;
+ boolean_t mctl_present;
+ ill_t *ill;
+ mblk_t *xmit_mp;
+ mblk_t *carve_mp;
+ ire_t *ire1 = NULL;
+ ire_t *save_ire = NULL;
+ mblk_t *next_mp = NULL;
+ boolean_t last_frag = B_FALSE;
+ boolean_t multirt_send = B_FALSE;
+ ire_t *first_ire = NULL;
+ irb_t *irb = NULL;
TRACE_0(TR_FAC_IP, TR_IP_WPUT_FRAG_START,
"ip_wput_frag_start:");
@@ -21036,6 +21363,7 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag,
mctl_present = B_FALSE;
}
+ ASSERT(MBLKL(mp) >= sizeof (ipha_t));
ipha = (ipha_t *)mp->b_rptr;
/*
@@ -21079,8 +21407,37 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag,
}
hdr_len = (V_HLEN & 0xF) << 2;
+
ipha->ipha_hdr_checksum = 0;
+ /*
+	 * Establish the maximum number of bytes per frag, after putting
+	 * in the header.
+ */
+ len = (max_frag - hdr_len) & ~7;
+
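As a worked example (typical values, not taken from this patch): with max_frag of 1500 and a 20-byte simple header, len = (1500 - 20) & ~7 = 1480; with a 24-byte header (one 4-byte option), (1500 - 24) & ~7 rounds 1476 down to 1472. The & ~7 keeps every non-final fragment's payload a multiple of 8 bytes, which is what lets the offset be stored in 8-octet units.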
+ /* Check if we can use MDT to send out the frags. */
+ ASSERT(!IRE_IS_LOCAL(ire));
+ if (hdr_len == IP_SIMPLE_HDR_LENGTH && ip_multidata_outbound &&
+ !(ire->ire_flags & RTF_MULTIRT) && !IPP_ENABLED(IPP_LOCAL_OUT) &&
+ (ill = ire_to_ill(ire)) != NULL && ILL_MDT_CAPABLE(ill) &&
+ IP_CAN_FRAG_MDT(mp, IP_SIMPLE_HDR_LENGTH, len)) {
+ ASSERT(ill->ill_mdt_capab != NULL);
+ if (!ill->ill_mdt_capab->ill_mdt_on) {
+ /*
+			 * If MDT has previously been turned off,
+			 * and we currently can do MDT (due to IPQoS policy
+			 * removal, etc.) then enable it for this interface.
+ */
+ ill->ill_mdt_capab->ill_mdt_on = 1;
+ ip1dbg(("ip_wput_frag: enabled MDT for interface %s\n",
+ ill->ill_name));
+ }
+ ip_wput_frag_mdt(ire, mp, pkt_type, len, frag_flag,
+ offset);
+ return;
+ }
+
/* Get a copy of the header for the trailing frags */
hdr_mp = ip_wput_frag_copyhdr((uchar_t *)ipha, hdr_len, offset);
if (!hdr_mp) {
@@ -21100,12 +21457,6 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag,
offset <<= 3;
ip_data_end = offset + ntohs(ipha->ipha_length) - hdr_len;
- /*
- * Establish the number of bytes maximum per frag, after putting
- * in the header.
- */
- len = (max_frag - hdr_len) & ~7;
-
/* Store the length of the first fragment in the IP header. */
i1 = len + hdr_len;
ASSERT(i1 <= IP_MAXPACKET);
@@ -22565,8 +22916,6 @@ ip_wput_ipsec_out(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, ill_t *ill,
zoneid_t zoneid;
uint32_t cksum;
uint16_t *up;
- /* Hack until the UDP merge into IP happens. */
- extern boolean_t udp_compute_checksum(void);
#ifdef _BIG_ENDIAN
#define LENGTH (v_hlen_tos_len & 0xFFFF)
#else
@@ -22741,6 +23090,8 @@ send:
offset = IP_SIMPLE_HDR_LENGTH + UDP_CHECKSUM_OFFSET;
IP_STAT(ip_out_sw_cksum);
+ IP_STAT_UPDATE(ip_udp_out_sw_cksum_bytes,
+ ntohs(htons(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH));
#define iphs ((uint16_t *)ipha)
cksum = IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] +
iphs[9] + ntohs(htons(ipha->ipha_length) -
@@ -23790,10 +24141,10 @@ ip_ioctl_finish(queue_t *q, mblk_t *mp, int err, int mode,
void
ip_resume_tcp_bind(void *arg, mblk_t *mp, void *arg2)
{
- conn_t *connp = (conn_t *)arg;
+ conn_t *connp = arg;
tcp_t *tcp;
- ASSERT(connp != NULL && connp->conn_tcp != NULL);
+ ASSERT(connp != NULL && IPCL_IS_TCP(connp) && connp->conn_tcp != NULL);
tcp = connp->conn_tcp;
if (connp->conn_tcp->tcp_state == TCPS_CLOSED)
@@ -23801,7 +24152,6 @@ ip_resume_tcp_bind(void *arg, mblk_t *mp, void *arg2)
else
tcp_rput_other(tcp, mp);
CONN_OPER_PENDING_DONE(connp);
-
}
/* Called from ip_wput for all non data messages */
@@ -24031,31 +24381,48 @@ nak:
case T_BIND_REQ: {
/* Request can get queued in bind */
ASSERT(connp != NULL);
+ /*
+ * Both TCP and UDP call ip_bind_{v4,v6}() directly
+ * instead of going through this path. We only get
+ * here in the following cases:
+ *
+ * a. Bind retries, where ipsq is non-NULL.
+	 * b. T_BIND_REQ is issued from a non-TCP/UDP
+	 * transport, e.g. ICMP for a raw socket,
+ * in which case ipsq will be NULL.
+ */
+ ASSERT(ipsq != NULL ||
+ (!IPCL_IS_TCP(connp) && !IPCL_IS_UDP(connp)));
+
/* Don't increment refcnt if this is a re-entry */
if (ipsq == NULL)
CONN_INC_REF(connp);
- mp = connp->conn_af_isv6 ?
- ip_bind_v6(q, mp, connp, NULL) :
- ip_bind_v4(q, mp, connp);
- if (mp != NULL) {
- tcp_t *tcp;
-
- tcp = connp->conn_tcp;
- if (tcp != NULL) {
- if (ipsq == NULL) {
- tcp_rput_other(tcp, mp);
- } else {
- CONN_INC_REF(connp);
- squeue_fill(connp->conn_sqp, mp,
- ip_resume_tcp_bind,
- connp, SQTAG_TCP_RPUTOTHER);
- return;
- }
- } else {
- qreply(q, mp);
- }
- CONN_OPER_PENDING_DONE(connp);
+ mp = connp->conn_af_isv6 ? ip_bind_v6(q, mp,
+ connp, NULL) : ip_bind_v4(q, mp, connp);
+ if (mp == NULL)
+ return;
+ if (IPCL_IS_TCP(connp)) {
+ /*
+				 * In the case of a TCP endpoint we
+				 * come here only for bind retries.
+ */
+ ASSERT(ipsq != NULL);
+ CONN_INC_REF(connp);
+ squeue_fill(connp->conn_sqp, mp,
+ ip_resume_tcp_bind, connp,
+ SQTAG_BIND_RETRY);
+ return;
+ } else if (IPCL_IS_UDP(connp)) {
+ /*
+				 * In the case of a UDP endpoint we
+				 * come here only for bind retries.
+ */
+ ASSERT(ipsq != NULL);
+ udp_resume_bind(connp, mp);
+ return;
}
+ qreply(q, mp);
+ CONN_OPER_PENDING_DONE(connp);
return;
}
case T_SVR4_OPTMGMT_REQ:
@@ -24111,7 +24478,8 @@ nak:
}
return;
case T_UNBIND_REQ:
- ip_unbind(q, mp);
+ mp = ip_unbind(q, mp);
+ qreply(q, mp);
return;
default:
/*
diff --git a/usr/src/uts/common/inet/ip/ip6.c b/usr/src/uts/common/inet/ip/ip6.c
index d701d170d1..8788d12aa6 100644
--- a/usr/src/uts/common/inet/ip/ip6.c
+++ b/usr/src/uts/common/inet/ip/ip6.c
@@ -58,6 +58,7 @@
#include <sys/policy.h>
#include <net/if.h>
#include <net/if_arp.h>
+#include <net/if_types.h>
#include <net/route.h>
#include <net/if_dl.h>
#include <sys/sockio.h>
@@ -74,9 +75,12 @@
#include <inet/snmpcom.h>
#include <inet/ip.h>
+#include <inet/ip_impl.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/tcp.h>
+#include <inet/tcp_impl.h>
+#include <inet/udp_impl.h>
#include <inet/ipp_common.h>
#include <inet/ip_multi.h>
@@ -103,20 +107,51 @@ extern squeue_func_t ip_input_proc;
/*
* IP statistics.
*/
-#define IP6_STAT(x) (ip6_statistics.x.value.ui64++)
+#define IP6_STAT(x) (ip6_statistics.x.value.ui64++)
+#define IP6_STAT_UPDATE(x, n) (ip6_statistics.x.value.ui64 += (n))
typedef struct ip6_stat {
kstat_named_t ip6_udp_fast_path;
kstat_named_t ip6_udp_slow_path;
kstat_named_t ip6_udp_fannorm;
kstat_named_t ip6_udp_fanmb;
+ kstat_named_t ip6_out_sw_cksum;
+ kstat_named_t ip6_in_sw_cksum;
+ kstat_named_t ip6_tcp_in_full_hw_cksum_err;
+ kstat_named_t ip6_tcp_in_part_hw_cksum_err;
+ kstat_named_t ip6_tcp_in_sw_cksum_err;
+ kstat_named_t ip6_tcp_out_sw_cksum_bytes;
+ kstat_named_t ip6_udp_in_full_hw_cksum_err;
+ kstat_named_t ip6_udp_in_part_hw_cksum_err;
+ kstat_named_t ip6_udp_in_sw_cksum_err;
+ kstat_named_t ip6_udp_out_sw_cksum_bytes;
+ kstat_named_t ip6_frag_mdt_pkt_out;
+ kstat_named_t ip6_frag_mdt_discarded;
+ kstat_named_t ip6_frag_mdt_allocfail;
+ kstat_named_t ip6_frag_mdt_addpdescfail;
+ kstat_named_t ip6_frag_mdt_allocd;
} ip6_stat_t;
static ip6_stat_t ip6_statistics = {
- { "ip6_udp_fast_path", KSTAT_DATA_UINT64 },
- { "ip6_udp_slow_path", KSTAT_DATA_UINT64 },
- { "ip6_udp_fannorm", KSTAT_DATA_UINT64 },
- { "ip6_udp_fanmb", KSTAT_DATA_UINT64 },
+ { "ip6_udp_fast_path", KSTAT_DATA_UINT64 },
+ { "ip6_udp_slow_path", KSTAT_DATA_UINT64 },
+ { "ip6_udp_fannorm", KSTAT_DATA_UINT64 },
+ { "ip6_udp_fanmb", KSTAT_DATA_UINT64 },
+ { "ip6_out_sw_cksum", KSTAT_DATA_UINT64 },
+ { "ip6_in_sw_cksum", KSTAT_DATA_UINT64 },
+ { "ip6_tcp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 },
+ { "ip6_tcp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 },
+ { "ip6_tcp_in_sw_cksum_err", KSTAT_DATA_UINT64 },
+ { "ip6_tcp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 },
+ { "ip6_udp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 },
+ { "ip6_udp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 },
+ { "ip6_udp_in_sw_cksum_err", KSTAT_DATA_UINT64 },
+ { "ip6_udp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 },
+ { "ip6_frag_mdt_pkt_out", KSTAT_DATA_UINT64 },
+ { "ip6_frag_mdt_discarded", KSTAT_DATA_UINT64 },
+ { "ip6_frag_mdt_allocfail", KSTAT_DATA_UINT64 },
+ { "ip6_frag_mdt_addpdescfail", KSTAT_DATA_UINT64 },
+ { "ip6_frag_mdt_allocd", KSTAT_DATA_UINT64 },
};
static kstat_t *ip6_kstat;
@@ -221,7 +256,7 @@ static void ip_fanout_udp_v6(queue_t *, mblk_t *, ip6_t *, uint32_t,
static int ip_process_options_v6(queue_t *, mblk_t *, ip6_t *,
uint8_t *, uint_t, uint8_t);
static mblk_t *ip_rput_frag_v6(queue_t *, mblk_t *, ip6_t *,
- ip6_frag_t *, uint_t, uint_t *);
+ ip6_frag_t *, uint_t, uint_t *, uint32_t *, uint16_t *);
static boolean_t ip_source_routed_v6(ip6_t *, mblk_t *);
static void ip_wput_ire_v6(queue_t *, mblk_t *, ire_t *, int, int,
conn_t *, int, int, int);
@@ -2302,7 +2337,8 @@ ip_bind_v6(queue_t *q, mblk_t *mp, conn_t *connp, ip6_pkt_t *ipp)
connp->conn_recv = tcp_input;
}
/* Update qinfo if v4/v6 changed */
- if ((orig_pkt_isv6 != connp->conn_pkt_isv6) && !IS_TCP_CONN(connp)) {
+ if ((orig_pkt_isv6 != connp->conn_pkt_isv6) &&
+ !(IPCL_IS_TCP(connp) || IPCL_IS_UDP(connp))) {
if (connp->conn_pkt_isv6)
ip_setqinfo(RD(q), IPV6_MINOR, B_TRUE);
else
@@ -2531,7 +2567,6 @@ ip_bind_connected_resume_v6(ipsq_t *ipsq, queue_t *q, mblk_t *mp,
void *dummy_arg)
{
conn_t *connp = NULL;
- tcp_t *tcp;
t_scalar_t prim;
ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
@@ -2543,24 +2578,24 @@ ip_bind_connected_resume_v6(ipsq_t *ipsq, queue_t *q, mblk_t *mp,
prim = ((union T_primitives *)mp->b_rptr)->type;
ASSERT(prim == O_T_BIND_REQ || prim == T_BIND_REQ);
- tcp = connp->conn_tcp;
- if (tcp != NULL) {
+ if (IPCL_IS_TCP(connp)) {
/* Pass sticky_ipp for scope_id and pktinfo */
- mp = ip_bind_v6(q, mp, connp, &tcp->tcp_sticky_ipp);
+ mp = ip_bind_v6(q, mp, connp, &connp->conn_tcp->tcp_sticky_ipp);
} else {
/* For UDP and ICMP */
mp = ip_bind_v6(q, mp, connp, NULL);
}
if (mp != NULL) {
- if (tcp != NULL) {
+ if (IPCL_IS_TCP(connp)) {
CONN_INC_REF(connp);
- squeue_fill(connp->conn_sqp, mp,
- ip_resume_tcp_bind, connp, SQTAG_TCP_RPUTOTHER);
- return;
+ squeue_fill(connp->conn_sqp, mp, ip_resume_tcp_bind,
+ connp, SQTAG_TCP_RPUTOTHER);
+ } else if (IPCL_IS_UDP(connp)) {
+ udp_resume_bind(connp, mp);
} else {
qreply(q, mp);
+ CONN_OPER_PENDING_DONE(connp);
}
- CONN_OPER_PENDING_DONE(connp);
}
}
@@ -2719,7 +2754,7 @@ ip_bind_connected_v6(conn_t *connp, mblk_t *mp, in6_addr_t *v6src,
if (ip_multidata_outbound && !ipsec_policy_set && dst_ire != NULL &&
!(dst_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST)) &&
(md_ill = ire_to_ill(dst_ire), md_ill != NULL) &&
- (md_ill->ill_capabilities & ILL_CAPAB_MDT)) {
+ ILL_MDT_CAPABLE(md_ill)) {
md_dst_ire = dst_ire;
IRE_REFHOLD(md_dst_ire);
}
@@ -2936,7 +2971,7 @@ ip_bind_connected_v6(conn_t *connp, mblk_t *mp, in6_addr_t *v6src,
*/
error = ipcl_conn_insert_v6(connp, protocol, v6src, v6dst,
connp->conn_ports,
- IS_TCP_CONN(connp) ? connp->conn_tcp->tcp_bound_if : 0);
+ IPCL_IS_TCP(connp) ? connp->conn_tcp->tcp_bound_if : 0);
}
if (error == 0) {
connp->conn_fully_bound = B_TRUE;
@@ -3411,8 +3446,7 @@ ip_fanout_tcp_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ill_t *ill, ill_t *inill,
ASSERT((dp->db_struioflag & STRUIO_IP) == 0);
/* Initiate IPPf processing, if needed. */
- if (IPP_ENABLED(IPP_LOCAL_IN) &&
- (flags & (IP6_NO_IPPOLICY|IP6_IN_NOCKSUM))) {
+ if (IPP_ENABLED(IPP_LOCAL_IN) && (flags & IP6_NO_IPPOLICY)) {
ill_index = ill->ill_phyint->phyint_ifindex;
ip_process(IPP_LOCAL_IN, &first_mp, ill_index);
if (first_mp == NULL) {
@@ -3447,14 +3481,14 @@ ip_fanout_tcp_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ill_t *ill, ill_t *inill,
}
mp->b_datap->db_struioflag |= STRUIO_EAGER;
- mp->b_datap->db_cksumstart = (intptr_t)sqp;
+ DB_CKSUMSTART(mp) = (intptr_t)sqp;
/*
* db_cksumstuff is unused in the incoming
+			 * path; thus store the ifindex here. It will
* be cleared in tcp_conn_create_v6().
*/
- mp->b_datap->db_cksumstuff =
+ DB_CKSUMSTUFF(mp) =
(intptr_t)ill->ill_phyint->phyint_ifindex;
syn_present = B_TRUE;
}
@@ -3587,7 +3621,6 @@ ip_fanout_udp_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, uint32_t ports,
ill_t *ill, ill_t *inill, uint_t flags, boolean_t mctl_present,
zoneid_t zoneid)
{
- queue_t *rq;
uint32_t dstport, srcport;
in6_addr_t dst;
mblk_t *first_mp;
@@ -3637,9 +3670,8 @@ ip_fanout_udp_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, uint32_t ports,
/* Found a client */
CONN_INC_REF(connp);
mutex_exit(&connfp->connf_lock);
- rq = connp->conn_rq;
- if (!canputnext(rq)) {
+ if (CONN_UDP_FLOWCTLD(connp)) {
freemsg(first_mp);
BUMP_MIB(ill->ill_ip6_mib, udpInOverflows);
CONN_DEC_REF(connp);
@@ -3691,7 +3723,10 @@ ip_fanout_udp_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, uint32_t ports,
}
}
BUMP_MIB(ill->ill_ip6_mib, ipv6InDelivers);
- putnext(rq, mp);
+
+ /* Send it upstream */
+ CONN_UDP_RECV(connp, mp);
+
IP6_STAT(ip6_udp_fannorm);
CONN_DEC_REF(connp);
if (mctl_present)
@@ -3746,7 +3781,6 @@ ip_fanout_udp_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, uint32_t ports,
mp1 = mctl_present ? first_mp1->b_cont : first_mp1;
CONN_INC_REF(connp);
mutex_exit(&connfp->connf_lock);
- rq = connp->conn_rq;
/*
* For link-local always add ifindex so that transport
* can set sin6_scope_id. Avoid it for ICMP error
@@ -3762,7 +3796,7 @@ ip_fanout_udp_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, uint32_t ports,
BUMP_MIB(ill->ill_ip6_mib, ipv6InDiscards);
goto next_one;
}
- if (!canputnext(rq)) {
+ if (CONN_UDP_FLOWCTLD(connp)) {
BUMP_MIB(ill->ill_ip6_mib, udpInOverflows);
freemsg(mp1);
goto next_one;
@@ -3778,7 +3812,9 @@ ip_fanout_udp_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, uint32_t ports,
if (mctl_present)
freeb(first_mp1);
BUMP_MIB(ill->ill_ip6_mib, ipv6InDelivers);
- putnext(rq, mp1);
+
+ /* Send it upstream */
+ CONN_UDP_RECV(connp, mp1);
}
next_one:
mutex_enter(&connfp->connf_lock);
@@ -3791,7 +3827,6 @@ next_one:
/* Last one. Send it upstream. */
mutex_exit(&connfp->connf_lock);
- rq = connp->conn_rq;
/* Initiate IPPF processing */
if (IP6_IN_IPP(flags)) {
@@ -3830,7 +3865,7 @@ next_one:
first_mp = mp;
}
}
- if (!canputnext(rq)) {
+ if (CONN_UDP_FLOWCTLD(connp)) {
BUMP_MIB(ill->ill_ip6_mib, udpInOverflows);
freemsg(mp);
} else {
@@ -3844,7 +3879,9 @@ next_one:
}
}
BUMP_MIB(ill->ill_ip6_mib, ipv6InDelivers);
- putnext(rq, mp);
+
+ /* Send it upstream */
+ CONN_UDP_RECV(connp, mp);
}
IP6_STAT(ip6_udp_fanmb);
CONN_DEC_REF(connp);
@@ -6447,7 +6484,7 @@ ip_rput_v6(queue_t *q, mblk_t *mp)
*/
if ((mp->b_datap->db_type != M_PCPROTO) ||
(dl->dl_primitive == DL_UNITDATA_IND)) {
- ip_ioctl_freemsg(mp);
+ inet_freemsg(mp);
return;
}
}
@@ -6835,14 +6872,16 @@ ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h,
mblk_t *first_mp1;
boolean_t no_forward;
ip6_hbh_t *hbhhdr;
- boolean_t no_cksum = (flags & IP6_IN_NOCKSUM);
boolean_t ll_multicast = (flags & IP6_IN_LLMCAST);
conn_t *connp;
- int off;
ilm_t *ilm;
uint32_t ports;
uint_t ipif_id = 0;
zoneid_t zoneid = GLOBAL_ZONEID;
+ uint16_t hck_flags, reass_hck_flags;
+ uint32_t reass_sum;
+ boolean_t cksum_err;
+ mblk_t *mp1;
EXTRACT_PKT_MP(mp, first_mp, mctl_present);
@@ -6899,11 +6938,14 @@ ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h,
pkt_len -= diff;
}
- /*
- * XXX When zero-copy support is added, this turning off of
- * checksum flag will need to be done more selectively.
- */
- mp->b_datap->db_struioun.cksum.flags &= ~HCK_PARTIALCKSUM;
+ if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum)
+ hck_flags = DB_CKSUMFLAGS(mp);
+ else
+ hck_flags = 0;
+
+ /* Clear checksum flags in case we need to forward */
+ DB_CKSUMFLAGS(mp) = 0;
+ reass_sum = reass_hck_flags = 0;
nexthdr = ip6h->ip6_nxt;
@@ -7168,7 +7210,6 @@ ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h,
/* TBD add site-local check at site boundary? */
} else if (ipv6_send_redirects) {
in6_addr_t *v6targ;
- mblk_t *mp1;
in6_addr_t gw_addr_v6;
ire_t *src_ire_v6 = NULL;
@@ -7313,7 +7354,6 @@ ipv6forus:
case IPPROTO_TCP: {
uint16_t *up;
uint32_t sum;
- dblk_t *dp;
int offset;
hdr_len = pkt_len - remlen;
@@ -7336,6 +7376,7 @@ ipv6forus:
freemsg(first_mp);
return;
}
+ hck_flags = 0;
ip6h = (ip6_t *)mp->b_rptr;
whereptr = (uint8_t *)ip6h + hdr_len;
}
@@ -7368,30 +7409,12 @@ ipv6forus:
freemsg(first_mp);
return;
}
+ hck_flags = 0;
ip6h = (ip6_t *)mp->b_rptr;
whereptr = (uint8_t *)ip6h + hdr_len;
}
}
- /*
- * If packet is being looped back locally checksums
- * aren't used
- */
- if (no_cksum) {
- if (mp->b_datap->db_type == M_DATA) {
- /*
- * M_DATA mblk, so init mblk (chain)
- * for no struio().
- */
- mblk_t *mp1 = mp;
-
- do {
- mp1->b_datap->db_struioflag = 0;
- } while ((mp1 = mp1->b_cont) != NULL);
- }
- goto tcp_fanout;
- }
-
up = (uint16_t *)&ip6h->ip6_src;
/*
* TCP checksum calculation. First sum up the
@@ -7400,44 +7423,38 @@ ipv6forus:
* - Destination IPv6 address
* - TCP payload length
* - TCP protocol ID
- * XXX need zero-copy support here
*/
sum = htons(IPPROTO_TCP + remlen) +
up[0] + up[1] + up[2] + up[3] +
up[4] + up[5] + up[6] + up[7] +
up[8] + up[9] + up[10] + up[11] +
up[12] + up[13] + up[14] + up[15];
+
+ /* Fold initial sum */
sum = (sum & 0xffff) + (sum >> 16);
- dp = mp->b_datap;
- if (dp->db_type != M_DATA || dp->db_ref > 1) {
- /*
- * Not M_DATA mblk or its a dup, so do the
- * checksum now.
- */
- sum = IP_CSUM(mp, hdr_len, sum);
- if (sum) {
- /* checksum failed */
- ip1dbg(("ip_rput_data_v6: TCP checksum"
- " failed %x off %d\n",
- sum, hdr_len));
- BUMP_MIB(&ip_mib, tcpInErrs);
- freemsg(first_mp);
- return;
- }
- } else {
- /*
- * M_DATA mblk and not a dup
- * compute checksum here
- */
- off = (int)(whereptr - mp->b_rptr);
- if (IP_CSUM(mp, off, sum)) {
- BUMP_MIB(&ip_mib, tcpInErrs);
- ipcsumdbg("ip_rput_data_v6 "
- "swcksumerr\n", mp);
- freemsg(first_mp);
- return;
- }
+ mp1 = mp->b_cont;
+
+ if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0)
+ IP6_STAT(ip6_in_sw_cksum);
+
+ IP_CKSUM_RECV(hck_flags, sum, (uchar_t *)
+ ((uchar_t *)mp->b_rptr + DB_CKSUMSTART(mp)),
+ (int32_t)(whereptr - (uchar_t *)mp->b_rptr),
+ mp, mp1, cksum_err);
+
+ if (cksum_err) {
+ BUMP_MIB(&ip_mib, tcpInErrs);
+
+ if (hck_flags & HCK_FULLCKSUM)
+ IP6_STAT(ip6_tcp_in_full_hw_cksum_err);
+ else if (hck_flags & HCK_PARTIALCKSUM)
+ IP6_STAT(ip6_tcp_in_part_hw_cksum_err);
+ else
+ IP6_STAT(ip6_tcp_in_sw_cksum_err);
+
+ freemsg(first_mp);
+ return;
}
tcp_fanout:
ip_fanout_tcp_v6(q, first_mp, ip6h, ill, inill,
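The pseudo-header arithmetic above is ordinary one's-complement summation and can be shown in isolation. A self-contained sketch (assuming, as the kernel expression does, that proto + payload_len fits in 16 bits):

#include <arpa/inet.h>
#include <stdint.h>

/*
 * Sum the IPv6 pseudo-header: protocol + payload length, then the
 * sixteen 16-bit words of source + destination address (network
 * order), folding all carries back into the low 16 bits.
 */
static uint16_t
pseudo_hdr_sum(const uint16_t addr_words[16], uint16_t payload_len,
    uint8_t proto)
{
	uint32_t sum = htons((uint16_t)(proto + payload_len));
	int i;

	for (i = 0; i < 16; i++)
		sum += addr_words[i];
	while (sum >> 16)		/* fold carries */
		sum = (sum & 0xFFFF) + (sum >> 16);
	return ((uint16_t)sum);
}

The kernel folds only once at this point because a fully folded value is not needed yet; the partial sum is handed to IP_CKSUM_RECV, which continues the accumulation over the payload.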
@@ -7468,18 +7485,16 @@ tcp_fanout:
}
sctph = (sctp_hdr_t *)(mp->b_rptr + hdr_len);
- if (!no_cksum) {
- /* checksum */
- pktsum = sctph->sh_chksum;
- sctph->sh_chksum = 0;
- calcsum = sctp_cksum(mp, hdr_len);
- if (calcsum != pktsum) {
- BUMP_MIB(&sctp_mib, sctpChecksumError);
- freemsg(mp);
- return;
- }
- sctph->sh_chksum = pktsum;
+ /* checksum */
+ pktsum = sctph->sh_chksum;
+ sctph->sh_chksum = 0;
+ calcsum = sctp_cksum(mp, hdr_len);
+ if (calcsum != pktsum) {
+ BUMP_MIB(&sctp_mib, sctpChecksumError);
+ freemsg(mp);
+ return;
}
+ sctph->sh_chksum = pktsum;
ports = *(uint32_t *)(mp->b_rptr + hdr_len);
if ((connp = sctp_find_conn(&ip6h->ip6_src,
&ip6h->ip6_dst, ports, ipif_id, zoneid)) == NULL) {
@@ -7501,8 +7516,6 @@ tcp_fanout:
hdr_len = pkt_len - remlen;
-#define UDPH_SIZE 8
-
if (hada_mp != NULL) {
ip0dbg(("udp hada drop\n"));
goto hada_drop;
@@ -7519,16 +7532,10 @@ tcp_fanout:
freemsg(first_mp);
return;
}
+ hck_flags = 0;
ip6h = (ip6_t *)mp->b_rptr;
whereptr = (uint8_t *)ip6h + hdr_len;
}
-#undef UDPH_SIZE
- /*
- * If packet is being looped back locally checksums
- * aren't used
- */
- if (no_cksum)
- goto udp_fanout;
/*
* Before going through the regular checksum
@@ -7568,15 +7575,37 @@ tcp_fanout:
up[8] + up[9] + up[10] + up[11] +
up[12] + up[13] + up[14] + up[15];
+ /* Fold initial sum */
sum = (sum & 0xffff) + (sum >> 16);
- /* Next sum in the UDP packet */
- sum = IP_CSUM(mp, hdr_len, sum);
- if (sum) {
- /* UDP checksum failed */
- ip1dbg(("ip_rput_data_v6: UDP checksum "
- "failed %x\n",
- sum));
+
+ if (reass_hck_flags != 0) {
+ hck_flags = reass_hck_flags;
+
+ IP_CKSUM_RECV_REASS(hck_flags,
+ (int32_t)(whereptr - (uchar_t *)mp->b_rptr),
+ sum, reass_sum, cksum_err);
+ } else {
+ mp1 = mp->b_cont;
+
+ IP_CKSUM_RECV(hck_flags, sum, (uchar_t *)
+ ((uchar_t *)mp->b_rptr + DB_CKSUMSTART(mp)),
+ (int32_t)(whereptr - (uchar_t *)mp->b_rptr),
+ mp, mp1, cksum_err);
+ }
+
+ if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0)
+ IP6_STAT(ip6_in_sw_cksum);
+
+ if (cksum_err) {
BUMP_MIB(ill->ill_ip6_mib, udpInCksumErrs);
+
+ if (hck_flags & HCK_FULLCKSUM)
+ IP6_STAT(ip6_udp_in_full_hw_cksum_err);
+ else if (hck_flags & HCK_PARTIALCKSUM)
+ IP6_STAT(ip6_udp_in_part_hw_cksum_err);
+ else
+ IP6_STAT(ip6_udp_in_sw_cksum_err);
+
freemsg(first_mp);
return;
}
@@ -7592,13 +7621,6 @@ tcp_fanout:
goto hada_drop;
}
- /*
- * If packet is being looped back locally checksums
- * aren't used
- */
- if (no_cksum)
- goto icmp_fanout;
-
up = (uint16_t *)&ip6h->ip6_src;
sum = htons(IPPROTO_ICMPV6 + remlen) +
up[0] + up[1] + up[2] + up[3] +
@@ -7607,7 +7629,7 @@ tcp_fanout:
up[12] + up[13] + up[14] + up[15];
sum = (sum & 0xffff) + (sum >> 16);
sum = IP_CSUM(mp, hdr_len, sum);
- if (sum) {
+ if (sum != 0) {
/* IPv6 ICMP checksum failed */
ip1dbg(("ip_rput_data_v6: ICMPv6 checksum "
"failed %x\n",
@@ -7795,6 +7817,7 @@ tcp_fanout:
freemsg(mp);
return;
}
+ hck_flags = 0;
ip6h = (ip6_t *)mp->b_rptr;
whereptr = (uint8_t *)ip6h + pkt_len - remlen;
}
@@ -7820,8 +7843,12 @@ tcp_fanout:
}
}
+ /* Restore the flags */
+ DB_CKSUMFLAGS(mp) = hck_flags;
+
mp = ip_rput_frag_v6(q, mp, ip6h, fraghdr,
- remlen - used, &prev_nexthdr_offset);
+ remlen - used, &prev_nexthdr_offset,
+ &reass_sum, &reass_hck_flags);
if (mp == NULL) {
/* Reassembly is still pending */
return;
@@ -8032,7 +8059,7 @@ udp_fanout:
return;
}
- if (!canputnext(connp->conn_upq)) {
+ if (CONN_UDP_FLOWCTLD(connp)) {
freemsg(first_mp);
BUMP_MIB(ill->ill_ip6_mib, udpInOverflows);
CONN_DEC_REF(connp);
@@ -8062,7 +8089,9 @@ udp_fanout:
IP6_STAT(ip6_udp_fast_path);
BUMP_MIB(ill->ill_ip6_mib, ipv6InReceives);
BUMP_MIB(ill->ill_ip6_mib, ipv6InDelivers);
- putnext(connp->conn_upq, mp);
+
+ /* Send it upstream */
+ CONN_UDP_RECV(connp, mp);
CONN_DEC_REF(connp);
freemsg(hada_mp);
@@ -8086,7 +8115,8 @@ hada_drop:
*/
static mblk_t *
ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
- ip6_frag_t *fraghdr, uint_t remlen, uint_t *prev_nexthdr_offset)
+ ip6_frag_t *fraghdr, uint_t remlen, uint_t *prev_nexthdr_offset,
+ uint32_t *cksum_val, uint16_t *cksum_flags)
{
ill_t *ill = (ill_t *)q->q_ptr;
uint32_t ident = ntohl(fraghdr->ip6f_ident);
@@ -8107,6 +8137,62 @@ ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
mblk_t *tail_mp;
mblk_t *t_mp;
boolean_t pruned = B_FALSE;
+ uint32_t sum_val;
+ uint16_t sum_flags;
+
+
+ if (cksum_val != NULL)
+ *cksum_val = 0;
+ if (cksum_flags != NULL)
+ *cksum_flags = 0;
+
+ /*
+	 * We utilize hardware-computed checksum info only for UDP since
+	 * IP fragmentation is a normal occurrence for the protocol. In
+ * addition, checksum offload support for IP fragments carrying
+ * UDP payload is commonly implemented across network adapters.
+ */
+ ASSERT(ill != NULL);
+ if (nexthdr == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(ill) &&
+ (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) {
+ mblk_t *mp1 = mp->b_cont;
+ int32_t len;
+
+ /* Record checksum information from the packet */
+ sum_val = (uint32_t)DB_CKSUM16(mp);
+ sum_flags = DB_CKSUMFLAGS(mp);
+
+ /* fragmented payload offset from beginning of mblk */
+ offset = (uint16_t)((uchar_t *)&fraghdr[1] - mp->b_rptr);
+
+ if ((sum_flags & HCK_PARTIALCKSUM) &&
+ (mp1 == NULL || mp1->b_cont == NULL) &&
+ offset >= (uint16_t)DB_CKSUMSTART(mp) &&
+ ((len = offset - (uint16_t)DB_CKSUMSTART(mp)) & 1) == 0) {
+ uint32_t adj;
+ /*
+ * Partial checksum has been calculated by hardware
+ * and attached to the packet; in addition, any
+ * prepended extraneous data is even byte aligned.
+ * If any such data exists, we adjust the checksum;
+ * this would also handle any postpended data.
+ */
+ IP_ADJCKSUM_PARTIAL(mp->b_rptr + DB_CKSUMSTART(mp),
+ mp, mp1, len, adj);
+
+ /* One's complement subtract extraneous checksum */
+ if (adj >= sum_val)
+ sum_val = ~(adj - sum_val) & 0xFFFF;
+ else
+ sum_val -= adj;
+ }
+ } else {
+ sum_val = 0;
+ sum_flags = 0;
+ }
+
+ /* Clear hardware checksumming flag */
+ DB_CKSUMFLAGS(mp) = 0;
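The adjustment above removes, in one's-complement arithmetic, the hardware's sum over bytes that precede the fragmented payload. The subtraction rule on its own, a sketch of the special case handled after IP_ADJCKSUM_PARTIAL:

#include <stdint.h>

/*
 * One's-complement subtraction: remove "adj" (the folded sum of the
 * extraneous leading bytes) from the hardware-computed partial
 * checksum "sum".
 */
static uint32_t
cksum_subtract(uint32_t sum, uint32_t adj)
{
	if (adj >= sum)
		return (~(adj - sum) & 0xFFFF);
	return (sum - adj);
}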
/*
* Note: Fragment offset in header is in 8-octet units.
@@ -8159,7 +8245,6 @@ ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
* Drop the fragmented as early as possible, if
* we don't have resource(s) to re-assemble.
*/
-
if (ip_reass_queue_bytes == 0) {
freemsg(mp);
return (NULL);
@@ -8183,12 +8268,11 @@ ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
* there is anything on the reassembly queue, the timer will
* be running.
*/
- msg_len = mp->b_datap->db_lim - mp->b_datap->db_base;
+ msg_len = MBLKSIZE(mp);
tail_mp = mp;
while (tail_mp->b_cont != NULL) {
tail_mp = tail_mp->b_cont;
- msg_len += tail_mp->b_datap->db_lim -
- tail_mp->b_datap->db_base;
+ msg_len += MBLKSIZE(tail_mp);
}
/*
* If the reassembly list for this ILL will get too big
@@ -8287,7 +8371,7 @@ ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
ipf->ipf_timestamp = gethrestime_sec();
/* Record ipf generation and account for frag header */
ipf->ipf_gen = ill->ill_ipf_gen++;
- ipf->ipf_count = mp1->b_datap->db_lim - mp1->b_datap->db_base;
+ ipf->ipf_count = MBLKSIZE(mp1);
ipf->ipf_protocol = nexthdr;
ipf->ipf_nf_hdr_len = 0;
ipf->ipf_prev_nexthdr_offset = 0;
@@ -8295,6 +8379,16 @@ ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
ipf->ipf_ecn = ecn_info;
ipf->ipf_num_dups = 0;
ipfb->ipfb_frag_pkts++;
+ ipf->ipf_checksum = 0;
+ ipf->ipf_checksum_flags = 0;
+
+ /* Store checksum value in fragment header */
+ if (sum_flags != 0) {
+ sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
+ sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
+ ipf->ipf_checksum = sum_val;
+ ipf->ipf_checksum_flags = sum_flags;
+ }
/*
* We handle reassembly two ways. In the easy case,
@@ -8326,6 +8420,10 @@ ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
* on easy reassembly.
*/
ipf->ipf_end = 0;
+
+ /* Forget checksum offload from now on */
+ ipf->ipf_checksum_flags = 0;
+
/*
* ipf_hole_cnt is set by ip_reassemble.
* ipf_count is updated by ip_reassemble.
@@ -8349,6 +8447,23 @@ ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
}
/*
+	 * If the packet's checksum flags have changed (it could be
+	 * coming in from an interface different from the previous one,
+	 * and therefore with possibly different checksum capability),
+	 * then forget any stored checksum state. Otherwise add the value
+	 * to the existing one stored in the fragment header.
+ */
+ if (sum_flags != 0 && sum_flags == ipf->ipf_checksum_flags) {
+ sum_val += ipf->ipf_checksum;
+ sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
+ sum_val = (sum_val & 0xFFFF) + (sum_val >> 16);
+ ipf->ipf_checksum = sum_val;
+ } else if (ipf->ipf_checksum_flags != 0) {
+ /* Forget checksum offload from now on */
+ ipf->ipf_checksum_flags = 0;
+ }
+
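Merging a fragment's partial sum into the stored value is addition followed by two folds; two are always enough to bring a 32-bit accumulator back under 16 bits, since the first fold can itself produce one more carry. The same idiom, standalone:

#include <stdint.h>

/* Fold a new fragment's 16-bit partial sum into the running total. */
static uint32_t
cksum_accumulate(uint32_t total, uint32_t frag_sum)
{
	uint32_t sum = total + frag_sum;

	sum = (sum & 0xFFFF) + (sum >> 16);	/* first fold */
	sum = (sum & 0xFFFF) + (sum >> 16);	/* carry from the fold */
	return (sum);
}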
+ /*
* We have a new piece of a datagram which is already being
* reassembled. Update the ECN info if all IP fragments
* are ECN capable. If there is one which is not, clear
@@ -8443,6 +8558,13 @@ ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
nexthdr = ipf->ipf_protocol;
*prev_nexthdr_offset = ipf->ipf_prev_nexthdr_offset;
ipfp = ipf->ipf_ptphn;
+
+ /* We need to supply these to caller */
+ if ((sum_flags = ipf->ipf_checksum_flags) != 0)
+ sum_val = ipf->ipf_checksum;
+ else
+ sum_val = 0;
+
mp1 = ipf->ipf_mp;
count = ipf->ipf_count;
ipf = ipf->ipf_hash_next;
@@ -8508,6 +8630,12 @@ reass_done:
ip6h->ip6_vcf &= htonl(0xFFCFFFFF);
ip6h->ip6_vcf |= htonl(ecn_info << 20);
+ /* Reassembly is successful; return checksum information if needed */
+ if (cksum_val != NULL)
+ *cksum_val = sum_val;
+ if (cksum_flags != NULL)
+ *cksum_flags = sum_flags;
+
return (mp);
}
@@ -9954,7 +10082,7 @@ notv6:
if (q->q_next == NULL) {
connp = Q_TO_CONN(q);
- if (IS_TCP_CONN(connp)) {
+ if (IPCL_IS_TCP(connp)) {
/* change conn_send for the tcp_v4_connections */
connp->conn_send = ip_output;
} else if (connp->conn_ulp == IPPROTO_SCTP) {
@@ -10426,12 +10554,52 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src,
uint32_t sum;
uint_t ill_index = ((ill_t *)ire->ire_stq->q_ptr)->
ill_phyint->phyint_ifindex;
+ queue_t *dev_q = ire->ire_stq->q_next;
/*
* non-NULL send-to queue - packet is to be sent
* out an interface.
*/
+ /* Driver is flow-controlling? */
+ if (!IP_FLOW_CONTROLLED_ULP(nexthdr) &&
+ ((dev_q->q_next || dev_q->q_first) && !canput(dev_q))) {
+ /*
+			 * Queue the packet if we have a conn to give back
+			 * pressure. We can't queue packets intended for
+			 * hardware acceleration since we've tossed that
+			 * state already. If the packet is being fed back
+			 * from ire_send_v6, we don't know the position in
+			 * the queue to enqueue the packet, so we discard
+			 * it.
+ */
+ ASSERT(mp == first_mp);
+ if (ip_output_queue && connp != NULL &&
+ !mctl_present && caller != IRE_SEND) {
+ if (caller == IP_WSRV) {
+ connp->conn_did_putbq = 1;
+ (void) putbq(connp->conn_wq, mp);
+ conn_drain_insert(connp);
+ /*
+ * caller == IP_WSRV implies we are
+ * the service thread, and the
+ * queue is already noenabled.
+					 * The canput check and the
+					 * putbq are not atomic, so
+					 * we need to check again.
+ */
+ if (canput(dev_q))
+ connp->conn_did_putbq = 0;
+ } else {
+ (void) putq(connp->conn_wq, mp);
+ }
+ return;
+ }
+ BUMP_MIB(mibptr, ipv6OutDiscards);
+ freemsg(mp);
+ return;
+ }
+
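The putbq path above guards against a lost wakeup: the canput test and the putbq are not atomic, so the driver may drain (and back-enable) in between, leaving the packet parked forever. Re-testing after the putbq closes that window. Schematically, with q_can_accept() and requeue() as hypothetical stand-ins for canput() and putbq(), and the drain-list insertion omitted:

#include <stdbool.h>

/* Trivial stubs so the sketch is self-contained; not the STREAMS API. */
static bool q_can_accept(void *devq) { (void)devq; return (false); }
static void requeue(void *wq, void *pkt) { (void)wq; (void)pkt; }

struct conn_state {
	int	did_putbq;	/* mirrors conn_did_putbq */
	void	*wq;		/* our write-side queue */
};

/*
 * Park a packet when the device queue is flow-controlled, then
 * re-test; if the driver drained between the first test and the
 * requeue, clear did_putbq so nobody waits for a back-enable that
 * will never come.
 */
static void
park_packet(struct conn_state *cp, void *devq, void *pkt)
{
	cp->did_putbq = 1;
	requeue(cp->wq, pkt);
	if (q_can_accept(devq))
		cp->did_putbq = 0;
}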
/*
* Look for reachability confirmations from the transport.
*/
@@ -10490,20 +10658,20 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src,
up[12] + up[13] + up[14] + up[15];
sum = (sum & 0xffff) + (sum >> 16);
*insp = IP_CSUM(mp, hdr_length, sum);
+ if (*insp == 0)
+ *insp = 0xFFFF;
} else if (nexthdr == IPPROTO_TCP) {
uint16_t *up;
/*
* Check for full IPv6 header + enough TCP header
* to get at the checksum field.
- * XXX need hardware checksum support.
*/
-#define TCP_CSUM_OFFSET 16
-#define TCP_CSUM_SIZE 2
if ((mp->b_wptr - mp->b_rptr) <
- (hdr_length + TCP_CSUM_OFFSET + TCP_CSUM_SIZE)) {
+ (hdr_length + TCP_CHECKSUM_OFFSET +
+ TCP_CHECKSUM_SIZE)) {
if (!pullupmsg(mp, hdr_length +
- TCP_CSUM_OFFSET + TCP_CSUM_SIZE)) {
+ TCP_CHECKSUM_OFFSET + TCP_CHECKSUM_SIZE)) {
ip1dbg(("ip_wput_v6: TCP hdr pullupmsg"
" failed\n"));
BUMP_MIB(mibptr, ipv6OutDiscards);
@@ -10519,30 +10687,28 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src,
* into the tcp checksum field, so we don't
* need to explicitly sum it in here.
*/
- if (hdr_length == IPV6_HDR_LEN) {
- /* src, dst, tcp consequtive */
- up = (uint16_t *)(((uchar_t *)ip6h) +
- IPV6_HDR_LEN + TCP_CSUM_OFFSET);
- *up = IP_CSUM(mp,
- IPV6_HDR_LEN - 2 * sizeof (in6_addr_t),
- htons(IPPROTO_TCP));
- } else {
- sum = htons(IPPROTO_TCP) +
- up[0] + up[1] + up[2] + up[3] +
- up[4] + up[5] + up[6] + up[7] +
- up[8] + up[9] + up[10] + up[11] +
- up[12] + up[13] + up[14] + up[15];
- /*
- * Fold the initial sum.
- */
- sum = (sum & 0xffff) + (sum >> 16);
- up = (uint16_t *)(((uchar_t *)ip6h) +
- hdr_length + TCP_CSUM_OFFSET);
- *up = IP_CSUM(mp, hdr_length, sum);
- }
-#undef TCP_CSUM_OFFSET
-#undef TCP_CSUM_SIZE
+ sum = up[0] + up[1] + up[2] + up[3] +
+ up[4] + up[5] + up[6] + up[7] +
+ up[8] + up[9] + up[10] + up[11] +
+ up[12] + up[13] + up[14] + up[15];
+
+ /* Fold the initial sum */
+ sum = (sum & 0xffff) + (sum >> 16);
+
+ up = (uint16_t *)(((uchar_t *)ip6h) +
+ hdr_length + TCP_CHECKSUM_OFFSET);
+ IP_CKSUM_XMIT(ill, ire, mp, ip6h, up, IPPROTO_TCP,
+ hdr_length, ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN,
+ ire->ire_max_frag, mctl_present, sum);
+
+ /* Software checksum? */
+ if (DB_CKSUMFLAGS(mp) == 0) {
+ IP6_STAT(ip6_out_sw_cksum);
+ IP6_STAT_UPDATE(ip6_tcp_out_sw_cksum_bytes,
+ (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN) -
+ hdr_length);
+ }
} else if (nexthdr == IPPROTO_UDP) {
uint16_t *up;
@@ -10550,12 +10716,10 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src,
* check for full IPv6 header + enough UDP header
* to get at the UDP checksum field
*/
-#define UDP_CSUM_OFFSET 6
-#define UDP_CSUM_SIZE 2
if ((mp->b_wptr - mp->b_rptr) < (hdr_length +
- UDP_CSUM_OFFSET + UDP_CSUM_SIZE)) {
+ UDP_CHECKSUM_OFFSET + UDP_CHECKSUM_SIZE)) {
if (!pullupmsg(mp, hdr_length +
- UDP_CSUM_OFFSET + UDP_CSUM_SIZE)) {
+ UDP_CHECKSUM_OFFSET + UDP_CHECKSUM_SIZE)) {
ip1dbg(("ip_wput_v6: UDP hdr pullupmsg"
" failed\n"));
BUMP_MIB(mibptr, ipv6OutDiscards);
@@ -10570,34 +10734,28 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src,
* into the udp checksum field, so we don't
* need to explicitly sum it in here.
*/
- if (hdr_length == IPV6_HDR_LEN) {
- /* src, dst, udp consequtive */
- up = (uint16_t *)(((uchar_t *)ip6h) +
- IPV6_HDR_LEN + UDP_CSUM_OFFSET);
- *up = IP_CSUM(mp,
- IPV6_HDR_LEN - 2 * sizeof (in6_addr_t),
- htons(IPPROTO_UDP));
- } else {
- sum = htons(IPPROTO_UDP) +
- up[0] + up[1] + up[2] + up[3] +
- up[4] + up[5] + up[6] + up[7] +
- up[8] + up[9] + up[10] + up[11] +
- up[12] + up[13] + up[14] + up[15];
- sum = (sum & 0xffff) + (sum >> 16);
- up = (uint16_t *)(((uchar_t *)ip6h) +
- hdr_length + UDP_CSUM_OFFSET);
- *up = IP_CSUM(mp, hdr_length, sum);
- }
+ sum = up[0] + up[1] + up[2] + up[3] +
+ up[4] + up[5] + up[6] + up[7] +
+ up[8] + up[9] + up[10] + up[11] +
+ up[12] + up[13] + up[14] + up[15];
- /*
- * According to RFC 2460, UDP in IPv6 shouldn't
- * appear with all zero checksum on the wire and
- * should be changed to 0xffff.
- */
- if (*up == 0)
- *up = 0xffff;
-#undef UDP_CSUM_OFFSET
-#undef UDP_CSUM_SIZE
+ /* Fold the initial sum */
+ sum = (sum & 0xffff) + (sum >> 16);
+
+ up = (uint16_t *)(((uchar_t *)ip6h) +
+ hdr_length + UDP_CHECKSUM_OFFSET);
+
+ IP_CKSUM_XMIT(ill, ire, mp, ip6h, up, IPPROTO_UDP,
+ hdr_length, ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN,
+ ire->ire_max_frag, mctl_present, sum);
+
+ /* Software checksum? */
+ if (DB_CKSUMFLAGS(mp) == 0) {
+ IP6_STAT(ip6_out_sw_cksum);
+ IP6_STAT_UPDATE(ip6_udp_out_sw_cksum_bytes,
+ (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN) -
+ hdr_length);
+ }
} else if (nexthdr == IPPROTO_ICMPV6) {
uint16_t *up;
icmp6_t *icmp6;
@@ -10627,6 +10785,9 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src,
up[12] + up[13] + up[14] + up[15];
sum = (sum & 0xffff) + (sum >> 16);
icmp6->icmp6_cksum = IP_CSUM(mp, hdr_length, sum);
+ if (icmp6->icmp6_cksum == 0)
+ icmp6->icmp6_cksum = 0xFFFF;
+
/* Update output mib stats */
icmp_update_out_mib_v6(ill, icmp6);
} else if (nexthdr == IPPROTO_SCTP) {
@@ -10764,6 +10925,223 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src,
}
/*
+ * Outbound IPv6 fragmentation routine using MDT.
+ */
+static void
+ip_wput_frag_mdt_v6(mblk_t *mp, ire_t *ire, size_t max_chunk,
+ size_t unfragmentable_len, uint8_t nexthdr, uint_t prev_nexthdr_offset)
+{
+ ip6_t *ip6h = (ip6_t *)mp->b_rptr;
+ uint_t pkts, wroff, hdr_chunk_len, pbuf_idx;
+ mblk_t *hdr_mp, *md_mp = NULL;
+ int i1;
+ multidata_t *mmd;
+ unsigned char *hdr_ptr, *pld_ptr;
+ ip_pdescinfo_t pdi;
+ uint32_t ident;
+ size_t len;
+ uint16_t offset;
+ queue_t *stq = ire->ire_stq;
+ ill_t *ill = (ill_t *)stq->q_ptr;
+
+ ASSERT(DB_TYPE(mp) == M_DATA);
+ ASSERT(MBLKL(mp) > unfragmentable_len);
+
+ /*
+	 * Move read ptr past the unfragmentable portion; we don't want this part
+ * of the data in our fragments.
+ */
+ mp->b_rptr += unfragmentable_len;
+
+ /* Calculate how many packets we will send out */
+ i1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgsize(mp);
+ pkts = (i1 + max_chunk - 1) / max_chunk;
+ ASSERT(pkts > 1);
+
+ /* Allocate a message block which will hold all the IP Headers. */
+ wroff = ip_wroff_extra;
+ hdr_chunk_len = wroff + unfragmentable_len + sizeof (ip6_frag_t);
+
+ i1 = pkts * hdr_chunk_len;
+ /*
+ * Create the header buffer, Multidata and destination address
+ * and SAP attribute that should be associated with it.
+ */
+ if ((hdr_mp = allocb(i1, BPRI_HI)) == NULL ||
+ ((hdr_mp->b_wptr += i1),
+ (mmd = mmd_alloc(hdr_mp, &md_mp, KM_NOSLEEP)) == NULL) ||
+ !ip_md_addr_attr(mmd, NULL, ire->ire_nce->nce_res_mp)) {
+ freemsg(mp);
+ if (md_mp == NULL) {
+ freemsg(hdr_mp);
+ } else {
+free_mmd: IP6_STAT(ip6_frag_mdt_discarded);
+ freemsg(md_mp);
+ }
+ IP6_STAT(ip6_frag_mdt_allocfail);
+ BUMP_MIB(ill->ill_ip6_mib, ipv6OutFragFails);
+ UPDATE_MIB(ill->ill_ip6_mib, ipv6OutDiscards, pkts);
+ return;
+ }
+ IP6_STAT(ip6_frag_mdt_allocd);
+
+ /*
+ * Add a payload buffer to the Multidata; this operation must not
+ * fail, or otherwise our logic in this routine is broken. There
+ * is no memory allocation done by the routine, so any returned
+ * failure simply tells us that we've done something wrong.
+ *
+ * A failure tells us that either we're adding the same payload
+ * buffer more than once, or we're trying to add more buffers than
+	 * allowed. None of the above cases should happen, and we panic
+	 * because there's either horrible heap corruption or a
+	 * programming mistake.
+ */
+ if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) {
+ goto pbuf_panic;
+ }
+
+ hdr_ptr = hdr_mp->b_rptr;
+ pld_ptr = mp->b_rptr;
+
+ pdi.flags = PDESC_HBUF_REF | PDESC_PBUF_REF;
+
+ ident = htonl(atomic_add_32_nv(&ire->ire_ident, 1));
+
+ /*
+ * len is the total length of the fragmentable data in this
+ * datagram. For each fragment sent, we will decrement len
+ * by the amount of fragmentable data sent in that fragment
+ * until len reaches zero.
+ */
+ len = ntohs(ip6h->ip6_plen) - (unfragmentable_len - IPV6_HDR_LEN);
+
+ offset = 0;
+ prev_nexthdr_offset += wroff;
+
+ while (len != 0) {
+ size_t mlen;
+ ip6_t *fip6h;
+ ip6_frag_t *fraghdr;
+ int error;
+
+ ASSERT((hdr_ptr + hdr_chunk_len) <= hdr_mp->b_wptr);
+ mlen = MIN(len, max_chunk);
+ len -= mlen;
+
+ fip6h = (ip6_t *)(hdr_ptr + wroff);
+ ASSERT(OK_32PTR(fip6h));
+ bcopy(ip6h, fip6h, unfragmentable_len);
+ hdr_ptr[prev_nexthdr_offset] = IPPROTO_FRAGMENT;
+
+ fip6h->ip6_plen = htons((uint16_t)(mlen +
+ unfragmentable_len - IPV6_HDR_LEN + sizeof (ip6_frag_t)));
+
+ fraghdr = (ip6_frag_t *)((unsigned char *)fip6h +
+ unfragmentable_len);
+ fraghdr->ip6f_nxt = nexthdr;
+ fraghdr->ip6f_reserved = 0;
+ fraghdr->ip6f_offlg = htons(offset) |
+ ((len != 0) ? IP6F_MORE_FRAG : 0);
+ fraghdr->ip6f_ident = ident;
+
+ /*
+ * Record offset and size of header and data of the next packet
+ * in the multidata message.
+ */
+ PDESC_HDR_ADD(&pdi, hdr_ptr, wroff,
+ unfragmentable_len + sizeof (ip6_frag_t), 0);
+ PDESC_PLD_INIT(&pdi);
+ i1 = MIN(mp->b_wptr - pld_ptr, mlen);
+ ASSERT(i1 > 0);
+ PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, pld_ptr, i1);
+ if (i1 == mlen) {
+ pld_ptr += mlen;
+ } else {
+ i1 = mlen - i1;
+ mp = mp->b_cont;
+ ASSERT(mp != NULL);
+ ASSERT(MBLKL(mp) >= i1);
+ /*
+ * Attach the next payload message block to the
+ * multidata message.
+ */
+ if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0)
+ goto pbuf_panic;
+ PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, mp->b_rptr, i1);
+ pld_ptr = mp->b_rptr + i1;
+ }
+
+ if ((mmd_addpdesc(mmd, (pdescinfo_t *)&pdi, &error,
+ KM_NOSLEEP)) == NULL) {
+ /*
+ * Any failure other than ENOMEM indicates that we
+ * have passed in invalid pdesc info or parameters
+ * to mmd_addpdesc, which must not happen.
+ *
+ * EINVAL is a result of failure on boundary checks
+ * against the pdesc info contents. It should not
+			 * happen, and we panic because there's either
+			 * horrible heap corruption or a programming
+			 * mistake.
+ */
+ if (error != ENOMEM) {
+ cmn_err(CE_PANIC, "ip_wput_frag_mdt_v6: "
+ "pdesc logic error detected for "
+ "mmd %p pinfo %p (%d)\n",
+ (void *)mmd, (void *)&pdi, error);
+ /* NOTREACHED */
+ }
+ IP6_STAT(ip6_frag_mdt_addpdescfail);
+ /* Free unattached payload message blocks as well */
+ md_mp->b_cont = mp->b_cont;
+ goto free_mmd;
+ }
+
+ /* Advance fragment offset. */
+ offset += mlen;
+
+ /* Advance to location for next header in the buffer. */
+ hdr_ptr += hdr_chunk_len;
+
+ /* Did we reach the next payload message block? */
+ if (pld_ptr == mp->b_wptr && mp->b_cont != NULL) {
+ mp = mp->b_cont;
+ /*
+ * Attach the next message block with payload
+ * data to the multidata message.
+ */
+ if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0)
+ goto pbuf_panic;
+ pld_ptr = mp->b_rptr;
+ }
+ }
+
+ ASSERT(hdr_mp->b_wptr == hdr_ptr);
+ ASSERT(mp->b_wptr == pld_ptr);
+
+ /* Update IP statistics */
+ UPDATE_MIB(ill->ill_ip6_mib, ipv6OutFragCreates, pkts);
+ BUMP_MIB(ill->ill_ip6_mib, ipv6OutFragOKs);
+ IP6_STAT_UPDATE(ip6_frag_mdt_pkt_out, pkts);
+
+ ire->ire_ob_pkt_count += pkts;
+ if (ire->ire_ipif != NULL)
+ atomic_add_32(&ire->ire_ipif->ipif_ob_pkt_count, pkts);
+
+ ire->ire_last_used_time = lbolt;
+ /* Send it down */
+ putnext(stq, md_mp);
+ return;
+
+pbuf_panic:
+ cmn_err(CE_PANIC, "ip_wput_frag_mdt_v6: payload buffer logic "
+ "error for mmd %p pbuf %p (%d)", (void *)mmd, (void *)mp,
+ pbuf_idx);
+ /* NOTREACHED */
+}
+
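Because max_chunk is rounded down to a multiple of 8, the byte offset accumulated in the loop above always has its low three bits clear, so htons(offset) lands in the 13-bit fragment-offset field with the bottom bit left free for the M ("more fragments") flag, whose logical host-order value is 1 (IP6F_MORE_FRAG is its network-order form). An illustrative encoding, not the <netinet/ip6.h> definition:

#include <arpa/inet.h>
#include <stdint.h>

/*
 * Build the network-order ip6f_offlg word from an 8-byte-aligned
 * byte offset and the "more fragments" flag.
 */
static uint16_t
ipv6_frag_offlg(uint32_t byte_offset, int more_frags)
{
	/* e.g. ipv6_frag_offlg(2960, 1) == htons(0x0B91) */
	return (htons((uint16_t)(byte_offset | (more_frags ? 1 : 0))));
}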
+/*
* IPv6 fragmentation. Essentially the same as IPv4 fragmentation.
* We have not optimized this in terms of number of mblks
* allocated. For instance, for each fragment sent we always allocate a
@@ -10779,7 +11157,7 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src,
*/
void
ip_wput_frag_v6(mblk_t *mp, ire_t *ire, uint_t reachable, conn_t *connp,
- boolean_t caller, int max_frag)
+ int caller, int max_frag)
{
ip6_t *ip6h = (ip6_t *)mp->b_rptr;
ip6_t *fip6h;
@@ -10849,6 +11227,19 @@ ip_wput_frag_v6(mblk_t *mp, ire_t *ire, uint_t reachable, conn_t *connp,
}
unfragmentable_len = (uint_t)(ptr - (uint8_t *)ip6h);
+ max_chunk = (min(max_frag, ire->ire_max_frag) - unfragmentable_len -
+ sizeof (ip6_frag_t)) & ~7;
+
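For example (typical numbers, not from this patch): with an on-link MTU of 1500 and only the 40-byte base header unfragmentable, max_chunk = (1500 - 40 - 8) & ~7 = 1448 bytes of fragmentable payload per fragment, the 8 being sizeof (ip6_frag_t).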
+ /* Check if we can use MDT to send out the frags. */
+ ASSERT(!IRE_IS_LOCAL(ire));
+ if (ip_multidata_outbound && reachable == 0 &&
+ !(ire->ire_flags & RTF_MULTIRT) && ILL_MDT_CAPABLE(ill) &&
+ IP_CAN_FRAG_MDT(mp, unfragmentable_len, max_chunk)) {
+ ip_wput_frag_mdt_v6(mp, ire, max_chunk, unfragmentable_len,
+ nexthdr, prev_nexthdr_offset);
+ return;
+ }
+
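A minimal worked example of the max_chunk computation above (not from the source), assuming a 1500-byte path MTU and a 40-byte unfragmentable part (a plain IPv6 header with no extension headers):

    /*
     * sizeof (ip6_frag_t) is 8; the & ~7 mask rounds down to a
     * multiple of 8, because IPv6 fragment offsets are expressed
     * in 8-byte units.
     */
    max_chunk = (1500 - 40 - 8) & ~7;	/* 1452 & ~7 == 1448 */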
/*
* Allocate an mblk with enough room for the link-layer
* header, the unfragmentable part of the datagram, and the
@@ -10875,7 +11266,7 @@ ip_wput_frag_v6(mblk_t *mp, ire_t *ire, uint_t reachable, conn_t *connp,
fraghdr->ip6f_nxt = nexthdr;
fraghdr->ip6f_reserved = 0;
- fraghdr->ip6f_offlg = htons(0);
+ fraghdr->ip6f_offlg = 0;
fraghdr->ip6f_ident = htonl(ident);
/*
@@ -10886,9 +11277,6 @@ ip_wput_frag_v6(mblk_t *mp, ire_t *ire, uint_t reachable, conn_t *connp,
*/
len = ntohs(ip6h->ip6_plen) - (unfragmentable_len - IPV6_HDR_LEN);
- max_chunk = (min(max_frag, ire->ire_max_frag) - unfragmentable_len -
- sizeof (ip6_frag_t)) & ~7;
-
/*
* Move read ptr past unfragmentable portion, we don't want this part
* of the data in our fragments.
@@ -11117,7 +11505,9 @@ ip_xmit_v6(mblk_t *mp, ire_t *ire, uint_t flags, conn_t *connp,
}
}
- if (IP_FLOW_CONTROLLED_ULP(ip6h->ip6_nxt) || canput(stq->q_next)) {
+ /* Flow-control check has been done in ip_wput_ire_v6 */
+ if (IP_FLOW_CONTROLLED_ULP(ip6h->ip6_nxt) || caller == IP_WPUT ||
+ caller == IP_WSRV || canput(stq->q_next)) {
uint32_t ill_index;
/*
@@ -11164,7 +11554,7 @@ ip_xmit_v6(mblk_t *mp, ire_t *ire, uint_t flags, conn_t *connp,
ill = ire_to_ill(ire);
}
IRB_REFRELE(irb);
- } else if (connp != NULL && IS_TCP_CONN(connp) &&
+ } else if (connp != NULL && IPCL_IS_TCP(connp) &&
connp->conn_mdt_ok && !connp->conn_tcp->tcp_mdt &&
ILL_MDT_USABLE(ill)) {
/*
@@ -11583,7 +11973,7 @@ ip_xmit_v6(mblk_t *mp, ire_t *ire, uint_t flags, conn_t *connp,
(void) putbq(connp->conn_wq, mp);
conn_drain_insert(connp);
/*
- * called_from_wsrv implies we are
+ * caller == IP_WSRV implies we are
* the service thread, and the
* queue is already noenabled.
* The check for canput and
diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c
index 937e0d8b0d..fc793de53b 100644
--- a/usr/src/uts/common/inet/ip/ip_if.c
+++ b/usr/src/uts/common/inet/ip/ip_if.c
@@ -80,6 +80,7 @@
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ip_if.h>
+#include <inet/ip_impl.h>
#include <inet/tun.h>
#include <inet/sctp_ip.h>
@@ -1232,10 +1233,10 @@ ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp)
} else {
/*
* IP-MT XXX In the case of TLI/XTI bind / optmgmt this can't
- * be just ip_ioctl_freemsg. we have to restart it
+ * be just inet_freemsg. we have to restart it
* otherwise the thread will be stuck.
*/
- ip_ioctl_freemsg(mp);
+ inet_freemsg(mp);
}
return (B_TRUE);
}
@@ -1344,10 +1345,10 @@ ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp)
} else {
/*
* IP-MT XXX In the case of TLI/XTI bind / optmgmt
- * this can't be just ip_ioctl_freemsg. we have to
+ * this can't be just inet_freemsg. we have to
* restart it otherwise the thread will be stuck.
*/
- ip_ioctl_freemsg(curr);
+ inet_freemsg(curr);
}
}
}
@@ -1384,7 +1385,7 @@ conn_ioctl_cleanup(conn_t *connp)
if (curr != NULL) {
mutex_exit(&connp->conn_lock);
CONN_DEC_REF(connp);
- ip_ioctl_freemsg(curr);
+ inet_freemsg(curr);
return;
}
/*
@@ -2042,7 +2043,7 @@ ill_capability_mdt_reset(ill_t *ill, mblk_t **sc_mp)
dl_capability_sub_t *dl_subcap;
int size;
- if (!(ill->ill_capabilities & ILL_CAPAB_MDT))
+ if (!ILL_MDT_CAPABLE(ill))
return;
ASSERT(ill->ill_mdt_capab != NULL);
@@ -2857,6 +2858,9 @@ ill_capability_poll_capable(ill_t *ill, dl_capab_poll_t *ipoll,
bcopy((void *)&poll, (void *)opoll, sizeof (dl_capab_poll_t));
ASSERT(nmp->b_wptr == (nmp->b_rptr + size));
+ ip1dbg(("ill_capability_poll_capable: asking interface %s "
+ "to enable polling\n", ill->ill_name));
+
/* nmp points to a DL_CAPABILITY_REQ message to enable polling */
ill_dlpi_send(ill, nmp);
}
@@ -2944,6 +2948,8 @@ ill_capability_poll_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
ASSERT(ill->ill_poll_capab != NULL);
ill->ill_capabilities |= ILL_CAPAB_POLL;
}
+ ip1dbg(("ill_capability_poll_ack: interface %s "
+ "has enabled polling\n", ill->ill_name));
break;
}
}
@@ -3048,8 +3054,9 @@ ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
return;
}
-#define CURR_HCKSUM_CAPAB \
- (HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 | HCKSUM_IPHDRCKSUM)
+#define CURR_HCKSUM_CAPAB \
+ (HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 | \
+ HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM)
if ((ihck->hcksum_txflags & HCKSUM_ENABLE) &&
(ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) {
@@ -3126,10 +3133,11 @@ ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
* hardware checksum acceleration.
*/
ill_dlpi_send(ill, nmp);
- } else
+ } else {
ip1dbg(("ill_capability_hcksum_ack: interface %s has "
"advertised %x hardware checksum capability flags\n",
ill->ill_name, ihck->hcksum_txflags));
+ }
}
static void
@@ -3140,7 +3148,7 @@ ill_capability_hcksum_reset(ill_t *ill, mblk_t **sc_mp)
dl_capability_sub_t *dl_subcap;
int size;
- if (!(ill->ill_capabilities & ILL_CAPAB_HCKSUM))
+ if (!ILL_HCKSUM_CAPABLE(ill))
return;
ASSERT(ill->ill_hcksum_capab != NULL);
@@ -7300,7 +7308,7 @@ ipsq_flush(ill_t *ill)
ASSERT(mp_next == NULL);
ipsq->ipsq_mptail = prev;
}
- ip_ioctl_freemsg(mp);
+ inet_freemsg(mp);
} else {
prev = mp;
}
@@ -8838,7 +8846,7 @@ ip_sioctl_arp_common(ill_t *ill, queue_t *q, mblk_t *mp, sin_t *sin,
if (mp1 != NULL)
freeb(mp1);
if (pending_mp != NULL)
- ip_ioctl_freemsg(pending_mp);
+ inet_freemsg(pending_mp);
return (ENOMEM);
}
@@ -8848,7 +8856,7 @@ ip_sioctl_arp_common(ill_t *ill, queue_t *q, mblk_t *mp, sin_t *sin,
(caddr_t)&ipaddr);
if (mp2 == NULL) {
freeb(mp1);
- ip_ioctl_freemsg(pending_mp);
+ inet_freemsg(pending_mp);
return (ENOMEM);
}
/* Put together the chain. */
@@ -9743,7 +9751,7 @@ ip_sioctl_iocack(queue_t *q, mblk_t *mp)
pending_mp = ill_pending_mp_get(ill, &connp, ioc_id);
if (pending_mp == NULL) {
ASSERT(connp == NULL);
- ip_ioctl_freemsg(mp);
+ inet_freemsg(mp);
return;
}
ASSERT(connp != NULL);
@@ -9760,7 +9768,7 @@ ip_sioctl_iocack(queue_t *q, mblk_t *mp)
*/
orig_ioc_mp->b_cont->b_next = pending_mp->b_cont->b_next;
orig_ioc_mp->b_cont->b_prev = pending_mp->b_cont->b_prev;
- ip_ioctl_freemsg(pending_mp);
+ inet_freemsg(pending_mp);
/*
* We're done if there was an error or if this is not an SIOCG{X}ARP
@@ -18114,6 +18122,8 @@ ipif_mask_reply(ipif_t *ipif)
icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY;
bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN);
icmph->icmph_checksum = IP_CSUM(mp, sizeof (ipha_t), 0);
+ if (icmph->icmph_checksum == 0)
+ icmph->icmph_checksum = 0xffff;
put(ipif->ipif_wq, mp);
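The two added lines exploit the fact that, in one's-complement arithmetic, 0x0000 and 0xffff both represent zero; the nonzero encoding is transmitted so the checksum field never goes out on the wire as 0. A minimal standalone sketch of the pattern (cksum_finish is a hypothetical helper, not from the source):

    #include <stdint.h>

    /* Fold a 32-bit one's-complement accumulator; encode zero as 0xffff. */
    static uint16_t
    cksum_finish(uint32_t sum)
    {
    	sum = (sum & 0xffff) + (sum >> 16);	/* fold carries */
    	sum = (sum & 0xffff) + (sum >> 16);	/* fold again */
    	sum = ~sum & 0xffff;			/* one's complement */
    	return (sum == 0 ? 0xffff : (uint16_t)sum);
    }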
diff --git a/usr/src/uts/common/inet/ip/ip_multi.c b/usr/src/uts/common/inet/ip/ip_multi.c
index c19e886d0d..0c42de575d 100644
--- a/usr/src/uts/common/inet/ip/ip_multi.c
+++ b/usr/src/uts/common/inet/ip/ip_multi.c
@@ -65,6 +65,7 @@
#include <inet/ipsec_impl.h>
#include <inet/sctp_ip.h>
#include <inet/ip_listutils.h>
+#include <inet/udp_impl.h>
#include <netinet/igmp.h>
@@ -1186,14 +1187,39 @@ void
ip_multicast_loopback(queue_t *q, ill_t *ill, mblk_t *mp_orig, int fanout_flags,
zoneid_t zoneid)
{
- mblk_t *mp;
- mblk_t *ipsec_mp;
+ mblk_t *mp;
+ mblk_t *ipsec_mp;
+
+ if (DB_TYPE(mp_orig) == M_DATA &&
+ ((ipha_t *)mp_orig->b_rptr)->ipha_protocol == IPPROTO_UDP) {
+ uint_t hdrsz;
+
+ hdrsz = IPH_HDR_LENGTH((ipha_t *)mp_orig->b_rptr) +
+ sizeof (udpha_t);
+ ASSERT(MBLKL(mp_orig) >= hdrsz);
+
+ if (((mp = allocb(hdrsz, BPRI_MED)) != NULL) &&
+ (mp_orig = dupmsg(mp_orig)) != NULL) {
+ bcopy(mp_orig->b_rptr, mp->b_rptr, hdrsz);
+ mp->b_wptr += hdrsz;
+ mp->b_cont = mp_orig;
+ mp_orig->b_rptr += hdrsz;
+ if (MBLKL(mp_orig) == 0) {
+ mp->b_cont = mp_orig->b_cont;
+ mp_orig->b_cont = NULL;
+ freeb(mp_orig);
+ }
+ } else if (mp != NULL) {
+ freeb(mp);
+ mp = NULL;
+ }
+ } else {
+ mp = ip_copymsg(mp_orig);
+ }
- /* TODO this could use dup'ed messages except for the IP header. */
- mp = ip_copymsg(mp_orig);
if (mp == NULL)
return;
- if (mp->b_datap->db_type == M_CTL) {
+ if (DB_TYPE(mp) == M_CTL) {
ipsec_mp = mp;
mp = mp->b_cont;
} else {
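The rewritten UDP branch above avoids the full copy that ip_copymsg() would make: only the IP and UDP headers are copied into a private mblk, while the payload data blocks are shared via dupmsg(). A minimal sketch of the pattern, under the same assumption the code ASSERTs (the first mblk holds at least hdrsz bytes); hdr_copy_share is a hypothetical name, and the empty-leading-block trimming is elided:

    static mblk_t *
    hdr_copy_share(mblk_t *orig, uint_t hdrsz)
    {
    	mblk_t	*hmp, *dmp;

    	if ((hmp = allocb(hdrsz, BPRI_MED)) == NULL)
    		return (NULL);
    	if ((dmp = dupmsg(orig)) == NULL) {
    		freeb(hmp);
    		return (NULL);
    	}
    	bcopy(dmp->b_rptr, hmp->b_rptr, hdrsz);	/* private header copy */
    	hmp->b_wptr += hdrsz;
    	dmp->b_rptr += hdrsz;		/* shared payload view skips header */
    	hmp->b_cont = dmp;
    	return (hmp);
    }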
@@ -2553,7 +2579,7 @@ ip_extract_msfilter(queue_t *q, mblk_t *mp, ipif_t **ipifpp, ipsq_func_t func)
zoneid = connp->conn_zoneid;
/* don't allow multicast operations on a tcp conn */
- if (IS_TCP_CONN(connp))
+ if (IPCL_IS_TCP(connp))
return (ENOPROTOOPT);
if (cmd == SIOCSIPMSFILTER || cmd == SIOCGIPMSFILTER) {
diff --git a/usr/src/uts/common/inet/ip/ip_ndp.c b/usr/src/uts/common/inet/ip/ip_ndp.c
index ee9386e4af..948ccd4bc1 100644
--- a/usr/src/uts/common/inet/ip/ip_ndp.c
+++ b/usr/src/uts/common/inet/ip/ip_ndp.c
@@ -20,7 +20,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -144,7 +144,6 @@ static nce_t nce_nil;
mblk_t *mp;
mblk_t *template;
nce_t **ncep;
- int err = 0;
boolean_t dropped = B_FALSE;
ASSERT(MUTEX_HELD(&ndp_g_lock));
@@ -280,8 +279,15 @@ static nce_t nce_nil;
mutex_exit(&nce->nce_lock);
mutex_enter(&ndp_g_lock);
}
-done:
- return (err);
+ /*
+ * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then
+ * we call nce_fastpath as soon as the nce is resolved in ndp_process.
+	 * We call nce_fastpath from nce_update if the link-layer address
+	 * of the peer changes.
+ */
+ if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER)
+ nce_fastpath(nce);
+ return (0);
}
int
@@ -1028,7 +1034,6 @@ ndp_noresolver(ill_t *ill, const in6_addr_t *dst)
* Cache entry with a proper resolver cookie was
* created.
*/
- nce_fastpath(nce);
NCE_REFRELE(nce);
break;
case EEXIST:
@@ -1108,7 +1113,6 @@ nce_set_multicast(ill_t *ill, const in6_addr_t *dst)
ip1dbg(("nce_set_multicast: create failed" "%d\n", err));
return (err);
}
- nce_fastpath(nce);
NCE_REFRELE(nce);
return (0);
}
@@ -2168,8 +2172,7 @@ nce_set_ll(nce_t *nce, uchar_t *ll_addr)
ASSERT(ll_addr != NULL);
/* Always called before fast_path_probe */
- if (nce->nce_fp_mp != NULL)
- return;
+ ASSERT(nce->nce_fp_mp == NULL);
if (ill->ill_sap_length != 0) {
/*
* Copy the SAP type specified in the
@@ -2265,8 +2268,8 @@ nce_update(nce_t *nce, uint16_t new_state, uchar_t *new_ll_addr)
if (nce->nce_fp_mp != NULL) {
freemsg(nce->nce_fp_mp);
nce->nce_fp_mp = NULL;
- need_fastpath_update = B_TRUE;
}
+ need_fastpath_update = B_TRUE;
}
mutex_exit(&nce->nce_lock);
if (need_stop_timer) {
diff --git a/usr/src/uts/common/inet/ip/ipclassifier.c b/usr/src/uts/common/inet/ip/ipclassifier.c
index 2085f212ba..12907ba3b4 100644
--- a/usr/src/uts/common/inet/ip/ipclassifier.c
+++ b/usr/src/uts/common/inet/ip/ipclassifier.c
@@ -233,6 +233,7 @@ const char ipclassifier_version[] = "@(#)ipclassifier.c 1.6 04/03/31 SMI";
#include <inet/ip_rts.h>
#include <inet/optcom.h>
#include <inet/ip_ndp.h>
+#include <inet/udp_impl.h>
#include <inet/sctp_ip.h>
#include <sys/ethernet.h>
@@ -351,8 +352,7 @@ ipcl_init(void)
ipcl_conn_cache = kmem_cache_create("ipcl_conn_cache",
sizeof (conn_t), CACHE_ALIGN_SIZE,
- NULL, NULL,
- NULL, NULL, NULL, 0);
+ NULL, NULL, NULL, NULL, NULL, 0);
ipcl_tcpconn_cache = kmem_cache_create("ipcl_tcpconn_cache",
sizeof (itc_t), CACHE_ALIGN_SIZE,
@@ -501,17 +501,19 @@ ipcl_conn_create(uint32_t type, int sleep)
case IPCL_IPCCONN:
connp = kmem_cache_alloc(ipcl_conn_cache, sleep);
if (connp == NULL)
- return (connp);
+ return (NULL);
bzero(connp, sizeof (conn_t));
- mutex_init(&connp->conn_lock, NULL,
- MUTEX_DEFAULT, NULL);
+ mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL);
- connp->conn_flags |= IPCL_IPCCONN;
+ connp->conn_flags = IPCL_IPCCONN;
connp->conn_ref = 1;
IPCL_DEBUG_LVL(1,
("ipcl_conn_create: connp = %p\n", (void *)connp));
ipcl_globalhash_insert(connp);
break;
+ default:
+ connp = NULL;
+ ASSERT(0);
}
return (connp);
@@ -521,7 +523,6 @@ void
ipcl_conn_destroy(conn_t *connp)
{
mblk_t *mp;
- tcp_t *tcp = connp->conn_tcp;
ASSERT(!MUTEX_HELD(&connp->conn_lock));
ASSERT(connp->conn_ref == 0);
@@ -531,6 +532,8 @@ ipcl_conn_destroy(conn_t *connp)
cv_destroy(&connp->conn_cv);
if (connp->conn_flags & IPCL_TCPCONN) {
+ tcp_t *tcp = connp->conn_tcp;
+
mutex_destroy(&connp->conn_lock);
ASSERT(connp->conn_tcp != NULL);
tcp_free(tcp);
@@ -567,6 +570,7 @@ ipcl_conn_destroy(conn_t *connp)
} else if (connp->conn_flags & IPCL_SCTPCONN) {
sctp_free(connp);
} else {
+ ASSERT(connp->conn_udp == NULL);
mutex_destroy(&connp->conn_lock);
kmem_cache_free(ipcl_conn_cache, connp);
}
@@ -1863,6 +1867,57 @@ ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex,
return (NULL);
}
+/*
+ * ipcl_get_next_conn
+ *	Get the next entry in the conn global list and put a
+ *	reference on it; drop the reference on the current conn.
+ *
+ * This is an iterator-based walker function that also provides for
+ * some selection by the caller. It walks through the conn_hash bucket
+ * searching for the next valid connp in the list, and selects connections
+ * that are neither closed nor condemned. It also refholds the returned
+ * conn, ensuring that it exists while the caller uses it.
+ */
+conn_t *
+ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags)
+{
+ conn_t *next_connp;
+
+ if (connfp == NULL)
+ return (NULL);
+
+ mutex_enter(&connfp->connf_lock);
+
+ next_connp = (connp == NULL) ?
+ connfp->connf_head : connp->conn_g_next;
+
+ while (next_connp != NULL) {
+ mutex_enter(&next_connp->conn_lock);
+ if (!(next_connp->conn_flags & conn_flags) ||
+ (next_connp->conn_state_flags &
+ (CONN_CONDEMNED | CONN_INCIPIENT))) {
+ /*
+ * This conn has been condemned or
+ * is closing, or the flags don't match
+ */
+ mutex_exit(&next_connp->conn_lock);
+ next_connp = next_connp->conn_g_next;
+ continue;
+ }
+ CONN_INC_REF_LOCKED(next_connp);
+ mutex_exit(&next_connp->conn_lock);
+ break;
+ }
+
+ mutex_exit(&connfp->connf_lock);
+
+ if (connp != NULL)
+ CONN_DEC_REF(connp);
+
+ return (next_connp);
+}
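A typical walk over a bucket with this iterator looks like the following sketch; the IPCL_UDP selector is illustrative:

    conn_t	*connp = NULL;

    while ((connp = ipcl_get_next_conn(connfp, connp, IPCL_UDP)) != NULL) {
    	/*
    	 * connp is refheld here; the next call (or the NULL return
    	 * at the end of the bucket) drops that reference.
    	 */
    }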
+
#ifdef CONN_DEBUG
/*
* Trace of the last NBUF refhold/refrele
diff --git a/usr/src/uts/common/inet/ip/tun.c b/usr/src/uts/common/inet/ip/tun.c
index f0507908f5..85fd4b1ec9 100644
--- a/usr/src/uts/common/inet/ip/tun.c
+++ b/usr/src/uts/common/inet/ip/tun.c
@@ -20,7 +20,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -3693,6 +3693,8 @@ tun_icmp_message_v4(queue_t *q, ipha_t *ipha, icmph_t *icmp, mblk_t *mp)
*nicmp = *icmp;
nicmp->icmph_checksum = 0;
nicmp->icmph_checksum = IP_CSUM(send_mp, sizeof (ipha_t), 0);
+ if (nicmp->icmph_checksum == 0)
+ nicmp->icmph_checksum = 0xffff;
/* let ip know we are an icmp message */
atomic_add_64(&atp->tun_HCInOctets,
@@ -3757,6 +3759,8 @@ tun_icmp_message_v6(queue_t *q, ip6_t *ip6h, icmp6_t *icmp6, uint8_t hoplim,
up[12] + up[13] + up[14] + up[15];
sum = (sum & 0xffff) + (sum >> 16);
nicmp6->icmp6_cksum = IP_CSUM(send_mp, IPV6_HDR_LEN, sum);
+ if (nicmp6->icmp6_cksum == 0)
+ nicmp6->icmp6_cksum = 0xffff;
/* let ip know we are an icmp message */
atomic_add_64(&atp->tun_HCInOctets,
diff --git a/usr/src/uts/common/inet/ip6.h b/usr/src/uts/common/inet/ip6.h
index 5190bb4bf7..8283250d2a 100644
--- a/usr/src/uts/common/inet/ip6.h
+++ b/usr/src/uts/common/inet/ip6.h
@@ -370,8 +370,7 @@ extern boolean_t ip_hdr_length_nexthdr_v6(mblk_t *, ip6_t *,
uint16_t *, uint8_t **);
extern int ip_hdr_length_v6(mblk_t *, ip6_t *);
extern uint32_t ip_massage_options_v6(ip6_t *, ip6_rthdr_t *);
-extern void ip_wput_frag_v6(mblk_t *, ire_t *, uint_t, conn_t *,
- boolean_t, int);
+extern void ip_wput_frag_v6(mblk_t *, ire_t *, uint_t, conn_t *, int, int);
extern void ip_wput_ipsec_out_v6(queue_t *, mblk_t *, ip6_t *, ill_t *,
ire_t *);
extern int ip_total_hdrs_len_v6(ip6_pkt_t *);
diff --git a/usr/src/uts/common/inet/ip_impl.h b/usr/src/uts/common/inet/ip_impl.h
new file mode 100644
index 0000000000..f55bb7d6ce
--- /dev/null
+++ b/usr/src/uts/common/inet/ip_impl.h
@@ -0,0 +1,493 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _INET_IP_IMPL_H
+#define _INET_IP_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * IP implementation private declarations. These interfaces are
+ * used to build the IP module and are not meant to be accessed
+ * by any modules except IP itself. They are undocumented and are
+ * subject to change without notice.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+#define IP_MOD_ID 5701
+
+#ifdef _BIG_ENDIAN
+#define IP_HDR_CSUM_TTL_ADJUST 256
+#define IP_TCP_CSUM_COMP IPPROTO_TCP
+#define IP_UDP_CSUM_COMP IPPROTO_UDP
+#else
+#define IP_HDR_CSUM_TTL_ADJUST 1
+#define IP_TCP_CSUM_COMP (IPPROTO_TCP << 8)
+#define IP_UDP_CSUM_COMP (IPPROTO_UDP << 8)
+#endif
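The shifted forms exist because the checksum is computed over 16-bit words as they sit in memory: the pseudo-header protocol word for TCP is 00 06 in network byte order, and a little-endian load reads that back as 0x0600, i.e. IPPROTO_TCP << 8. A standalone sanity check of that reasoning (not from the source):

    #include <stdint.h>
    #include <string.h>

    int
    main(void)
    {
    	uint8_t		wire[2] = { 0x00, 0x06 };	/* network order */
    	uint16_t	word;

    	(void) memcpy(&word, wire, sizeof (word));
    	/* word == 0x0600 (6 << 8) little-endian, 0x0006 big-endian */
    	return (0);
    }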
+
+#define TCP_CHECKSUM_OFFSET 16
+#define TCP_CHECKSUM_SIZE 2
+
+#define UDP_CHECKSUM_OFFSET 6
+#define UDP_CHECKSUM_SIZE 2
+
+#define IPH_TCPH_CHECKSUMP(ipha, hlen) \
+ ((uint16_t *)(((uchar_t *)(ipha)) + ((hlen) + TCP_CHECKSUM_OFFSET)))
+
+#define IPH_UDPH_CHECKSUMP(ipha, hlen) \
+ ((uint16_t *)(((uchar_t *)(ipha)) + ((hlen) + UDP_CHECKSUM_OFFSET)))
+
+#define ILL_HCKSUM_CAPABLE(ill) \
+ (((ill)->ill_capabilities & ILL_CAPAB_HCKSUM) != 0)
+/*
+ * Macro that performs software checksum calculation on the IP header.
+ */
+#define IP_HDR_CKSUM(ipha, sum, v_hlen_tos_len, ttl_protocol) { \
+ (sum) += (ttl_protocol) + (ipha)->ipha_ident + \
+ ((v_hlen_tos_len) >> 16) + \
+ ((v_hlen_tos_len) & 0xFFFF) + \
+ (ipha)->ipha_fragment_offset_and_flags; \
+ (sum) = (((sum) & 0xFFFF) + ((sum) >> 16)); \
+ (sum) = ~((sum) + ((sum) >> 16)); \
+ (ipha)->ipha_hdr_checksum = (uint16_t)(sum); \
+}
+
+#define IS_IP_HDR_HWCKSUM(ipsec, mp, ill) \
+ ((!ipsec) && (DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) && \
+ ILL_HCKSUM_CAPABLE(ill) && dohwcksum)
+
+/*
+ * This macro acts as a wrapper around IP_CKSUM_XMIT_FAST, and it performs
+ * several checks on the IRE and ILL (among other things) in order to see
+ * whether or not hardware checksum offload is allowed for the outgoing
+ * packet. It assumes that the caller has held a reference to the IRE.
+ */
+#define IP_CKSUM_XMIT(ill, ire, mp, ihp, up, proto, start, end, \
+ max_frag, ipsec_len, pseudo) { \
+ uint32_t _hck_flags; \
+ /* \
+ * We offload checksum calculation to hardware when IPsec isn't \
+ * present and if fragmentation isn't required. We also check \
+ * if M_DATA fastpath is safe to be used on the corresponding \
+ * IRE; this check is performed without grabbing ire_lock but \
+ * instead by holding a reference to it. This is sufficient \
+ * for IRE_CACHE; for IRE_BROADCAST on non-Ethernet links, the \
+ * DL_NOTE_FASTPATH_FLUSH indication could come up from the \
+ * driver and trigger the IRE (hence fp_mp) deletion. This is \
+ * why only IRE_CACHE type is eligible for offload. \
+ * \
+ * The presense of IP options also forces the network stack to \
+ * calculate the checksum in software. This is because: \
+ * \
+ * Wrap around: certain partial-checksum NICs (eri, ce) limit \
+ * the size of "start offset" width to 6-bit. This effectively \
+ * sets the largest value of the offset to 64-bytes, starting \
+ * from the MAC header. When the cumulative MAC and IP headers \
+ * exceed such limit, the offset will wrap around. This causes \
+ * the checksum to be calculated at the wrong place. \
+ * \
+ * IPv4 source routing: none of the full-checksum capable NICs \
+ * is capable of correctly handling the IPv4 source-routing \
+ * option for purposes of calculating the pseudo-header; the \
+ * actual destination is different from the destination in the \
+ * header which is that of the next-hop. (This case may not be \
+ * true for NICs which can parse IPv6 extension headers, but \
+ * we choose to simplify the implementation by not offloading \
+ * checksum when they are present.) \
+ * \
+ */ \
+ if ((ill) != NULL && ILL_HCKSUM_CAPABLE(ill) && \
+ !((ire)->ire_flags & RTF_MULTIRT) && \
+ (!((ire)->ire_type & (IRE_BROADCAST|IRE_MIPRTUN)) || \
+ (ill)->ill_type == IFT_ETHER) && \
+ (ipsec_len) == 0 && \
+ (((ire)->ire_ipversion == IPV4_VERSION && \
+ (start) == IP_SIMPLE_HDR_LENGTH && \
+ (ire)->ire_fp_mp != NULL && \
+ MBLKHEAD(mp) >= MBLKL((ire)->ire_fp_mp)) || \
+ ((ire)->ire_ipversion == IPV6_VERSION && \
+ (start) == IPV6_HDR_LEN && \
+ (ire)->ire_nce->nce_fp_mp != NULL && \
+ MBLKHEAD(mp) >= MBLKL((ire)->ire_nce->nce_fp_mp))) && \
+ (max_frag) >= (uint_t)((end) + (ipsec_len)) && \
+ dohwcksum) { \
+ _hck_flags = (ill)->ill_hcksum_capab->ill_hcksum_txflags; \
+ } else { \
+ _hck_flags = 0; \
+ } \
+ IP_CKSUM_XMIT_FAST((ire)->ire_ipversion, _hck_flags, mp, ihp, \
+ up, proto, start, end, pseudo); \
+}
+
+/*
+ * Based on the device capabilities, this macro either marks an outgoing
+ * packet with hardware checksum offload information or calculates the
+ * checksum in software.  If the latter is performed, the checksum field
+ * of the dblk is cleared; otherwise it will be non-zero and contain the
+ * necessary flag(s) for the driver.
+ */
+#define IP_CKSUM_XMIT_FAST(ipver, hck_flags, mp, ihp, up, proto, start, \
+ end, pseudo) { \
+ uint32_t _sum; \
+ /* \
+ * Underlying interface supports hardware checksum offload for \
+ * the payload; leave the payload checksum for the hardware to \
+ * calculate. N.B: We only need to set up checksum info on the \
+ * first mblk. \
+ */ \
+ DB_CKSUMFLAGS(mp) = 0; \
+ if (((ipver) == IPV4_VERSION && \
+ ((hck_flags) & HCKSUM_INET_FULL_V4)) || \
+ ((ipver) == IPV6_VERSION && \
+ ((hck_flags) & HCKSUM_INET_FULL_V6))) { \
+ /* \
+ * Hardware calculates pseudo-header, header and the \
+ * payload checksums, so clear the checksum field in \
+ * the protocol header. \
+ */ \
+ *(up) = 0; \
+ DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM; \
+ } else if ((hck_flags) & HCKSUM_INET_PARTIAL) { \
+ /* \
+ * Partial checksum offload has been enabled. Fill \
+		 * the checksum field in the protocol header with the	\
+ * pseudo-header checksum value. \
+ */ \
+ _sum = ((proto) == IPPROTO_UDP) ? \
+ IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP; \
+ _sum += *(up) + (pseudo); \
+ _sum = (_sum & 0xFFFF) + (_sum >> 16); \
+ *(up) = (_sum & 0xFFFF) + (_sum >> 16); \
+ /* \
+ * Offsets are relative to beginning of IP header. \
+ */ \
+ DB_CKSUMSTART(mp) = (start); \
+ DB_CKSUMSTUFF(mp) = ((proto) == IPPROTO_UDP) ? \
+ (start) + UDP_CHECKSUM_OFFSET : \
+ (start) + TCP_CHECKSUM_OFFSET; \
+ DB_CKSUMEND(mp) = (end); \
+ DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM; \
+ } else { \
+ /* \
+ * Software checksumming. \
+ */ \
+ _sum = ((proto) == IPPROTO_UDP) ? \
+ IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP; \
+ _sum += (pseudo); \
+ _sum = IP_CSUM(mp, start, _sum); \
+ *(up) = (uint16_t)(_sum ? _sum : ~_sum); \
+ } \
+ /* \
+ * Hardware supports IP header checksum offload; clear the \
+ * contents of IP header checksum field as expected by NIC. \
+ * Do this only if we offloaded either full or partial sum. \
+ */ \
+ if ((ipver) == IPV4_VERSION && DB_CKSUMFLAGS(mp) != 0 && \
+ ((hck_flags) & HCKSUM_IPHDRCKSUM)) { \
+ DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM; \
+ ((ipha_t *)(ihp))->ipha_hdr_checksum = 0; \
+ } \
+}
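For the HCKSUM_INET_PARTIAL case above, the host seeds the transport checksum field with the pseudo-header sum so the NIC only has to sum the packet bytes from start to end. A sketch of that seed for IPv4, assuming the addresses and length are already presented as host-order 16-bit word pairs (pseudo_sum is a hypothetical helper, not from the source):

    #include <stdint.h>

    static uint16_t
    pseudo_sum(uint32_t src, uint32_t dst, uint16_t proto_word, uint16_t len)
    {
    	uint32_t	sum;

    	sum = (src >> 16) + (src & 0xFFFF);	/* source address words */
    	sum += (dst >> 16) + (dst & 0xFFFF);	/* destination address words */
    	sum += proto_word + len;		/* protocol and length */
    	sum = (sum & 0xFFFF) + (sum >> 16);	/* first fold */
    	return ((uint16_t)((sum & 0xFFFF) + (sum >> 16)));	/* -> *up */
    }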
+
+/*
+ * Macro to inspect the checksum of a fully-reassembled incoming datagram.
+ */
+#define IP_CKSUM_RECV_REASS(hck_flags, off, pseudo, sum, err) { \
+ (err) = B_FALSE; \
+ if ((hck_flags) & HCK_FULLCKSUM) { \
+ /* \
+		 * The sum of all fragment checksums should	\
+		 * result in -0 (0xFFFF); anything else is	\
+		 * invalid.					\
+ */ \
+ if ((sum) != 0xFFFF) \
+ (err) = B_TRUE; \
+ } else if ((hck_flags) & HCK_PARTIALCKSUM) { \
+ (sum) += (pseudo); \
+ (sum) = ((sum) & 0xFFFF) + ((sum) >> 16); \
+ (sum) = ((sum) & 0xFFFF) + ((sum) >> 16); \
+ if (~(sum) & 0xFFFF) \
+ (err) = B_TRUE; \
+ } else if (((sum) = IP_CSUM(mp, off, pseudo)) != 0) { \
+ (err) = B_TRUE; \
+ } \
+}
+
+/*
+ * This macro inspects an incoming packet to see if the checksum value
+ * contained in it is valid; if the hardware has provided the information,
+ * the value is verified, otherwise it performs software checksumming.
+ * The checksum value is returned to the caller.
+ */
+#define IP_CKSUM_RECV(hck_flags, sum, cksum_start, ulph_off, mp, mp1, err) { \
+ int32_t _len; \
+ \
+ (err) = B_FALSE; \
+ if ((hck_flags) & HCK_FULLCKSUM) { \
+ /* \
+ * Full checksum has been computed by the hardware \
+ * and has been attached. If the driver wants us to \
+ * verify the correctness of the attached value, in \
+ * order to protect against faulty hardware, compare \
+ * it against -0 (0xFFFF) to see if it's valid. \
+ */ \
+ (sum) = DB_CKSUM16(mp); \
+ if (!((hck_flags) & HCK_FULLCKSUM_OK) && (sum) != 0xFFFF) \
+ (err) = B_TRUE; \
+ } else if (((hck_flags) & HCK_PARTIALCKSUM) && \
+ ((mp1) == NULL || (mp1)->b_cont == NULL) && \
+ (ulph_off) >= DB_CKSUMSTART(mp) && \
+ ((_len = (ulph_off) - DB_CKSUMSTART(mp)) & 1) == 0) { \
+ uint32_t _adj; \
+ /* \
+ * Partial checksum has been calculated by hardware \
+ * and attached to the packet; in addition, any \
+ * prepended extraneous data is even byte aligned, \
+ * and there are at most two mblks associated with \
+ * the packet. If any such data exists, we adjust \
+		 * the checksum; we also take care of any postpended data. \
+ */ \
+ IP_ADJCKSUM_PARTIAL(cksum_start, mp, mp1, _len, _adj); \
+ /* \
+ * One's complement subtract extraneous checksum \
+ */ \
+ (sum) += DB_CKSUM16(mp); \
+ if (_adj >= (sum)) \
+ (sum) = ~(_adj - (sum)) & 0xFFFF; \
+ else \
+ (sum) -= _adj; \
+ (sum) = ((sum) & 0xFFFF) + ((int)(sum) >> 16); \
+ (sum) = ((sum) & 0xFFFF) + ((int)(sum) >> 16); \
+ if (~(sum) & 0xFFFF) \
+ (err) = B_TRUE; \
+ } else if (((sum) = IP_CSUM(mp, ulph_off, sum)) != 0) { \
+ (err) = B_TRUE; \
+ } \
+}
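The adjustment branch above removes the extraneous bytes' contribution using one's-complement subtraction; a standalone sketch of just that step (ocsub is a hypothetical name):

    #include <stdint.h>

    /* One's-complement subtract adj from sum; both hold 16-bit values. */
    static uint32_t
    ocsub(uint32_t sum, uint32_t adj)
    {
    	return ((adj >= sum) ? (~(adj - sum) & 0xFFFF) : (sum - adj));
    }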
+
+/*
+ * Macro to adjust a given checksum value depending on any prepended
+ * or postpended data on the packet. It expects the start offset to
+ * begin at an even boundary and that the packet consists of at most
+ * two mblks.
+ */
+#define IP_ADJCKSUM_PARTIAL(cksum_start, mp, mp1, len, adj) { \
+ /* \
+ * Prepended extraneous data; adjust checksum. \
+ */ \
+ if ((len) > 0) \
+ (adj) = IP_BCSUM_PARTIAL(cksum_start, len, 0); \
+ else \
+ (adj) = 0; \
+ /* \
+ * len is now the total length of mblk(s) \
+ */ \
+ (len) = MBLKL(mp); \
+ if ((mp1) == NULL) \
+ (mp1) = (mp); \
+ else \
+ (len) += MBLKL(mp1); \
+ /* \
+ * Postpended extraneous data; adjust checksum. \
+ */ \
+ if (((len) = (DB_CKSUMEND(mp) - len)) > 0) { \
+ uint32_t _pad; \
+ \
+ _pad = IP_BCSUM_PARTIAL((mp1)->b_wptr, len, 0); \
+ /* \
+ * If the postpended extraneous data was odd \
+ * byte aligned, swap resulting checksum bytes. \
+ */ \
+ if ((uintptr_t)(mp1)->b_wptr & 1) \
+ (adj) += ((_pad << 8) & 0xFFFF) | (_pad >> 8); \
+ else \
+ (adj) += _pad; \
+ (adj) = ((adj) & 0xFFFF) + ((int)(adj) >> 16); \
+ } \
+}
+
+#define ILL_MDT_CAPABLE(ill) \
+ (((ill)->ill_capabilities & ILL_CAPAB_MDT) != 0)
+
+/*
+ * ioctl identifier and structure for Multidata Transmit update
+ * private M_CTL communication from IP to ULP.
+ */
+#define MDT_IOC_INFO_UPDATE (('M' << 8) + 1020)
+
+typedef struct ip_mdt_info_s {
+ uint_t mdt_info_id; /* MDT_IOC_INFO_UPDATE */
+ ill_mdt_capab_t mdt_capab; /* ILL MDT capabilities */
+} ip_mdt_info_t;
+
+/*
+ * Macro that determines whether or not a given ILL is allowed for MDT.
+ */
+#define ILL_MDT_USABLE(ill) \
+ (ILL_MDT_CAPABLE(ill) && \
+ ill->ill_mdt_capab != NULL && \
+ ill->ill_mdt_capab->ill_mdt_version == MDT_VERSION_2 && \
+ ill->ill_mdt_capab->ill_mdt_on != 0)
+
+/*
+ * Macro that determines whether or not a given CONN may be considered
+ * for fast path prior to proceeding further with Multidata.
+ */
+#define CONN_IS_MD_FASTPATH(connp) \
+ ((connp)->conn_dontroute == 0 && /* SO_DONTROUTE */ \
+ (connp)->conn_nofailover_ill == NULL && /* IPIF_NOFAILOVER */ \
+ (connp)->conn_xmit_if_ill == NULL && /* IP_XMIT_IF */ \
+ (connp)->conn_outgoing_pill == NULL && /* IP{V6}_BOUND_PIF */ \
+ (connp)->conn_outgoing_ill == NULL) /* IP{V6}_BOUND_IF */
+
+/* Definitions for fragmenting IP packets using MDT. */
+
+/*
+ * Smaller and private version of pdescinfo_t used specifically for IP,
+ * which allows for only a single payload span per packet.
+ */
+typedef struct ip_pdescinfo_s PDESCINFO_STRUCT(2) ip_pdescinfo_t;
+
+/*
+ * Macro version of ip_can_frag_mdt() which avoids the function call if we
+ * only examine a single message block.
+ */
+#define IP_CAN_FRAG_MDT(mp, hdr_len, len) \
+ (((mp)->b_cont == NULL) ? \
+ (MBLKL(mp) >= ((hdr_len) + ip_wput_frag_mdt_min)) : \
+ ip_can_frag_mdt((mp), (hdr_len), (len)))
+
+/*
+ * Macro that determines whether or not a given IPC requires
+ * outbound IPSEC processing.
+ */
+#define CONN_IPSEC_OUT_ENCAPSULATED(connp) \
+ ((connp)->conn_out_enforce_policy || \
+ ((connp)->conn_latch != NULL && \
+ (connp)->conn_latch->ipl_out_policy != NULL))
+
+/*
+ * These are used by the synchronous streams code in tcp and udp.
+ */
+#define STR_WAKEUP_CLEAR(stp) { \
+ mutex_enter(&stp->sd_lock); \
+ stp->sd_wakeq &= ~RSLEEP; \
+ mutex_exit(&stp->sd_lock); \
+}
+
+#define STR_WAKEUP_SET(stp) { \
+ mutex_enter(&stp->sd_lock); \
+ if (stp->sd_flag & RSLEEP) { \
+ stp->sd_flag &= ~RSLEEP; \
+ cv_broadcast(&_RD(stp->sd_wrq)->q_wait); \
+ } else { \
+ stp->sd_wakeq |= RSLEEP; \
+ } \
+ mutex_exit(&stp->sd_lock); \
+}
+
+#define STR_SENDSIG(stp) { \
+ int _events; \
+ mutex_enter(&stp->sd_lock); \
+ if ((_events = stp->sd_sigflags & (S_INPUT | S_RDNORM)) != 0) \
+ strsendsig(stp->sd_siglist, _events, 0, 0); \
+ if (stp->sd_rput_opt & SR_POLLIN) { \
+ stp->sd_rput_opt &= ~SR_POLLIN; \
+ mutex_exit(&stp->sd_lock); \
+ pollwakeup(&stp->sd_pollist, POLLIN | POLLRDNORM); \
+ } else { \
+ mutex_exit(&stp->sd_lock); \
+ } \
+}
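These helpers are meant to be used together on the synchronous-streams enqueue path; a sketch of the expected calling sequence, with the enqueue step named hypothetically:

    enqueue_data(connp, mp);	/* hypothetical: stash mp for rwnext/rrw */
    STR_WAKEUP_SET(stp);	/* wake any thread sleeping in read() */
    STR_SENDSIG(stp);		/* SIGPOLL/pollwakeup() if armed */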
+
+#define CONN_UDP_SYNCSTR(connp) \
+ (IPCL_IS_UDP(connp) && (connp)->conn_udp->udp_direct_sockfs)
+
+/*
+ * Macro that checks whether or not a particular UDP conn is
+ * flow-controlled on the read side.  If the udp module is directly
+ * above ip, check to see if the drain queue is full; note that
+ * we check this without any lock protection because this is a
+ * coarse-granularity inbound flow-control.  If the module
+ * above ip is not udp, then use canputnext to determine the
+ * flow-control.
+ *
+ * Note that these checks are done after the conn is found in
+ * the UDP fanout table. A UDP conn in that table may have its
+ * IPCL_UDP bit cleared from the conn_flags when the application
+ * pops the udp module without issuing an unbind; in this case
+ * IP will still receive packets for the conn and deliver it
+ * upstream via putnext. This is the reason why we have to test
+ * against IPCL_UDP.
+ */
+#define CONN_UDP_FLOWCTLD(connp) \
+ ((CONN_UDP_SYNCSTR(connp) && \
+ (connp)->conn_udp->udp_drain_qfull) || \
+ (!CONN_UDP_SYNCSTR(connp) && !canputnext((connp)->conn_rq)))
+
+/*
+ * Macro that delivers a given message upstream; if udp module
+ * is directly above ip, the message is passed directly into
+ * the stream-less entry point. Otherwise putnext is used.
+ */
+#define CONN_UDP_RECV(connp, mp) { \
+ if (IPCL_IS_UDP(connp)) \
+ udp_conn_recv(connp, mp); \
+ else \
+ putnext((connp)->conn_rq, mp); \
+}
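Taken together, an inbound fanout path guarded by these macros has roughly this shape (a sketch, with error paths elided):

    if (CONN_UDP_FLOWCTLD(connp)) {
    	freemsg(mp);			/* read side is flow-controlled */
    } else {
    	CONN_UDP_RECV(connp, mp);	/* direct call into udp, or putnext */
    }
    CONN_DEC_REF(connp);		/* drop the classifier's hold */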
+
+#define ILL_POLL_CAPABLE(ill) \
+ (((ill)->ill_capabilities & ILL_CAPAB_POLL) != 0)
+
+/*
+ * Macro that hands off one or more messages directly to DLD
+ * when the interface is marked with ILL_CAPAB_POLL.
+ */
+#define IP_POLL_ILL_TX(ill, mp) { \
+ ill_poll_capab_t *ill_poll = ill->ill_poll_capab; \
+ ASSERT(ILL_POLL_CAPABLE(ill)); \
+ ASSERT(ill_poll != NULL); \
+ ASSERT(ill_poll->ill_tx != NULL); \
+ ASSERT(ill_poll->ill_tx_handle != NULL); \
+ ill_poll->ill_tx(ill_poll->ill_tx_handle, mp); \
+}
+
+extern int ip_wput_frag_mdt_min;
+extern boolean_t ip_can_frag_mdt(mblk_t *, ssize_t, ssize_t);
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _INET_IP_IMPL_H */
diff --git a/usr/src/uts/common/inet/ipclassifier.h b/usr/src/uts/common/inet/ipclassifier.h
index 78de4a0b86..a5148c57c0 100644
--- a/usr/src/uts/common/inet/ipclassifier.h
+++ b/usr/src/uts/common/inet/ipclassifier.h
@@ -37,6 +37,7 @@ extern "C" {
#include <inet/ip.h>
#include <inet/mi.h>
#include <inet/tcp.h>
+#include <inet/udp_impl.h>
#include <inet/ip6.h>
#include <netinet/in.h> /* for IPPROTO_* constants */
#include <sys/sdt.h>
@@ -58,17 +59,19 @@ typedef void (*edesc_rpf)(void *, mblk_t *, void *);
*/
/* Conn Flags */
-#define IPCL_BOUND 0x80000000 /* Conn in bind table */
-#define IPCL_CONNECTED 0x40000000 /* Conn in connected table */
-#define IPCL_TCP4 0x08000000 /* A TCP connection */
-#define IPCL_TCP6 0x04000000 /* A TCP6 connection */
-#define IPCL_EAGER 0x01000000 /* Incoming connection */
-#define IPCL_CL_LISTENER 0x00800000 /* Cluster listener */
-#define IPCL_ACCEPTOR 0x00400000 /* Sockfs priv acceptor */
-#define IPCL_SOCKET 0x00200000 /* Sockfs connection */
-#define IPCL_CHECK_POLICY 0x00100000 /* Needs policy checking */
+#define IPCL_UDPMOD 0x00020000 /* Is UDP module instance */
+#define IPCL_TCPMOD 0x00040000 /* Is TCP module instance */
#define IPCL_FULLY_BOUND 0x00080000 /* Bound to correct squeue */
-#define IPCL_TCPMOD 0x00040000 /* Is tcp module instance */
+#define IPCL_CHECK_POLICY 0x00100000 /* Needs policy checking */
+#define IPCL_SOCKET 0x00200000 /* Sockfs connection */
+#define IPCL_ACCEPTOR 0x00400000 /* Sockfs priv acceptor */
+#define IPCL_CL_LISTENER 0x00800000 /* Cluster listener */
+#define IPCL_EAGER 0x01000000 /* Incoming connection */
+#define IPCL_UDP 0x02000000 /* A UDP connection */
+#define IPCL_TCP6 0x04000000 /* A TCP6 connection */
+#define IPCL_TCP4 0x08000000 /* A TCP connection */
+#define IPCL_CONNECTED 0x40000000 /* Conn in connected table */
+#define IPCL_BOUND 0x80000000 /* Conn in bind table */
/* Flags identifying the type of conn */
#define IPCL_TCPCONN 0x00000001 /* Flag to indicate cache */
@@ -81,8 +84,6 @@ typedef void (*edesc_rpf)(void *, mblk_t *, void *);
#define IPCL_REMOVED 0x00000020
#define IPCL_REUSED 0x00000040
-#define IS_TCP_CONN(connp) (((connp)->conn_flags & IPCL_TCP) != 0)
-
#define IPCL_IS_TCP4(connp) \
(((connp)->conn_flags & IPCL_TCP4))
@@ -108,6 +109,13 @@ typedef void (*edesc_rpf)(void *, mblk_t *, void *);
#define IPCL_IS_TCP(connp) \
((connp)->conn_flags & (IPCL_TCP4|IPCL_TCP6))
+/*
+ * IPCL_UDP is set on the conn when udp is directly above ip;
+ * this flag is cleared the moment udp is popped.
+ */
+#define IPCL_IS_UDP(connp) \
+ ((connp)->conn_flags & IPCL_UDP)
+
#define IPCL_IS_IPTUN(connp) \
((connp)->conn_ulp == IPPROTO_ENCAP || \
(connp)->conn_ulp == IPPROTO_IPV6)
@@ -169,6 +177,8 @@ struct conn_s {
pad_to_bit_31 : 2;
tcp_t *conn_tcp; /* Pointer to the tcp struct */
+ udp_t *conn_udp; /* Pointer to the udp struct */
+
squeue_t *conn_sqp; /* Squeue for processing */
edesc_rpf conn_recv; /* Pointer to recv routine */
void *conn_pad1;
@@ -483,6 +493,7 @@ extern int ipcl_conn_insert(conn_t *, uint8_t, ipaddr_t, ipaddr_t,
uint32_t);
extern int ipcl_conn_insert_v6(conn_t *, uint8_t, const in6_addr_t *,
const in6_addr_t *, uint32_t, uint_t);
+extern conn_t *ipcl_get_next_conn(connf_t *, conn_t *, uint32_t);
void ipcl_proto_insert(conn_t *, uint8_t);
void ipcl_proto_insert_v6(conn_t *, uint8_t);
diff --git a/usr/src/uts/common/inet/ipp_common.h b/usr/src/uts/common/inet/ipp_common.h
index fff5a4ba7f..5703f29d48 100644
--- a/usr/src/uts/common/inet/ipp_common.h
+++ b/usr/src/uts/common/inet/ipp_common.h
@@ -20,7 +20,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2002, 2003 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -52,7 +52,7 @@ extern uint32_t ipp_action_count;
/* Apply IPQoS policies for inbound traffic? */
#define IP6_IN_IPP(flags) (IPP_ENABLED(IPP_LOCAL_IN) && \
- (!((flags) & (IP6_NO_IPPOLICY|IP6_IN_NOCKSUM))))
+ (!((flags) & IP6_NO_IPPOLICY)))
 /* Apply IPQoS policies for outbound traffic? */
#define IP6_OUT_IPP(flags) \
diff --git a/usr/src/uts/common/inet/led.h b/usr/src/uts/common/inet/led.h
index 463c8acb70..1e7ba80cff 100644
--- a/usr/src/uts/common/inet/led.h
+++ b/usr/src/uts/common/inet/led.h
@@ -20,7 +20,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -44,12 +44,12 @@ extern "C" {
#include <sys/types.h>
/*
- * Intel x86 can handle unaligned access. However, the checksum routine
+ * x86 can handle unaligned access. However, the checksum routine
* assumes that the source is 16 bit aligned so we always make sure
* that packet headers are 16 bit aligned.
*/
#define OK_16PTR(p) (!((uintptr_t)(p) & 0x1))
-#if defined(__i386)
+#if defined(__x86)
#define OK_32PTR(p) OK_16PTR(p)
#else
#define OK_32PTR(p) (!((uintptr_t)(p) & 0x3))
diff --git a/usr/src/uts/common/inet/optcom.c b/usr/src/uts/common/inet/optcom.c
index af4b08b0e5..8e4ce9358a 100644
--- a/usr/src/uts/common/inet/optcom.c
+++ b/usr/src/uts/common/inet/optcom.c
@@ -20,7 +20,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -82,8 +82,6 @@ static boolean_t opt_length_ok(opdes_t *, struct T_opthdr *);
static t_uscalar_t optcom_max_optbuf_len(opdes_t *, uint_t);
static boolean_t opt_bloated_maxsize(opdes_t *);
-extern optdb_obj_t tcp_opt_obj;
-
/* Common code for sending back a T_ERROR_ACK. */
void
optcom_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
@@ -220,9 +218,12 @@ svr4_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp)
opdes_t *optd;
boolean_t pass_to_next = B_FALSE;
boolean_t pass_to_ip = B_FALSE;
+ boolean_t is_tcp;
struct T_optmgmt_ack *toa;
struct T_optmgmt_req *tor;
+ is_tcp = (dbobjp == &tcp_opt_obj);
+
/*
* Allocate M_CTL and prepend to the packet for restarting this
* option if needed. IP may need to queue and restart the option
@@ -550,14 +551,14 @@ no_mem:;
opt1->len = opt->len;
bcopy(&opt[1], &opt1[1], opt->len);
/*
- * Pass the option down to IP only if
- * TCP hasn't processed it.
+ * Pass the option down to IP only
+ * if TCP hasn't processed it.
*/
- if (dbobjp == &tcp_opt_obj)
+ if (is_tcp)
pass_to_ip = B_TRUE;
- }
- else
+ } else {
opt1->len = (t_uscalar_t)len;
+ }
opt1 = (struct opthdr *)((uchar_t *)&opt1[1] +
_TPI_ALIGN_OPT(opt1->len));
} /* end for loop */
@@ -639,10 +640,10 @@ restart:
optcom_err_ack(q, mp, TSYSERR, error);
freeb(first_mp);
return (0);
- } else if (error < 0 && dbobjp == &tcp_opt_obj) {
+ } else if (error < 0 && is_tcp) {
/*
- * Pass the option down to IP only if
- * TCP hasn't processed it.
+ * Pass the option down to IP only
+ * if TCP hasn't processed it.
*/
pass_to_ip = B_TRUE;
}
diff --git a/usr/src/uts/common/inet/optcom.h b/usr/src/uts/common/inet/optcom.h
index 8f9226de18..84a64c5317 100644
--- a/usr/src/uts/common/inet/optcom.h
+++ b/usr/src/uts/common/inet/optcom.h
@@ -20,7 +20,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -205,6 +205,18 @@ typedef struct opt_restart_s {
#define SETFN_CONN_NEGOTIATE 4 /* semantics for T_CONN_*_REQ */
/*
+ * Objects representing the option databases passed to the
+ * {sock,tpi}optcom_req() interface routines, which take care of
+ * option management and associated methods.
+ */
+extern optdb_obj_t tcp_opt_obj;
+extern optdb_obj_t udp_opt_obj;
+extern optdb_obj_t ip_opt_obj;
+
+extern uint_t tcp_max_optsize;
+extern uint_t udp_max_optsize;
+
+/*
* Function prototypes
*/
extern void optcom_err_ack(queue_t *, mblk_t *, t_scalar_t, int);
diff --git a/usr/src/uts/common/inet/snmpcom.c b/usr/src/uts/common/inet/snmpcom.c
index 852fb167b9..fa417fae88 100644
--- a/usr/src/uts/common/inet/snmpcom.c
+++ b/usr/src/uts/common/inet/snmpcom.c
@@ -20,7 +20,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 1992,1997-2003 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -51,6 +51,11 @@
#include <inet/optcom.h>
#include <inet/snmpcom.h>
+#include <inet/ip.h>
+#include <inet/ip6.h>
+#include <inet/tcp.h>
+#include <inet/udp_impl.h>
+
#define DEFAULT_LENGTH sizeof (long)
#define DATA_MBLK_SIZE 1024
#define TOAHDR_SIZE (sizeof (struct T_optmgmt_ack) +\
@@ -90,10 +95,7 @@ static sor_t req_arr[] = {
* ctl buffer.
*/
int
-snmp_append_data(mpdata, blob, len)
- mblk_t *mpdata;
- char *blob;
- int len;
+snmp_append_data(mblk_t *mpdata, char *blob, int len)
{
if (!mpdata)
@@ -169,12 +171,7 @@ snmp_append_data2(mblk_t *mpdata, mblk_t **last_mpp, char *blob, int len)
* for them: getfn() returns 0, setfn() returns 1.
*/
boolean_t
-snmpcom_req(q, mp, setfn, getfn, credp)
- queue_t *q;
- mblk_t *mp;
- pfi_t setfn;
- pfi_t getfn;
- cred_t *credp;
+snmpcom_req(queue_t *q, mblk_t *mp, pfi_t setfn, pfi_t getfn, cred_t *credp)
{
mblk_t *mpctl;
struct opthdr *req;
@@ -184,6 +181,7 @@ snmpcom_req(q, mp, setfn, getfn, credp)
sor_t *sreq;
struct T_optmgmt_req *tor = (struct T_optmgmt_req *)mp->b_rptr;
struct T_optmgmt_ack *toa;
+ boolean_t pass_to_ip = B_FALSE;
if (mp->b_cont) { /* don't deal with multiple mblk's */
freemsg(mp->b_cont);
@@ -209,6 +207,10 @@ snmpcom_req(q, mp, setfn, getfn, credp)
req_start->level <= EXPER_RANGE_END)))
return (B_FALSE);
+ if (setfn == tcp_snmp_set || setfn == udp_snmp_set ||
+ getfn == tcp_snmp_get || getfn == udp_snmp_get)
+ pass_to_ip = B_TRUE;
+
switch (tor->MGMT_flags) {
case T_NEGOTIATE:
@@ -235,8 +237,10 @@ snmpcom_req(q, mp, setfn, getfn, credp)
(uchar_t *)&req[1], req->len))
goto bad_req4;
}
- if (q->q_next)
+ if (q->q_next != NULL)
putnext(q, mp);
+ else if (pass_to_ip)
+ ip_output(Q_TO_CONN(q), mp, q, IP_WPUT);
else
freemsg(mp);
return (B_TRUE);
@@ -268,9 +272,12 @@ snmpcom_req(q, mp, setfn, getfn, credp)
	 * this is the bottom module of the stream, send up an EOD ctl msg,
	 * otherwise pass it on to the next guy for processing.
*/
- if (q->q_next) {
+ if (q->q_next != NULL) {
putnext(q, mp);
return (B_TRUE);
+ } else if (pass_to_ip) {
+ ip_output(Q_TO_CONN(q), mp, q, IP_WPUT);
+ return (B_TRUE);
}
if (mp->b_cont) {
freemsg(mp->b_cont);
diff --git a/usr/src/uts/common/inet/squeue.c b/usr/src/uts/common/inet/squeue.c
index ab6aae1a88..48e9409721 100644
--- a/usr/src/uts/common/inet/squeue.c
+++ b/usr/src/uts/common/inet/squeue.c
@@ -729,7 +729,8 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, sqproc_t proc, void *arg,
#endif
#if SQUEUE_DEBUG
conn_t *connp = (conn_t *)arg;
- ASSERT(connp->conn_tcp->tcp_connp == connp);
+ ASSERT(!IPCL_IS_TCP(connp) || connp->conn_tcp->tcp_connp == connp);
+ ASSERT(!IPCL_IS_UDP(connp) || connp->conn_udp->udp_connp == connp);
#endif
ASSERT(proc != NULL);
@@ -954,9 +955,10 @@ squeue_enter_nodrain(squeue_t *sqp, mblk_t *mp, sqproc_t proc, void *arg,
ASSERT(sqp != NULL);
ASSERT(mp != NULL);
ASSERT(mp->b_next == NULL);
- ASSERT(connp->conn_tcp->tcp_connp == connp);
-
+ ASSERT(!IPCL_IS_TCP(connp) || connp->conn_tcp->tcp_connp == connp);
+ ASSERT(!IPCL_IS_UDP(connp) || connp->conn_udp->udp_connp == connp);
ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock));
+
mutex_enter(&sqp->sq_lock);
being_processed = (sqp->sq_state & SQS_PROC);
@@ -1100,7 +1102,8 @@ squeue_fill(squeue_t *sqp, mblk_t *mp, sqproc_t proc, void * arg,
ASSERT(sqp != NULL);
ASSERT(mp != NULL);
ASSERT(mp->b_next == NULL);
- ASSERT(connp->conn_tcp->tcp_connp == connp);
+ ASSERT(!IPCL_IS_TCP(connp) || connp->conn_tcp->tcp_connp == connp);
+ ASSERT(!IPCL_IS_UDP(connp) || connp->conn_udp->udp_connp == connp);
ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock));
mutex_enter(&sqp->sq_lock);
diff --git a/usr/src/uts/common/inet/tcp.h b/usr/src/uts/common/inet/tcp.h
index 8a2ac05292..fbd594e6e6 100644
--- a/usr/src/uts/common/inet/tcp.h
+++ b/usr/src/uts/common/inet/tcp.h
@@ -286,11 +286,8 @@ typedef struct tcp_s {
tcp_accept_error : 1, /* Error during TLI accept */
tcp_send_discon_ind : 1, /* TLI accept err, send discon ind */
- tcp_fused : 1, /* loopback tcp in fusion mode */
- tcp_unfusable : 1, /* fusion not allowed on endpoint */
- tcp_fused_sigurg : 1, /* send SIGURG upon draining */
tcp_cork : 1, /* tcp_cork option */
- tcp_pad_to_bit_31 : 15;
+ tcp_pad_to_bit_31 : 18;
uint32_t tcp_if_mtu; /* Outgoing interface MTU. */
@@ -514,10 +511,29 @@ typedef struct tcp_s {
#define tcp_ipp_use_min_mtu tcp_sticky_ipp.ipp_use_min_mtu
struct tcp_s *tcp_saved_listener; /* saved value of listener */
+ uint32_t tcp_in_ack_unsent; /* ACK for unsent data cnt. */
+
+ /*
+ * The following fusion-related fields are protected by squeue.
+ */
struct tcp_s *tcp_loopback_peer; /* peer tcp for loopback */
mblk_t *tcp_fused_sigurg_mp; /* M_PCSIG mblk for SIGURG */
+ size_t tcp_fuse_rcv_hiwater; /* fusion receive queue size */
+ uint_t tcp_fuse_rcv_unread_hiwater; /* max # of outstanding pkts */
+ /*
+ * The following fusion-related fields and bit fields are to be
+ * manipulated with squeue protection or with tcp_fuse_lock held.
+ */
+ kmutex_t tcp_fuse_lock;
+ uint_t tcp_fuse_rcv_unread_cnt; /* # of outstanding pkts */
+ uint32_t
+ tcp_fused : 1, /* loopback tcp in fusion mode */
+ tcp_unfusable : 1, /* fusion not allowed on endpoint */
+ tcp_fused_sigurg : 1, /* send SIGURG upon draining */
+ tcp_direct_sockfs : 1, /* direct calls to sockfs */
- uint32_t tcp_in_ack_unsent; /* ACK for unsent data cnt. */
+ tcp_fuse_syncstr_stopped : 1, /* synchronous streams stopped */
+ tcp_fuse_to_bit_31 : 27;
/*
* This variable is accessed without any lock protection
@@ -525,6 +541,8 @@ typedef struct tcp_s {
* with the rest which require such condition.
*/
boolean_t tcp_issocket; /* this is a socket tcp */
+
+ uint32_t tcp_squeue_bytes;
} tcp_t;
extern void tcp_free(tcp_t *tcp);
@@ -537,7 +555,8 @@ extern void tcp_input(void *arg, mblk_t *mp, void *arg2);
extern void tcp_rput_data(void *arg, mblk_t *mp, void *arg2);
extern void *tcp_get_conn(void *arg);
extern void tcp_time_wait_collector(void *arg);
-
+extern int tcp_snmp_get(queue_t *, mblk_t *);
+extern int tcp_snmp_set(queue_t *, int, int, uchar_t *, int);
/*
* The TCP Fanout structure.
* The hash tables and their linkage (tcp_*_hash_next, tcp_ptp*hn) are
@@ -610,18 +629,6 @@ typedef struct tcp_ioc_abort_conn_s {
#pragma pack()
#endif
-/* Named Dispatch Parameter Management Structure */
-typedef struct tcpparam_s {
- uint32_t tcp_param_min;
- uint32_t tcp_param_max;
- uint32_t tcp_param_val;
- char *tcp_param_name;
-} tcpparam_t;
-
-extern tcpparam_t tcp_param_arr[];
-
-extern boolean_t do_tcp_fusion;
-
#if (defined(_KERNEL) || defined(_KMEMUSER))
extern void tcp_rput_other(tcp_t *tcp, mblk_t *mp);
#endif
diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c
index 8c651d1443..9b995cd7df 100644
--- a/usr/src/uts/common/inet/tcp/tcp.c
+++ b/usr/src/uts/common/inet/tcp/tcp.c
@@ -73,6 +73,7 @@ const char tcp_version[] = "%Z%%M% %I% %E% SMI";
#include <inet/common.h>
#include <inet/ip.h>
+#include <inet/ip_impl.h>
#include <inet/ip6.h>
#include <inet/ip_ndp.h>
#include <inet/mi.h>
@@ -82,6 +83,7 @@ const char tcp_version[] = "%Z%%M% %I% %E% SMI";
#include <inet/snmpcom.h>
#include <inet/kstatcom.h>
#include <inet/tcp.h>
+#include <inet/tcp_impl.h>
#include <net/pfkeyv2.h>
#include <inet/ipsec_info.h>
#include <inet/ipdrop.h>
@@ -230,8 +232,6 @@ int tcp_squeue_wput = 2;
squeue_func_t tcp_squeue_close_proc;
squeue_func_t tcp_squeue_wput_proc;
-extern vmem_t *ip_minor_arena;
-
/*
* This controls how tiny a write must be before we try to copy it
* into the the mblk on the tail of the transmit queue. Not much
@@ -278,9 +278,6 @@ int tcp_tx_pull_len = 16;
* TCP_MAX_CLEAN_DEATH_TAG is the maximum number of possible clean death tags.
*/
-#define TCP_COUNTERS 1
-#define TCP_CLD_COUNTERS 0
-
#ifndef TCP_DEBUG_COUNTER
#ifdef DEBUG
#define TCP_DEBUG_COUNTER 1
@@ -289,6 +286,7 @@ int tcp_tx_pull_len = 16;
#endif
#endif
+#define TCP_CLD_COUNTERS 0
#define TCP_TAG_CLEAN_DEATH 1
#define TCP_MAX_CLEAN_DEATH_TAG 32
@@ -297,20 +295,6 @@ int tcp_tx_pull_len = 16;
static int _lint_dummy_;
#endif
-#if TCP_COUNTERS
-#define TCP_STAT(x) (tcp_statistics.x.value.ui64++)
-#define TCP_STAT_UPDATE(x, n) (tcp_statistics.x.value.ui64 += (n))
-#define TCP_STAT_SET(x, n) (tcp_statistics.x.value.ui64 = (n))
-#elif defined(lint)
-#define TCP_STAT(x) ASSERT(_lint_dummy_ == 0);
-#define TCP_STAT_UPDATE(x, n) ASSERT(_lint_dummy_ == 0);
-#define TCP_STAT_SET(x, n) ASSERT(_lint_dummy_ == 0);
-#else
-#define TCP_STAT(x)
-#define TCP_STAT_UPDATE(x, n)
-#define TCP_STAT_SET(x, n)
-#endif
-
#if TCP_CLD_COUNTERS
static uint_t tcp_clean_death_stat[TCP_MAX_CLEAN_DEATH_TAG];
#define TCP_CLD_STAT(x) tcp_clean_death_stat[x]++
@@ -328,96 +312,7 @@ static uint_t tcp_clean_death_stat[TCP_MAX_CLEAN_DEATH_TAG];
#define TCP_DBGSTAT(x)
#endif
-typedef struct tcp_stat {
- kstat_named_t tcp_time_wait;
- kstat_named_t tcp_time_wait_syn;
- kstat_named_t tcp_time_wait_syn_success;
- kstat_named_t tcp_time_wait_syn_fail;
- kstat_named_t tcp_reinput_syn;
- kstat_named_t tcp_ip_output;
- kstat_named_t tcp_detach_non_time_wait;
- kstat_named_t tcp_detach_time_wait;
- kstat_named_t tcp_time_wait_reap;
- kstat_named_t tcp_clean_death_nondetached;
- kstat_named_t tcp_reinit_calls;
- kstat_named_t tcp_eager_err1;
- kstat_named_t tcp_eager_err2;
- kstat_named_t tcp_eager_blowoff_calls;
- kstat_named_t tcp_eager_blowoff_q;
- kstat_named_t tcp_eager_blowoff_q0;
- kstat_named_t tcp_not_hard_bound;
- kstat_named_t tcp_no_listener;
- kstat_named_t tcp_found_eager;
- kstat_named_t tcp_wrong_queue;
- kstat_named_t tcp_found_eager_binding1;
- kstat_named_t tcp_found_eager_bound1;
- kstat_named_t tcp_eager_has_listener1;
- kstat_named_t tcp_open_alloc;
- kstat_named_t tcp_open_detached_alloc;
- kstat_named_t tcp_rput_time_wait;
- kstat_named_t tcp_listendrop;
- kstat_named_t tcp_listendropq0;
- kstat_named_t tcp_wrong_rq;
- kstat_named_t tcp_rsrv_calls;
- kstat_named_t tcp_eagerfree2;
- kstat_named_t tcp_eagerfree3;
- kstat_named_t tcp_eagerfree4;
- kstat_named_t tcp_eagerfree5;
- kstat_named_t tcp_timewait_syn_fail;
- kstat_named_t tcp_listen_badflags;
- kstat_named_t tcp_timeout_calls;
- kstat_named_t tcp_timeout_cached_alloc;
- kstat_named_t tcp_timeout_cancel_reqs;
- kstat_named_t tcp_timeout_canceled;
- kstat_named_t tcp_timermp_alloced;
- kstat_named_t tcp_timermp_freed;
- kstat_named_t tcp_timermp_allocfail;
- kstat_named_t tcp_timermp_allocdblfail;
- kstat_named_t tcp_push_timer_cnt;
- kstat_named_t tcp_ack_timer_cnt;
- kstat_named_t tcp_ire_null1;
- kstat_named_t tcp_ire_null;
- kstat_named_t tcp_ip_send;
- kstat_named_t tcp_ip_ire_send;
- kstat_named_t tcp_wsrv_called;
- kstat_named_t tcp_flwctl_on;
- kstat_named_t tcp_timer_fire_early;
- kstat_named_t tcp_timer_fire_miss;
- kstat_named_t tcp_freelist_cleanup;
- kstat_named_t tcp_rput_v6_error;
- kstat_named_t tcp_out_sw_cksum;
- kstat_named_t tcp_zcopy_on;
- kstat_named_t tcp_zcopy_off;
- kstat_named_t tcp_zcopy_backoff;
- kstat_named_t tcp_zcopy_disable;
- kstat_named_t tcp_mdt_pkt_out;
- kstat_named_t tcp_mdt_pkt_out_v4;
- kstat_named_t tcp_mdt_pkt_out_v6;
- kstat_named_t tcp_mdt_discarded;
- kstat_named_t tcp_mdt_conn_halted1;
- kstat_named_t tcp_mdt_conn_halted2;
- kstat_named_t tcp_mdt_conn_halted3;
- kstat_named_t tcp_mdt_conn_resumed1;
- kstat_named_t tcp_mdt_conn_resumed2;
- kstat_named_t tcp_mdt_legacy_small;
- kstat_named_t tcp_mdt_legacy_all;
- kstat_named_t tcp_mdt_legacy_ret;
- kstat_named_t tcp_mdt_allocfail;
- kstat_named_t tcp_mdt_addpdescfail;
- kstat_named_t tcp_mdt_allocd;
- kstat_named_t tcp_mdt_linked;
- kstat_named_t tcp_fusion_flowctl;
- kstat_named_t tcp_fusion_backenabled;
- kstat_named_t tcp_fusion_urg;
- kstat_named_t tcp_fusion_putnext;
- kstat_named_t tcp_fusion_unfusable;
- kstat_named_t tcp_fusion_aborted;
- kstat_named_t tcp_fusion_unqualified;
- kstat_named_t tcp_in_ack_unsent_drop;
-} tcp_stat_t;
-
-#if (TCP_COUNTERS || TCP_DEBUG_COUNTER)
-static tcp_stat_t tcp_statistics = {
+tcp_stat_t tcp_statistics = {
{ "tcp_time_wait", KSTAT_DATA_UINT64 },
{ "tcp_time_wait_syn", KSTAT_DATA_UINT64 },
{ "tcp_time_wait_success", KSTAT_DATA_UINT64 },
@@ -475,6 +370,7 @@ static tcp_stat_t tcp_statistics = {
{ "tcp_freelist_cleanup", KSTAT_DATA_UINT64 },
{ "tcp_rput_v6_error", KSTAT_DATA_UINT64 },
{ "tcp_out_sw_cksum", KSTAT_DATA_UINT64 },
+ { "tcp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 },
{ "tcp_zcopy_on", KSTAT_DATA_UINT64 },
{ "tcp_zcopy_off", KSTAT_DATA_UINT64 },
{ "tcp_zcopy_backoff", KSTAT_DATA_UINT64 },
@@ -502,13 +398,14 @@ static tcp_stat_t tcp_statistics = {
{ "tcp_fusion_unfusable", KSTAT_DATA_UINT64 },
{ "tcp_fusion_aborted", KSTAT_DATA_UINT64 },
{ "tcp_fusion_unqualified", KSTAT_DATA_UINT64 },
+ { "tcp_fusion_rrw_busy", KSTAT_DATA_UINT64 },
+ { "tcp_fusion_rrw_msgcnt", KSTAT_DATA_UINT64 },
{ "tcp_in_ack_unsent_drop", KSTAT_DATA_UINT64 },
+ { "tcp_sock_fallback", KSTAT_DATA_UINT64 },
};
static kstat_t *tcp_kstat;
-#endif
-
/*
* Call either ip_output or ip_output_v6. This replaces putnext() calls on the
* tcp write side.
@@ -519,12 +416,6 @@ static kstat_t *tcp_kstat;
connp->conn_send(connp, (mp), (q), IP_WPUT); \
}
-/*
- * Was this tcp created via socket() interface?
- */
-#define TCP_IS_SOCKET(tcp) ((tcp)->tcp_issocket)
-
-
/* Macros for timestamp comparisons */
#define TSTMP_GEQ(a, b) ((int32_t)((a)-(b)) >= 0)
#define TSTMP_LT(a, b) ((int32_t)((a)-(b)) < 0)
@@ -569,8 +460,6 @@ static ipdropper_t tcp_dropper;
*/
#define TCP_OLD_URP_INTERPRETATION 1
-#define TCP_IS_DETACHED(tcp) ((tcp)->tcp_detached)
-
#define TCP_IS_DETACHED_NONEAGER(tcp) \
(TCP_IS_DETACHED(tcp) && \
(!(tcp)->tcp_hard_binding))
@@ -687,22 +576,6 @@ static kmem_cache_t *tcp_timercache;
kmem_cache_t *tcp_sack_info_cache;
kmem_cache_t *tcp_iphc_cache;
-#define TCP_TIMER(tcp, f, tim) tcp_timeout(tcp->tcp_connp, f, tim)
-#define TCP_TIMER_CANCEL(tcp, id) tcp_timeout_cancel(tcp->tcp_connp, id)
-
-/*
- * To restart the TCP retransmission timer.
- */
-#define TCP_TIMER_RESTART(tcp, intvl) \
-{ \
- if ((tcp)->tcp_timer_tid != 0) { \
- (void) TCP_TIMER_CANCEL((tcp), \
- (tcp)->tcp_timer_tid); \
- } \
- (tcp)->tcp_timer_tid = TCP_TIMER((tcp), tcp_timer, \
- MSEC_TO_TICK(intvl)); \
-}
-
/*
* For scalability, we must not run a timer for every TCP connection
* in TIME_WAIT state. To see why, consider (for time wait interval of
@@ -951,7 +824,6 @@ static void tcp_ip_notify(tcp_t *tcp);
static mblk_t *tcp_ire_mp(mblk_t *mp);
static void tcp_iss_init(tcp_t *tcp);
static void tcp_keepalive_killer(void *arg);
-static int tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk);
static int tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt);
static void tcp_mss_set(tcp_t *tcp, uint32_t size);
static int tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp,
@@ -985,7 +857,6 @@ static void tcp_report_item(mblk_t *mp, tcp_t *tcp, int hashval,
tcp_t *thisstream, cred_t *cr);
static uint_t tcp_rcv_drain(queue_t *q, tcp_t *tcp);
-static void tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len);
static void tcp_sack_rxmit(tcp_t *tcp, uint_t *flags);
static boolean_t tcp_send_rst_chk(void);
static void tcp_ss_rexmit(tcp_t *tcp);
@@ -994,9 +865,6 @@ static void tcp_process_options(tcp_t *, tcph_t *);
static void tcp_rput_common(tcp_t *tcp, mblk_t *mp);
static void tcp_rsrv(queue_t *q);
static int tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd);
-static int tcp_snmp_get(queue_t *q, mblk_t *mpctl);
-static int tcp_snmp_set(queue_t *q, int level, int name, uchar_t *ptr,
- int len);
static int tcp_snmp_state(tcp_t *tcp);
static int tcp_status_report(queue_t *q, mblk_t *mp, caddr_t cp,
cred_t *cr);
@@ -1018,7 +886,6 @@ static void tcp_timer(void *arg);
static void tcp_timer_callback(void *);
static in_port_t tcp_update_next_port(in_port_t port, boolean_t random);
static in_port_t tcp_get_next_priv_port(void);
-static void tcp_wput(queue_t *q, mblk_t *mp);
static void tcp_wput_sock(queue_t *q, mblk_t *mp);
void tcp_wput_accept(queue_t *q, mblk_t *mp);
static void tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent);
@@ -1044,7 +911,6 @@ static mblk_t *tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send,
boolean_t sendall, uint32_t *seg_len, boolean_t rexmit);
static void tcp_ack_timer(void *arg);
static mblk_t *tcp_ack_mp(tcp_t *tcp);
-static void tcp_push_timer(void *arg);
static void tcp_xmit_early_reset(char *str, mblk_t *mp,
uint32_t seq, uint32_t ack, int ctl, uint_t ip_hdr_len);
static void tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq,
@@ -1076,9 +942,6 @@ boolean_t tcp_reserved_port_del(in_port_t, in_port_t);
boolean_t tcp_reserved_port_check(in_port_t);
static tcp_t *tcp_alloc_temp_tcp(in_port_t);
static int tcp_reserved_port_list(queue_t *, mblk_t *, caddr_t, cred_t *);
-static void tcp_timers_stop(tcp_t *);
-static timeout_id_t tcp_timeout(conn_t *, void (*)(void *), clock_t);
-static clock_t tcp_timeout_cancel(conn_t *, timeout_id_t);
static mblk_t *tcp_mdt_info_mp(mblk_t *);
static void tcp_mdt_update(tcp_t *, ill_mdt_capab_t *, boolean_t);
static int tcp_mdt_add_attrs(multidata_t *, const mblk_t *,
@@ -1098,7 +961,6 @@ static void tcp_kstat_init(void);
static void tcp_kstat_fini(void);
static int tcp_kstat_update(kstat_t *kp, int rw);
void tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp);
-conn_t *tcp_get_next_conn(connf_t *, conn_t *);
static int tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
tcph_t *tcph, uint_t ipvers, mblk_t *idmp);
static int tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha,
@@ -1118,14 +980,6 @@ static mblk_t *tcp_zcopy_disable(tcp_t *, mblk_t *);
static mblk_t *tcp_zcopy_backoff(tcp_t *, mblk_t *, int);
static void tcp_ire_ill_check(tcp_t *, ire_t *, ill_t *, boolean_t);
-static void tcp_fuse(tcp_t *, uchar_t *, tcph_t *);
-static void tcp_unfuse(tcp_t *);
-static boolean_t tcp_fuse_output(tcp_t *, mblk_t *);
-static void tcp_fuse_output_urg(tcp_t *, mblk_t *);
-static boolean_t tcp_fuse_rcv_drain(queue_t *, tcp_t *, mblk_t **);
-
-extern mblk_t *allocb_tryhard(size_t);
-
/*
* Routines related to the TCP_IOC_ABORT_CONN ioctl command.
*
@@ -1155,17 +1009,12 @@ static void tcp_ioctl_abort_conn(queue_t *, mblk_t *);
static int tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *,
boolean_t);
-
-static void tcp_clrqfull(tcp_t *);
-static void tcp_setqfull(tcp_t *);
-
static struct module_info tcp_rinfo = {
-#define TCP_MODULE_ID 5105
- TCP_MODULE_ID, "tcp", 0, INFPSZ, TCP_RECV_HIWATER, TCP_RECV_LOWATER
+ TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, TCP_RECV_HIWATER, TCP_RECV_LOWATER
};
static struct module_info tcp_winfo = {
- TCP_MODULE_ID, "tcp", 0, INFPSZ, 127, 16
+ TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, 127, 16
};
/*
@@ -1173,11 +1022,12 @@ static struct module_info tcp_winfo = {
* to pass through.
*/
struct qinit tcp_mod_rinit = {
- (pfi_t)putnext, NULL, tcp_open, tcp_modclose, NULL, &tcp_rinfo
+ (pfi_t)putnext, NULL, tcp_open, ip_snmpmod_close, NULL, &tcp_rinfo,
};
struct qinit tcp_mod_winit = {
- (pfi_t)tcp_wput_mod, NULL, tcp_open, tcp_modclose, NULL, &tcp_rinfo
+ (pfi_t)ip_snmpmod_wput, NULL, tcp_open, ip_snmpmod_close, NULL,
+ &tcp_rinfo
};
/*
@@ -1210,11 +1060,18 @@ struct qinit tcp_acceptor_winit = {
(pfi_t)tcp_wput_accept, NULL, NULL, NULL, NULL, &tcp_winfo
};
+/*
+ * Entry points for TCP loopback (read side only)
+ */
+struct qinit tcp_loopback_rinit = {
+ (pfi_t)0, (pfi_t)tcp_rsrv, tcp_open, tcp_close, (pfi_t)0,
+ &tcp_rinfo, NULL, tcp_fuse_rrw, tcp_fuse_rinfop, STRUIOT_STANDARD
+};
+
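For reference, the positional initializer above is assumed to line up with struct qinit as follows (sketched from the conventional field order; the rwnext/infonext entries are what enable the synchronous-streams read path):

/*
 * qi_putp   = NULL			qi_srvp    = tcp_rsrv
 * qi_qopen  = tcp_open			qi_qclose  = tcp_close
 * qi_qadmin = NULL			qi_minfo   = &tcp_rinfo
 * qi_mstat  = NULL			qi_rwp     = tcp_fuse_rrw
 * qi_infop  = tcp_fuse_rinfop		qi_struiot = STRUIOT_STANDARD
 */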
struct streamtab tcpinfo = {
&tcp_rinit, &tcp_winit
};
-
extern squeue_func_t tcp_squeue_wput_proc;
extern squeue_func_t tcp_squeue_timer_proc;
@@ -1306,15 +1163,6 @@ uint32_t tcp_reserved_port_array_size = 0;
mib2_tcp_t tcp_mib; /* SNMP fixed size info */
kstat_t *tcp_mibkp; /* kstat exporting tcp_mib data */
-/*
- * Object to represent database of options to search passed to
- * {sock,tpi}optcom_req() interface routine to take care of option
- * management and associated methods.
- * XXX These and other externs should ideally move to a TCP header
- */
-extern optdb_obj_t tcp_opt_obj;
-extern uint_t tcp_max_optsize;
-
boolean_t tcp_icmp_source_quench = B_FALSE;
/*
* Following assumes TPI alignment requirements stay along 32 bit
@@ -1454,76 +1302,6 @@ tcpparam_t tcp_param_arr[] = {
};
/* END CSTYLED */
-
-#define tcp_time_wait_interval tcp_param_arr[0].tcp_param_val
-#define tcp_conn_req_max_q tcp_param_arr[1].tcp_param_val
-#define tcp_conn_req_max_q0 tcp_param_arr[2].tcp_param_val
-#define tcp_conn_req_min tcp_param_arr[3].tcp_param_val
-#define tcp_conn_grace_period tcp_param_arr[4].tcp_param_val
-#define tcp_cwnd_max_ tcp_param_arr[5].tcp_param_val
-#define tcp_dbg tcp_param_arr[6].tcp_param_val
-#define tcp_smallest_nonpriv_port tcp_param_arr[7].tcp_param_val
-#define tcp_ip_abort_cinterval tcp_param_arr[8].tcp_param_val
-#define tcp_ip_abort_linterval tcp_param_arr[9].tcp_param_val
-#define tcp_ip_abort_interval tcp_param_arr[10].tcp_param_val
-#define tcp_ip_notify_cinterval tcp_param_arr[11].tcp_param_val
-#define tcp_ip_notify_interval tcp_param_arr[12].tcp_param_val
-#define tcp_ipv4_ttl tcp_param_arr[13].tcp_param_val
-#define tcp_keepalive_interval_high tcp_param_arr[14].tcp_param_max
-#define tcp_keepalive_interval tcp_param_arr[14].tcp_param_val
-#define tcp_keepalive_interval_low tcp_param_arr[14].tcp_param_min
-#define tcp_maxpsz_multiplier tcp_param_arr[15].tcp_param_val
-#define tcp_mss_def_ipv4 tcp_param_arr[16].tcp_param_val
-#define tcp_mss_max_ipv4 tcp_param_arr[17].tcp_param_val
-#define tcp_mss_min tcp_param_arr[18].tcp_param_val
-#define tcp_naglim_def tcp_param_arr[19].tcp_param_val
-#define tcp_rexmit_interval_initial tcp_param_arr[20].tcp_param_val
-#define tcp_rexmit_interval_max tcp_param_arr[21].tcp_param_val
-#define tcp_rexmit_interval_min tcp_param_arr[22].tcp_param_val
-#define tcp_deferred_ack_interval tcp_param_arr[23].tcp_param_val
-#define tcp_snd_lowat_fraction tcp_param_arr[24].tcp_param_val
-#define tcp_sth_rcv_hiwat tcp_param_arr[25].tcp_param_val
-#define tcp_sth_rcv_lowat tcp_param_arr[26].tcp_param_val
-#define tcp_dupack_fast_retransmit tcp_param_arr[27].tcp_param_val
-#define tcp_ignore_path_mtu tcp_param_arr[28].tcp_param_val
-#define tcp_smallest_anon_port tcp_param_arr[29].tcp_param_val
-#define tcp_largest_anon_port tcp_param_arr[30].tcp_param_val
-#define tcp_xmit_hiwat tcp_param_arr[31].tcp_param_val
-#define tcp_xmit_lowat tcp_param_arr[32].tcp_param_val
-#define tcp_recv_hiwat tcp_param_arr[33].tcp_param_val
-#define tcp_recv_hiwat_minmss tcp_param_arr[34].tcp_param_val
-#define tcp_fin_wait_2_flush_interval tcp_param_arr[35].tcp_param_val
-#define tcp_co_min tcp_param_arr[36].tcp_param_val
-#define tcp_max_buf tcp_param_arr[37].tcp_param_val
-#define tcp_strong_iss tcp_param_arr[38].tcp_param_val
-#define tcp_rtt_updates tcp_param_arr[39].tcp_param_val
-#define tcp_wscale_always tcp_param_arr[40].tcp_param_val
-#define tcp_tstamp_always tcp_param_arr[41].tcp_param_val
-#define tcp_tstamp_if_wscale tcp_param_arr[42].tcp_param_val
-#define tcp_rexmit_interval_extra tcp_param_arr[43].tcp_param_val
-#define tcp_deferred_acks_max tcp_param_arr[44].tcp_param_val
-#define tcp_slow_start_after_idle tcp_param_arr[45].tcp_param_val
-#define tcp_slow_start_initial tcp_param_arr[46].tcp_param_val
-#define tcp_co_timer_interval tcp_param_arr[47].tcp_param_val
-#define tcp_sack_permitted tcp_param_arr[48].tcp_param_val
-#define tcp_trace tcp_param_arr[49].tcp_param_val
-#define tcp_compression_enabled tcp_param_arr[50].tcp_param_val
-#define tcp_ipv6_hoplimit tcp_param_arr[51].tcp_param_val
-#define tcp_mss_def_ipv6 tcp_param_arr[52].tcp_param_val
-#define tcp_mss_max_ipv6 tcp_param_arr[53].tcp_param_val
-#define tcp_rev_src_routes tcp_param_arr[54].tcp_param_val
-#define tcp_local_dack_interval tcp_param_arr[55].tcp_param_val
-#define tcp_ndd_get_info_interval tcp_param_arr[56].tcp_param_val
-#define tcp_local_dacks_max tcp_param_arr[57].tcp_param_val
-#define tcp_ecn_permitted tcp_param_arr[58].tcp_param_val
-#define tcp_rst_sent_rate_enabled tcp_param_arr[59].tcp_param_val
-#define tcp_rst_sent_rate tcp_param_arr[60].tcp_param_val
-#define tcp_push_timer_interval tcp_param_arr[61].tcp_param_val
-#define tcp_use_smss_as_mss_opt tcp_param_arr[62].tcp_param_val
-#define tcp_keepalive_abort_interval_high tcp_param_arr[63].tcp_param_max
-#define tcp_keepalive_abort_interval tcp_param_arr[63].tcp_param_val
-#define tcp_keepalive_abort_interval_low tcp_param_arr[63].tcp_param_min
-
/*
* tcp_mdt_hdr_{head,tail}_min are the leading and trailing spaces of
* each header fragment in the header buffer. Each parameter value has
@@ -1720,642 +1498,6 @@ extern uint32_t (*cl_inet_ipident)(uint8_t protocol, sa_family_t addr_family,
*/
int cl_tcp_walk_list(int (*callback)(cl_tcp_info_t *, void *), void *arg);
-#define IPH_TCPH_CHECKSUMP(ipha, hlen) \
- ((uint16_t *)(((uchar_t *)(ipha)) + ((hlen) + 16)))
-
-#ifdef _BIG_ENDIAN
-#define IP_TCP_CSUM_COMP IPPROTO_TCP
-#else
-#define IP_TCP_CSUM_COMP (IPPROTO_TCP << 8)
-#endif
-
-#define IP_HDR_CKSUM(ipha, sum, v_hlen_tos_len, ttl_protocol) { \
- (sum) += (ttl_protocol) + (ipha)->ipha_ident + \
- ((v_hlen_tos_len) >> 16) + \
- ((v_hlen_tos_len) & 0xFFFF) + \
- (ipha)->ipha_fragment_offset_and_flags; \
- (sum) = (((sum) & 0xFFFF) + ((sum) >> 16)); \
- (sum) = ~((sum) + ((sum) >> 16)); \
- (ipha)->ipha_hdr_checksum = (uint16_t)(sum); \
-}
-
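The two folding steps at the end of IP_HDR_CKSUM are the usual ones-complement reduction; the same operation as a hypothetical standalone helper (not part of this change):

static uint16_t
cksum_fold(uint32_t sum)
{
	sum = (sum & 0xFFFF) + (sum >> 16);	/* fold carries into low 16 bits */
	sum = (sum & 0xFFFF) + (sum >> 16);	/* absorb any residual carry */
	return ((uint16_t)~sum);		/* complement for the wire */
}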
-/*
- * Macros that determine whether or not IP processing is needed for TCP.
- */
-#define TCP_IPOPT_POLICY_V4(tcp) \
- ((tcp)->tcp_ipversion == IPV4_VERSION && \
- ((tcp)->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH || \
- CONN_OUTBOUND_POLICY_PRESENT((tcp)->tcp_connp) || \
- CONN_INBOUND_POLICY_PRESENT((tcp)->tcp_connp)))
-
-#define TCP_IPOPT_POLICY_V6(tcp) \
- ((tcp)->tcp_ipversion == IPV6_VERSION && \
- ((tcp)->tcp_ip_hdr_len != IPV6_HDR_LEN || \
- CONN_OUTBOUND_POLICY_PRESENT_V6((tcp)->tcp_connp) || \
- CONN_INBOUND_POLICY_PRESENT_V6((tcp)->tcp_connp)))
-
-#define TCP_LOOPBACK_IP(tcp) \
- (TCP_IPOPT_POLICY_V4(tcp) || TCP_IPOPT_POLICY_V6(tcp) || \
- !CONN_IS_MD_FASTPATH((tcp)->tcp_connp))
-
-boolean_t do_tcp_fusion = B_TRUE;
-
-/*
- * This routine gets called by the eager tcp upon changing state from
- * SYN_RCVD to ESTABLISHED. It fuses a direct path between itself
- * and the active connect tcp such that the regular tcp processing
- * may be bypassed under allowable circumstances. Because the fusion
- * requires both endpoints to be in the same squeue, it does not work
- * for simultaneous active connects because there is no easy way to
- * switch from one squeue to another once the connection is created.
- * This is different from the eager tcp case where we assign it the
- * same squeue as the one given to the active connect tcp during open.
- */
-static void
-tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph)
-{
- conn_t *peer_connp, *connp = tcp->tcp_connp;
- tcp_t *peer_tcp;
-
- ASSERT(!tcp->tcp_fused);
- ASSERT(tcp->tcp_loopback);
- ASSERT(tcp->tcp_loopback_peer == NULL);
- /*
- * We need to check the listener tcp to make sure it's a socket
- * endpoint, but we can't really use tcp_listener since we get
- * here after sending up T_CONN_IND and tcp_wput_accept() may be
- * called independently, at which point tcp_listener is cleared;
- * this is why we use tcp_saved_listener. The listener itself
- * is guaranteed to be around until tcp_accept_finish() is called
- * on this eager -- this won't happen until we're done since
- * we're inside the eager's perimeter now.
- */
- ASSERT(tcp->tcp_saved_listener != NULL);
-
- /*
- * Look up the peer endpoint: search for the remote endpoint having
- * the reversed address-port quadruplet in ESTABLISHED state,
- * which is guaranteed to be unique in the system. The zone check
- * is applied for the loopback address, but not for local
- * addresses, since we want fusion to happen across zones.
- */
- if (tcp->tcp_ipversion == IPV4_VERSION) {
- peer_connp = ipcl_conn_tcp_lookup_reversed_ipv4(connp,
- (ipha_t *)iphdr, tcph);
- } else {
- peer_connp = ipcl_conn_tcp_lookup_reversed_ipv6(connp,
- (ip6_t *)iphdr, tcph);
- }
-
- /*
- * We can only proceed if the peer exists, resides in the same
- * squeue as our conn, and is not a raw socket. The squeue
- * assignment of this eager tcp was done earlier at the time of
- * SYN processing in ip_fanout_tcp{_v6}. Note that a matching
- * squeue by itself doesn't guarantee a safe condition to fuse,
- * hence we perform additional tests below.
- */
- ASSERT(peer_connp == NULL || peer_connp != connp);
- if (peer_connp == NULL || peer_connp->conn_sqp != connp->conn_sqp ||
- !IPCL_IS_TCP(peer_connp)) {
- if (peer_connp != NULL) {
- TCP_STAT(tcp_fusion_unqualified);
- CONN_DEC_REF(peer_connp);
- }
- return;
- }
- peer_tcp = peer_connp->conn_tcp; /* active connect tcp */
-
- ASSERT(peer_tcp != NULL && peer_tcp != tcp && !peer_tcp->tcp_fused);
- ASSERT(peer_tcp->tcp_loopback && peer_tcp->tcp_loopback_peer == NULL);
- ASSERT(peer_connp->conn_sqp == connp->conn_sqp);
-
- /*
- * Fuse the endpoints; we perform further checks against both
- * tcp endpoints to ensure that a fusion is allowed to happen.
- * In particular we bail out for TPI, non-simple TCP/IP or if
- * IPsec/IPQoS policy exists. We could actually do it for the
- * XTI/TLI/TPI case but this requires more testing, so for now
- * we handle only the socket case.
- */
- if (!tcp->tcp_unfusable && !peer_tcp->tcp_unfusable &&
- TCP_IS_SOCKET(tcp->tcp_saved_listener) && TCP_IS_SOCKET(peer_tcp) &&
- !TCP_LOOPBACK_IP(tcp) && !TCP_LOOPBACK_IP(peer_tcp) &&
- !IPP_ENABLED(IPP_LOCAL_OUT|IPP_LOCAL_IN)) {
- mblk_t *mp;
- struct stroptions *stropt;
- queue_t *peer_rq = peer_tcp->tcp_rq;
- size_t sth_hiwat;
-
- ASSERT(!TCP_IS_DETACHED(peer_tcp) && peer_rq != NULL);
-
- /*
- * We need to drain data on both endpoints during unfuse.
- * If we need to send up SIGURG at the time of draining,
- * we want to be sure that an mblk is readily available.
- * This is why we pre-allocate the M_PCSIG mblks for both
- * endpoints which will only be used during/after unfuse.
- */
- if ((mp = allocb(1, BPRI_HI)) == NULL) {
- CONN_DEC_REF(peer_connp);
- return;
- }
- ASSERT(tcp->tcp_fused_sigurg_mp == NULL);
- tcp->tcp_fused_sigurg_mp = mp;
-
- if ((mp = allocb(1, BPRI_HI)) == NULL) {
- freeb(tcp->tcp_fused_sigurg_mp);
- tcp->tcp_fused_sigurg_mp = NULL;
- CONN_DEC_REF(peer_connp);
- return;
- }
- ASSERT(peer_tcp->tcp_fused_sigurg_mp == NULL);
- peer_tcp->tcp_fused_sigurg_mp = mp;
-
- /* Allocate M_SETOPTS mblk */
- mp = allocb(sizeof (*stropt), BPRI_HI);
- if (mp == NULL) {
- freeb(tcp->tcp_fused_sigurg_mp);
- tcp->tcp_fused_sigurg_mp = NULL;
- freeb(peer_tcp->tcp_fused_sigurg_mp);
- peer_tcp->tcp_fused_sigurg_mp = NULL;
- CONN_DEC_REF(peer_connp);
- return;
- }
-
- /* Fuse both endpoints */
- peer_tcp->tcp_loopback_peer = tcp;
- tcp->tcp_loopback_peer = peer_tcp;
- peer_tcp->tcp_fused = tcp->tcp_fused = B_TRUE;
-
- /*
- * We never use regular tcp paths in fusion and should
- * therefore clear tcp_unsent on both endpoints. Having
- * them set to non-zero values means asking for trouble
- * especially after unfuse, where we may end up sending
- * through regular tcp paths which expect xmit_list and
- * friends to be correctly setup.
- */
- peer_tcp->tcp_unsent = tcp->tcp_unsent = 0;
-
- tcp_timers_stop(tcp);
- tcp_timers_stop(peer_tcp);
-
- /*
- * Set the stream head's write offset value to zero, since we
- * won't be needing any room for TCP/IP headers, and tell it
- * to not break up the writes. This would reduce the amount
- * of work done by kmem. In addition, we set the receive
- * buffer to twice that of q_hiwat in order to simulate the
- * non-fusion case. Note that we can only do this for the
- * active connect tcp since our eager is still detached;
- * it will be dealt with later in tcp_accept_finish().
- */
- DB_TYPE(mp) = M_SETOPTS;
- mp->b_wptr += sizeof (*stropt);
-
- sth_hiwat = peer_rq->q_hiwat << 1;
- if (sth_hiwat > tcp_max_buf)
- sth_hiwat = tcp_max_buf;
-
- stropt = (struct stroptions *)mp->b_rptr;
- stropt->so_flags = SO_MAXBLK | SO_WROFF | SO_HIWAT;
- stropt->so_maxblk = tcp_maxpsz_set(peer_tcp, B_FALSE);
- stropt->so_wroff = 0;
- stropt->so_hiwat = MAX(sth_hiwat, tcp_sth_rcv_hiwat);
-
- /* Send the options up */
- putnext(peer_rq, mp);
- } else {
- TCP_STAT(tcp_fusion_unqualified);
- }
- CONN_DEC_REF(peer_connp);
-}
-
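Distilled from the branch above, the fusion qualification amounts to the following predicate (an illustrative reshaping of the in-line test, not code from this patch):

static boolean_t
tcp_fuse_ok(tcp_t *tcp, tcp_t *peer_tcp)
{
	return (!tcp->tcp_unfusable && !peer_tcp->tcp_unfusable &&
	    TCP_IS_SOCKET(tcp->tcp_saved_listener) &&
	    TCP_IS_SOCKET(peer_tcp) &&
	    !TCP_LOOPBACK_IP(tcp) && !TCP_LOOPBACK_IP(peer_tcp) &&
	    !IPP_ENABLED(IPP_LOCAL_OUT|IPP_LOCAL_IN));
}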
-/*
- * Unfuse a previously-fused pair of tcp loopback endpoints.
- */
-static void
-tcp_unfuse(tcp_t *tcp)
-{
- tcp_t *peer_tcp = tcp->tcp_loopback_peer;
-
- ASSERT(tcp->tcp_fused && peer_tcp != NULL);
- ASSERT(peer_tcp->tcp_fused && peer_tcp->tcp_loopback_peer == tcp);
- ASSERT(tcp->tcp_connp->conn_sqp == peer_tcp->tcp_connp->conn_sqp);
- ASSERT(tcp->tcp_unsent == 0 && peer_tcp->tcp_unsent == 0);
- ASSERT(tcp->tcp_fused_sigurg_mp != NULL);
- ASSERT(peer_tcp->tcp_fused_sigurg_mp != NULL);
-
- /*
- * Drain any pending data; the detached check is needed because
- * we may be called from tcp_fuse_output(). Note that in case of
- * a detached tcp, the draining will happen later after the tcp
- * is unfused. For non-urgent data, this can be handled by the
- * regular tcp_rcv_drain(). If we have urgent data sitting in
- * the receive list, we will need to send up a SIGURG signal first
- * before draining the data. All of these will be handled by the
- * code in tcp_fuse_rcv_drain() when called from tcp_rcv_drain().
- */
- if (!TCP_IS_DETACHED(tcp)) {
- (void) tcp_fuse_rcv_drain(tcp->tcp_rq, tcp,
- &tcp->tcp_fused_sigurg_mp);
- }
- if (!TCP_IS_DETACHED(peer_tcp)) {
- (void) tcp_fuse_rcv_drain(peer_tcp->tcp_rq, peer_tcp,
- &peer_tcp->tcp_fused_sigurg_mp);
- }
- /* Lift up any flow-control conditions */
- if (tcp->tcp_flow_stopped) {
- tcp_clrqfull(tcp);
- tcp->tcp_flow_stopped = B_FALSE;
- TCP_STAT(tcp_fusion_backenabled);
- }
- if (peer_tcp->tcp_flow_stopped) {
- tcp_clrqfull(peer_tcp);
- peer_tcp->tcp_flow_stopped = B_FALSE;
- TCP_STAT(tcp_fusion_backenabled);
- }
-
- /* Free up M_PCSIG mblk(s) if not needed */
- if (!tcp->tcp_fused_sigurg && tcp->tcp_fused_sigurg_mp != NULL) {
- freeb(tcp->tcp_fused_sigurg_mp);
- tcp->tcp_fused_sigurg_mp = NULL;
- }
- if (!peer_tcp->tcp_fused_sigurg &&
- peer_tcp->tcp_fused_sigurg_mp != NULL) {
- freeb(peer_tcp->tcp_fused_sigurg_mp);
- peer_tcp->tcp_fused_sigurg_mp = NULL;
- }
-
- /*
- * Update th_seq and th_ack in the header template
- */
- U32_TO_ABE32(tcp->tcp_snxt, tcp->tcp_tcph->th_seq);
- U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack);
- U32_TO_ABE32(peer_tcp->tcp_snxt, peer_tcp->tcp_tcph->th_seq);
- U32_TO_ABE32(peer_tcp->tcp_rnxt, peer_tcp->tcp_tcph->th_ack);
-
- /* Unfuse the endpoints */
- peer_tcp->tcp_fused = tcp->tcp_fused = B_FALSE;
- peer_tcp->tcp_loopback_peer = tcp->tcp_loopback_peer = NULL;
-}
-
-/*
- * Fusion output routine for urgent data. This routine is called by
- * tcp_fuse_output() for handling non-M_DATA mblks.
- */
-static void
-tcp_fuse_output_urg(tcp_t *tcp, mblk_t *mp)
-{
- mblk_t *mp1;
- struct T_exdata_ind *tei;
- tcp_t *peer_tcp = tcp->tcp_loopback_peer;
- mblk_t *head, *prev_head = NULL;
-
- ASSERT(tcp->tcp_fused);
- ASSERT(peer_tcp != NULL && peer_tcp->tcp_loopback_peer == tcp);
- ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
- ASSERT(mp->b_cont != NULL && DB_TYPE(mp->b_cont) == M_DATA);
- ASSERT(MBLKL(mp) >= sizeof (*tei) && MBLKL(mp->b_cont) > 0);
-
- /*
- * Urgent data arrives in the form of T_EXDATA_REQ from above.
- * Each occurrence denotes a new urgent pointer. For each new
- * urgent pointer we signal (SIGURG) the receiving app to indicate
- * that it needs to go into urgent mode. This is similar to the
- * urgent data handling in the regular tcp. We don't need to keep
- * track of where the urgent pointer is, because each T_EXDATA_REQ
- * "advances" the urgent pointer for us.
- *
- * The actual urgent data carried by T_EXDATA_REQ is then prepended
- * by a T_EXDATA_IND before being enqueued behind any existing data
- * destined for the receiving app. There is only a single urgent
- * pointer (out-of-band mark) for a given tcp. If the new urgent
- * data arrives before the receiving app reads some existing urgent
- * data, the previous marker is lost. This behavior is emulated
- * accordingly below, by removing any existing T_EXDATA_IND messages
- * and essentially converting old urgent data into non-urgent.
- */
- ASSERT(tcp->tcp_valid_bits & TCP_URG_VALID);
- /* Let sender get out of urgent mode */
- tcp->tcp_valid_bits &= ~TCP_URG_VALID;
-
- /*
- * Send up SIGURG to the receiving peer; if the peer is detached
- * or if we can't allocate the M_PCSIG, indicate that we need to
- * signal upon draining to the peer by marking tcp_fused_sigurg.
- * This flag will only get cleared once SIGURG is delivered and
- * is not affected by the tcp_fused flag -- delivery will still
- * happen even after an endpoint is unfused, to handle the case
- * where the sending endpoint immediately closes/unfuses after
- * sending urgent data and the accept is not yet finished.
- */
- if (!TCP_IS_DETACHED(peer_tcp) &&
- ((mp1 = allocb(1, BPRI_HI)) != NULL ||
- (mp1 = allocb_tryhard(1)) != NULL)) {
- peer_tcp->tcp_fused_sigurg = B_FALSE;
- /* Send up the signal */
- DB_TYPE(mp1) = M_PCSIG;
- *mp1->b_wptr++ = (uchar_t)SIGURG;
- putnext(peer_tcp->tcp_rq, mp1);
- } else {
- peer_tcp->tcp_fused_sigurg = B_TRUE;
- }
-
- /* Reuse T_EXDATA_REQ mblk for T_EXDATA_IND */
- DB_TYPE(mp) = M_PROTO;
- tei = (struct T_exdata_ind *)mp->b_rptr;
- tei->PRIM_type = T_EXDATA_IND;
- tei->MORE_flag = 0;
- mp->b_wptr = (uchar_t *)&tei[1];
-
- TCP_STAT(tcp_fusion_urg);
- BUMP_MIB(&tcp_mib, tcpOutUrg);
-
- head = peer_tcp->tcp_rcv_list;
- while (head != NULL) {
- /*
- * Remove existing T_EXDATA_IND, keep the data which follows
- * it and relink our list. Note that we don't modify the
- * tcp_rcv_last_tail since it never points to T_EXDATA_IND.
- */
- if (DB_TYPE(head) != M_DATA) {
- mp1 = head;
-
- ASSERT(DB_TYPE(mp1->b_cont) == M_DATA);
- head = mp1->b_cont;
- mp1->b_cont = NULL;
- head->b_next = mp1->b_next;
- mp1->b_next = NULL;
- if (prev_head != NULL)
- prev_head->b_next = head;
- if (peer_tcp->tcp_rcv_list == mp1)
- peer_tcp->tcp_rcv_list = head;
- if (peer_tcp->tcp_rcv_last_head == mp1)
- peer_tcp->tcp_rcv_last_head = head;
- freeb(mp1);
- }
- prev_head = head;
- head = head->b_next;
- }
-}
-
-/*
- * Fusion output routine, called by tcp_output() and tcp_wput_proto().
- */
-static boolean_t
-tcp_fuse_output(tcp_t *tcp, mblk_t *mp)
-{
- tcp_t *peer_tcp = tcp->tcp_loopback_peer;
- queue_t *peer_rq;
- mblk_t *mp_tail = mp;
- uint32_t send_size = 0;
-
- ASSERT(tcp->tcp_fused);
- ASSERT(peer_tcp != NULL && peer_tcp->tcp_loopback_peer == tcp);
- ASSERT(tcp->tcp_connp->conn_sqp == peer_tcp->tcp_connp->conn_sqp);
- ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_PROTO ||
- DB_TYPE(mp) == M_PCPROTO);
-
- peer_rq = peer_tcp->tcp_rq;
-
- /* If this connection requires IP, unfuse and use regular path */
- if (TCP_LOOPBACK_IP(tcp) || TCP_LOOPBACK_IP(peer_tcp) ||
- IPP_ENABLED(IPP_LOCAL_OUT|IPP_LOCAL_IN)) {
- TCP_STAT(tcp_fusion_aborted);
- tcp_unfuse(tcp);
- return (B_FALSE);
- }
-
- for (;;) {
- if (DB_TYPE(mp_tail) == M_DATA)
- send_size += MBLKL(mp_tail);
- if (mp_tail->b_cont == NULL)
- break;
- mp_tail = mp_tail->b_cont;
- }
-
- if (send_size == 0) {
- freemsg(mp);
- return (B_TRUE);
- }
-
- /*
- * Handle urgent data; we either send up SIGURG to the peer now
- * or do it later when we drain, in case the peer is detached
- * or if we're short of memory for M_PCSIG mblk.
- */
- if (DB_TYPE(mp) != M_DATA)
- tcp_fuse_output_urg(tcp, mp);
-
- /*
- * Enqueue data into the peer's receive list; we may or may not
- * drain the contents depending on the conditions below.
- */
- tcp_rcv_enqueue(peer_tcp, mp, send_size);
-
- /* In case it wrapped around and also to keep it constant */
- peer_tcp->tcp_rwnd += send_size;
-
- /*
- * If peer is detached, exercise flow-control when needed; we will
- * get back-enabled either in tcp_accept_finish() or tcp_unfuse().
- */
- if (TCP_IS_DETACHED(peer_tcp) &&
- peer_tcp->tcp_rcv_cnt > peer_rq->q_hiwat) {
- tcp_setqfull(tcp);
- tcp->tcp_flow_stopped = B_TRUE;
- TCP_STAT(tcp_fusion_flowctl);
- }
-
- loopback_packets++;
- tcp->tcp_last_sent_len = send_size;
-
- /* Need to adjust the following SNMP MIB-related variables */
- tcp->tcp_snxt += send_size;
- tcp->tcp_suna = tcp->tcp_snxt;
- peer_tcp->tcp_rnxt += send_size;
- peer_tcp->tcp_rack = peer_tcp->tcp_rnxt;
-
- BUMP_MIB(&tcp_mib, tcpOutDataSegs);
- UPDATE_MIB(&tcp_mib, tcpOutDataBytes, send_size);
-
- BUMP_MIB(&tcp_mib, tcpInSegs);
- BUMP_MIB(&tcp_mib, tcpInDataInorderSegs);
- UPDATE_MIB(&tcp_mib, tcpInDataInorderBytes, send_size);
-
- BUMP_LOCAL(tcp->tcp_obsegs);
- BUMP_LOCAL(peer_tcp->tcp_ibsegs);
-
- if (!TCP_IS_DETACHED(peer_tcp)) {
- /*
- * If we can't send SIGURG above due to lack of memory,
- * schedule push timer and try again. Otherwise drain
- * the data if we're not flow-controlled.
- */
- if (peer_tcp->tcp_fused_sigurg) {
- if (peer_tcp->tcp_push_tid == 0) {
- peer_tcp->tcp_push_tid =
- TCP_TIMER(peer_tcp, tcp_push_timer,
- MSEC_TO_TICK(tcp_push_timer_interval));
- }
- } else if (!tcp->tcp_flow_stopped) {
- if (!canputnext(peer_rq)) {
- tcp_setqfull(tcp);
- tcp->tcp_flow_stopped = B_TRUE;
- TCP_STAT(tcp_fusion_flowctl);
- } else {
- ASSERT(peer_tcp->tcp_rcv_list != NULL);
- (void) tcp_fuse_rcv_drain(peer_rq,
- peer_tcp, NULL);
- TCP_STAT(tcp_fusion_putnext);
- }
- }
- }
- return (B_TRUE);
-}
-
-/*
- * This routine gets called to deliver data upstream on a fused or
- * previously fused tcp loopback endpoint; the latter happens only
- * when there is a pending SIGURG signal plus urgent data that can't
- * be sent upstream in the past.
- */
-static boolean_t
-tcp_fuse_rcv_drain(queue_t *q, tcp_t *tcp, mblk_t **sigurg_mpp)
-{
- mblk_t *mp;
-#ifdef DEBUG
- uint_t cnt = 0;
-#endif
-
- ASSERT(tcp->tcp_loopback);
- ASSERT(tcp->tcp_fused || tcp->tcp_fused_sigurg);
- ASSERT(!tcp->tcp_fused || tcp->tcp_loopback_peer != NULL);
- ASSERT(sigurg_mpp != NULL || tcp->tcp_fused);
-
- /* No need for the push timer now, in case it was scheduled */
- if (tcp->tcp_push_tid != 0) {
- (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid);
- tcp->tcp_push_tid = 0;
- }
- /*
- * If there's urgent data sitting in receive list and we didn't
- * get a chance to send up a SIGURG signal, make sure we send
- * it first before draining in order to ensure that SIOCATMARK
- * works properly.
- */
- if (tcp->tcp_fused_sigurg) {
- /*
- * sigurg_mpp is normally NULL, i.e. when we're still
- * fused and didn't get here because of tcp_unfuse().
- * In this case try hard to allocate the M_PCSIG mblk.
- */
- if (sigurg_mpp == NULL &&
- (mp = allocb(1, BPRI_HI)) == NULL &&
- (mp = allocb_tryhard(1)) == NULL) {
- /* Alloc failed; try again next time */
- tcp->tcp_push_tid = TCP_TIMER(tcp, tcp_push_timer,
- MSEC_TO_TICK(tcp_push_timer_interval));
- return (B_TRUE);
- } else if (sigurg_mpp != NULL) {
- /*
- * Use the supplied M_PCSIG mblk; it means we're
- * either unfused or in the process of unfusing,
- * and the drain must happen now.
- */
- mp = *sigurg_mpp;
- *sigurg_mpp = NULL;
- }
- ASSERT(mp != NULL);
-
- tcp->tcp_fused_sigurg = B_FALSE;
- /* Send up the signal */
- DB_TYPE(mp) = M_PCSIG;
- *mp->b_wptr++ = (uchar_t)SIGURG;
- putnext(q, mp);
- /*
- * Let the regular tcp_rcv_drain() path handle
- * draining the data if we're no longer fused.
- */
- if (!tcp->tcp_fused)
- return (B_FALSE);
- }
-
- /* Drain the data */
- while ((mp = tcp->tcp_rcv_list) != NULL) {
- tcp->tcp_rcv_list = mp->b_next;
- mp->b_next = NULL;
-#ifdef DEBUG
- cnt += msgdsize(mp);
-#endif
- putnext(q, mp);
- }
-
- ASSERT(cnt == tcp->tcp_rcv_cnt);
- tcp->tcp_rcv_last_head = NULL;
- tcp->tcp_rcv_last_tail = NULL;
- tcp->tcp_rcv_cnt = 0;
- tcp->tcp_rwnd = q->q_hiwat;
-
- return (B_TRUE);
-}
-
-/*
- * This is the walker function, which is TCP specific.
- * It walks through the conn_hash bucket searching for the
- * next valid connp/tcp in the list, selecting connp/tcp
- * which haven't closed or condemned. It also REFHOLDS the
- * reference for the tcp, ensuring that the tcp exists
- * when the caller uses the tcp.
- *
- * tcp_get_next_conn
- * get the next entry in the conn global list
- * and put a reference on the next_conn.
- * decrement the reference on the current conn.
- */
-conn_t *
-tcp_get_next_conn(connf_t *connfp, conn_t *connp)
-{
- conn_t *next_connp;
-
- if (connfp == NULL)
- return (NULL);
-
- mutex_enter(&connfp->connf_lock);
-
- next_connp = (connp == NULL) ?
- connfp->connf_head : connp->conn_g_next;
-
- while (next_connp != NULL) {
- mutex_enter(&next_connp->conn_lock);
- if ((next_connp->conn_state_flags &
- (CONN_CONDEMNED | CONN_INCIPIENT)) ||
- !IPCL_IS_TCP(next_connp)) {
- /*
- * This conn has been condemned or
- * is closing.
- */
- mutex_exit(&next_connp->conn_lock);
- next_connp = next_connp->conn_g_next;
- continue;
- }
- ASSERT(next_connp->conn_tcp != NULL);
- CONN_INC_REF_LOCKED(next_connp);
- mutex_exit(&next_connp->conn_lock);
- break;
- }
-
- mutex_exit(&connfp->connf_lock);
-
- if (connp != NULL) {
- CONN_DEC_REF(connp);
- }
-
- return (next_connp);
-}
-
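Callers drive the walker above with a simple loop, as the replacement ipcl_get_next_conn() call sites later in this patch do; the pattern is:

	connp = NULL;
	while ((connp = tcp_get_next_conn(connfp, connp)) != NULL) {
		/* connp is held here; connp->conn_tcp is safe to use */
	}
	/* a NULL return means the prior reference was already dropped */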
/*
* Figure out the value of the window scale option. Note that the rwnd is
* ASSUMED to be rounded up to the nearest MSS before the calculation.
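RFC 1323 caps the shift count at 14 and the unscaled window field at 65535; a minimal sketch of the computation described here (hypothetical helper name, constants written out):

static int
tcp_wscale_for(uint32_t rwnd)
{
	int ws = 0;

	/* smallest shift that makes rwnd fit in the 16-bit window field */
	while (ws < 14 && (rwnd >> ws) > 65535)
		ws++;
	return (ws);
}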
@@ -2808,7 +1950,7 @@ tcp_accept(tcp_t *listener, mblk_t *mp)
acceptor = tcp_acceptor_hash_lookup(acceptor_id);
if (acceptor == NULL) {
if (listener->tcp_debug) {
- (void) strlog(TCP_MODULE_ID, 0, 1,
+ (void) strlog(TCP_MOD_ID, 0, 1,
SL_ERROR|SL_TRACE,
"tcp_accept: did not find acceptor 0x%x\n",
acceptor_id);
@@ -3737,7 +2879,7 @@ tcp_bind(tcp_t *tcp, mblk_t *mp)
ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
if (tcp->tcp_debug) {
- (void) strlog(TCP_MODULE_ID, 0, 1, SL_ERROR|SL_TRACE,
+ (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
"tcp_bind: bad req, len %u",
(uint_t)(mp->b_wptr - mp->b_rptr));
}
@@ -3768,7 +2910,7 @@ tcp_bind(tcp_t *tcp, mblk_t *mp)
goto do_bind;
}
if (tcp->tcp_debug) {
- (void) strlog(TCP_MODULE_ID, 0, 1, SL_ERROR|SL_TRACE,
+ (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
"tcp_bind: bad state, %d", tcp->tcp_state);
}
tcp_err_ack(tcp, mp, TOUTSTATE, 0);
@@ -3805,7 +2947,7 @@ tcp_bind(tcp_t *tcp, mblk_t *mp)
sizeof (sin_t));
if (sin == NULL || !OK_32PTR((char *)sin)) {
if (tcp->tcp_debug) {
- (void) strlog(TCP_MODULE_ID, 0, 1,
+ (void) strlog(TCP_MOD_ID, 0, 1,
SL_ERROR|SL_TRACE,
"tcp_bind: bad address parameter, "
"offset %d, len %d",
@@ -3835,7 +2977,7 @@ tcp_bind(tcp_t *tcp, mblk_t *mp)
tbr->ADDR_offset, sizeof (sin6_t));
if (sin6 == NULL || !OK_32PTR((char *)sin6)) {
if (tcp->tcp_debug) {
- (void) strlog(TCP_MODULE_ID, 0, 1,
+ (void) strlog(TCP_MOD_ID, 0, 1,
SL_ERROR|SL_TRACE,
"tcp_bind: bad IPv6 address parameter, "
"offset %d, len %d", tbr->ADDR_offset,
@@ -3857,7 +2999,7 @@ tcp_bind(tcp_t *tcp, mblk_t *mp)
default:
if (tcp->tcp_debug) {
- (void) strlog(TCP_MODULE_ID, 0, 1, SL_ERROR|SL_TRACE,
+ (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
"tcp_bind: bad address length, %d",
tbr->ADDR_length);
}
@@ -3945,7 +3087,7 @@ tcp_bind(tcp_t *tcp, mblk_t *mp)
if (secpolicy_net_privaddr(cr, requested_port) != 0) {
if (tcp->tcp_debug) {
- (void) strlog(TCP_MODULE_ID, 0, 1,
+ (void) strlog(TCP_MOD_ID, 0, 1,
SL_ERROR|SL_TRACE,
"tcp_bind: no priv for port %d",
requested_port);
@@ -3963,7 +3105,7 @@ tcp_bind(tcp_t *tcp, mblk_t *mp)
if (allocated_port == 0) {
if (bind_to_req_port_only) {
if (tcp->tcp_debug) {
- (void) strlog(TCP_MODULE_ID, 0, 1,
+ (void) strlog(TCP_MOD_ID, 0, 1,
SL_ERROR|SL_TRACE,
"tcp_bind: requested addr busy");
}
@@ -3971,7 +3113,7 @@ tcp_bind(tcp_t *tcp, mblk_t *mp)
} else {
/* If we are out of ports, fail the bind. */
if (tcp->tcp_debug) {
- (void) strlog(TCP_MODULE_ID, 0, 1,
+ (void) strlog(TCP_MOD_ID, 0, 1,
SL_ERROR|SL_TRACE,
"tcp_bind: out of ports?");
}
@@ -4436,7 +3578,7 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag)
(void) putnextctl1(q, M_FLUSH, FLUSHR);
}
if (tcp->tcp_debug) {
- (void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE|SL_ERROR,
+ (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
"tcp_clean_death: discon err %d", err);
}
mp = mi_tpi_discon_ind(NULL, err, 0);
@@ -4444,7 +3586,7 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag)
putnext(q, mp);
} else {
if (tcp->tcp_debug) {
- (void) strlog(TCP_MODULE_ID, 0, 1,
+ (void) strlog(TCP_MOD_ID, 0, 1,
SL_ERROR|SL_TRACE,
"tcp_clean_death, sending M_ERROR");
}
@@ -4476,7 +3618,6 @@ tcp_stop_lingering(tcp_t *tcp)
if (tcp->tcp_state > TCPS_LISTEN) {
tcp_acceptor_hash_remove(tcp);
if (tcp->tcp_flow_stopped) {
- tcp->tcp_flow_stopped = B_FALSE;
tcp_clrqfull(tcp);
}
@@ -4621,23 +3762,6 @@ tcp_close(queue_t *q, int flags)
return (0);
}
-int
-tcp_modclose(queue_t *q)
-{
- conn_t *connp = Q_TO_CONN(q);
- ASSERT((connp->conn_flags & IPCL_TCPMOD) != 0);
-
- qprocsoff(q);
-
- if (connp->conn_cred != NULL) {
- crfree(connp->conn_cred);
- connp->conn_cred = NULL;
- }
- CONN_DEC_REF(connp);
- q->q_ptr = WR(q)->q_ptr = NULL;
- return (0);
-}
-
static int
tcpclose_accept(queue_t *q)
{
@@ -4798,7 +3922,6 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2)
tcp_acceptor_hash_remove(tcp);
if (tcp->tcp_flow_stopped) {
- tcp->tcp_flow_stopped = B_FALSE;
tcp_clrqfull(tcp);
}
@@ -4922,7 +4045,7 @@ tcp_close_detached(tcp_t *tcp)
/*
* Stop all TCP timers, and free the timer mblks if requested.
*/
-static void
+void
tcp_timers_stop(tcp_t *tcp)
{
if (tcp->tcp_timer_tid != 0) {
@@ -5285,7 +4408,7 @@ tcp_drop_q0(tcp_t *tcp)
return (B_FALSE);
if (tcp->tcp_debug) {
- (void) strlog(TCP_MODULE_ID, 0, 3, SL_TRACE,
+ (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE,
"tcp_drop_q0: listen half-open queue (max=%d) overflow"
" (%d pending) on %s, drop one", tcp_conn_req_max_q0,
tcp->tcp_conn_req_cnt_q0,
@@ -5371,8 +4494,8 @@ tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
connp->conn_remv6 = ip6h->ip6_src;
/* db_cksumstuff is set at ip_fanout_tcp_v6 */
- ifindex = (int)mp->b_datap->db_cksumstuff;
- mp->b_datap->db_cksumstuff = 0;
+ ifindex = (int)DB_CKSUMSTUFF(mp);
+ DB_CKSUMSTUFF(mp) = 0;
sin6 = sin6_null;
sin6.sin6_addr = ip6h->ip6_src;
@@ -5727,8 +4850,8 @@ tcp_get_ipsec_conn(tcp_t *tcp, squeue_t *sqp, mblk_t **mpp)
mp->b_datap->db_struioflag &= ~STRUIO_POLICY;
}
- new_sqp = (squeue_t *)mp->b_datap->db_cksumstart;
- mp->b_datap->db_cksumstart = 0;
+ new_sqp = (squeue_t *)DB_CKSUMSTART(mp);
+ DB_CKSUMSTART(mp) = 0;
ASSERT(OK_32PTR(mp->b_rptr));
ipvers = IPH_HDR_VERSION(mp->b_rptr);
@@ -6012,7 +5135,7 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2)
TCP_STAT(tcp_listendrop);
BUMP_MIB(&tcp_mib, tcpListenDrop);
if (tcp->tcp_debug) {
- (void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE|SL_ERROR,
+ (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
"tcp_conn_request: listen backlog (max=%d) "
"overflow (%d pending) on %s",
tcp->tcp_conn_req_max, tcp->tcp_conn_req_cnt_q,
@@ -6037,7 +5160,7 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2)
mutex_exit(&tcp->tcp_eager_lock);
BUMP_MIB(&tcp_mib, tcpListenDropQ0);
if (tcp->tcp_debug) {
- (void) strlog(TCP_MODULE_ID, 0, 3, SL_TRACE,
+ (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE,
"tcp_conn_request: listen half-open queue "
"(max=%d) full (%d pending) on %s",
tcp_conn_req_max_q0,
@@ -6058,8 +5181,8 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2)
* otherwise an error case if neither of them is set.
*/
if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) {
- new_sqp = (squeue_t *)mp->b_datap->db_cksumstart;
- mp->b_datap->db_cksumstart = 0;
+ new_sqp = (squeue_t *)DB_CKSUMSTART(mp);
+ DB_CKSUMSTART(mp) = 0;
mp->b_datap->db_struioflag &= ~STRUIO_EAGER;
econnp = (conn_t *)tcp_get_conn(arg2);
if (econnp == NULL)
@@ -6420,7 +5543,7 @@ tcp_conn_request_unbound(void *arg, mblk_t *mp, void *arg2)
uint32_t conn_flags;
if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) {
- new_sqp = (squeue_t *)mp->b_datap->db_cksumstart;
+ new_sqp = (squeue_t *)DB_CKSUMSTART(mp);
} else {
goto done;
}
@@ -7174,7 +6297,7 @@ tcp_disconnect(tcp_t *tcp, mblk_t *mp)
*/
if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_hard_binding) {
if (tcp->tcp_debug) {
- (void) strlog(TCP_MODULE_ID, 0, 1, SL_ERROR|SL_TRACE,
+ (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
"tcp_disconnect: bad state, %d", tcp->tcp_state);
}
tcp_err_ack(tcp, mp, TOUTSTATE, 0);
@@ -7988,10 +7111,6 @@ tcp_reinit(tcp_t *tcp)
/* Cancel outstanding timers */
tcp_timers_stop(tcp);
- if (tcp->tcp_flow_stopped) {
- tcp->tcp_flow_stopped = B_FALSE;
- tcp_clrqfull(tcp);
- }
/*
* Reset everything in the state vector, after updating global
* MIB data from instance counters.
@@ -8006,6 +7125,10 @@ tcp_reinit(tcp_t *tcp)
tcp_zcopy_notify(tcp);
tcp->tcp_xmit_last = tcp->tcp_xmit_tail = NULL;
tcp->tcp_unsent = tcp->tcp_xmit_tail_unsent = 0;
+ if (tcp->tcp_flow_stopped &&
+ TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) {
+ tcp_clrqfull(tcp);
+ }
tcp_close_mpp(&tcp->tcp_reass_head);
tcp->tcp_reass_tail = NULL;
if (tcp->tcp_rcv_list != NULL) {
@@ -8193,7 +7316,6 @@ tcp_reinit_values(tcp)
tcp->tcp_fin_sent = 0;
tcp->tcp_ordrel_done = 0;
- ASSERT(tcp->tcp_flow_stopped == 0);
tcp->tcp_debug = 0;
tcp->tcp_dontroute = 0;
tcp->tcp_broadcast = 0;
@@ -8390,14 +7512,22 @@ tcp_reinit_values(tcp)
ASSERT(tcp->tcp_rthdrlen == 0);
PRESERVE(tcp->tcp_drop_opt_ack_cnt);
+ /* Reset fusion-related fields */
tcp->tcp_fused = B_FALSE;
tcp->tcp_unfusable = B_FALSE;
tcp->tcp_fused_sigurg = B_FALSE;
+ tcp->tcp_direct_sockfs = B_FALSE;
+ tcp->tcp_fuse_syncstr_stopped = B_FALSE;
tcp->tcp_loopback_peer = NULL;
+ tcp->tcp_fuse_rcv_hiwater = 0;
+ tcp->tcp_fuse_rcv_unread_hiwater = 0;
+ tcp->tcp_fuse_rcv_unread_cnt = 0;
tcp->tcp_in_ack_unsent = 0;
tcp->tcp_cork = B_FALSE;
+ PRESERVE(tcp->tcp_squeue_bytes);
+
#undef DONTCARE
#undef PRESERVE
}
@@ -8469,10 +7599,16 @@ tcp_init_values(tcp_t *tcp)
tcp->tcp_mdt_hdr_head = 0;
tcp->tcp_mdt_hdr_tail = 0;
+ /* Reset fusion-related fields */
tcp->tcp_fused = B_FALSE;
tcp->tcp_unfusable = B_FALSE;
tcp->tcp_fused_sigurg = B_FALSE;
+ tcp->tcp_direct_sockfs = B_FALSE;
+ tcp->tcp_fuse_syncstr_stopped = B_FALSE;
tcp->tcp_loopback_peer = NULL;
+ tcp->tcp_fuse_rcv_hiwater = 0;
+ tcp->tcp_fuse_rcv_unread_hiwater = 0;
+ tcp->tcp_fuse_rcv_unread_cnt = 0;
/* Initialize the header template */
if (tcp->tcp_ipversion == IPV4_VERSION) {
@@ -9505,7 +8641,7 @@ tcp_keepalive_killer(void *arg)
MSEC_TO_TICK(firetime));
}
-static int
+int
tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk)
{
queue_t *q = tcp->tcp_rq;
@@ -9515,7 +8651,10 @@ tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk)
if (TCP_IS_DETACHED(tcp))
return (mss);
- if (tcp->tcp_mdt || tcp->tcp_maxpsz == 0) {
+ if (tcp->tcp_fused) {
+ maxpsz = tcp_fuse_maxpsz_set(tcp);
+ mss = INFPSZ;
+ } else if (tcp->tcp_mdt || tcp->tcp_maxpsz == 0) {
/*
* Set the sd_qn_maxpsz according to the socket send buffer
* size, and sd_maxblk to INFPSZ (-1). This will essentially
@@ -9545,9 +8684,6 @@ tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk)
if (set_maxblk)
(void) mi_set_sth_maxblk(q, mss);
- if (tcp->tcp_loopback)
- (void) mi_set_sth_copyopt(tcp->tcp_rq, COPYCACHED);
-
return (mss);
}
@@ -9868,7 +9004,6 @@ tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
*/
connp->conn_flags |= IPCL_SOCKET;
tcp->tcp_issocket = 1;
-
WR(q)->q_qinfo = &tcp_sock_winit;
} else {
#ifdef _ILP32
@@ -10452,32 +9587,45 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name,
if (!checkonly)
tcp->tcp_dgram_errind = onoff;
break;
- case SO_SNDBUF:
+ case SO_SNDBUF: {
+ tcp_t *peer_tcp;
+
if (*i1 > tcp_max_buf) {
*outlenp = 0;
return (ENOBUFS);
}
- if (!checkonly) {
- tcp->tcp_xmit_hiwater = *i1;
- if (tcp_snd_lowat_fraction != 0)
- tcp->tcp_xmit_lowater =
- tcp->tcp_xmit_hiwater /
- tcp_snd_lowat_fraction;
- (void) tcp_maxpsz_set(tcp, B_TRUE);
- /*
- * If we are flow-controlled, recheck the
- * condition. There are apps that increase
- * SO_SNDBUF size when flow-controlled
- * (EWOULDBLOCK), and expect the flow control
- * condition to be lifted right away.
- */
- if (tcp->tcp_flow_stopped &&
- tcp->tcp_unsent < tcp->tcp_xmit_hiwater) {
- tcp->tcp_flow_stopped = B_FALSE;
- tcp_clrqfull(tcp);
- }
+ if (checkonly)
+ break;
+
+ tcp->tcp_xmit_hiwater = *i1;
+ if (tcp_snd_lowat_fraction != 0)
+ tcp->tcp_xmit_lowater =
+ tcp->tcp_xmit_hiwater /
+ tcp_snd_lowat_fraction;
+ (void) tcp_maxpsz_set(tcp, B_TRUE);
+ /*
+ * If we are flow-controlled, recheck the condition.
+ * There are apps that increase SO_SNDBUF size when
+ * flow-controlled (EWOULDBLOCK), and expect the flow
+ * control condition to be lifted right away.
+ *
+ * For the fused tcp loopback case, in order to avoid
+ * a race with the peer's tcp_fuse_rrw() we need to
+ * hold its fuse_lock while accessing tcp_flow_stopped.
+ */
+ peer_tcp = tcp->tcp_loopback_peer;
+ ASSERT(!tcp->tcp_fused || peer_tcp != NULL);
+ if (tcp->tcp_fused)
+ mutex_enter(&peer_tcp->tcp_fuse_lock);
+
+ if (tcp->tcp_flow_stopped &&
+ TCP_UNSENT_BYTES(tcp) < tcp->tcp_xmit_hiwater) {
+ tcp_clrqfull(tcp);
}
+ if (tcp->tcp_fused)
+ mutex_exit(&peer_tcp->tcp_fuse_lock);
break;
+ }
case SO_RCVBUF:
if (*i1 > tcp_max_buf) {
*outlenp = 0;
@@ -11892,7 +11040,7 @@ tcp_rcv_drain(queue_t *q, tcp_t *tcp)
* M_DATA messages are added to the current element.
* Other messages are added as new (b_next) elements.
*/
-static void
+void
tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len)
{
ASSERT(seg_len == msgdsize(mp));
@@ -12380,7 +11528,7 @@ tcp_check_policy(tcp_t *tcp, mblk_t *first_mp, ipha_t *ipha, ip6_t *ip6h,
BUMP_MIB(&ip_mib, ipsecInSucceeded);
return (B_TRUE);
}
- (void) strlog(TCP_MODULE_ID, 0, 0, SL_ERROR|SL_WARN|SL_CONSOLE,
+ (void) strlog(TCP_MOD_ID, 0, 0, SL_ERROR|SL_WARN|SL_CONSOLE,
"tcp inbound policy mismatch: %s, packet dropped\n",
reason);
BUMP_MIB(&ip_mib, ipsecInFailed);
@@ -13469,7 +12617,7 @@ try_again:;
*/
seg_len -= gap;
if (tcp->tcp_debug) {
- (void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE,
+ (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
"tcp_rput: unacceptable, gap %d, rgap %d, "
"flags 0x%x, seg_seq %u, seg_ack %u, "
"seg_len %d, rnxt %u, snxt %u, %s",
@@ -13873,7 +13021,7 @@ ok:;
tcp->tcp_urp_mark_mp = mp1;
flags |= TH_SEND_URP_MARK;
#ifdef DEBUG
- (void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE,
+ (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
"tcp_rput: sent M_PCSIG 2 seq %x urp %x "
"last %x, %s",
seg_seq, urp, tcp->tcp_urp_last,
@@ -14012,7 +13160,7 @@ ok:;
mp1->b_wptr = (uchar_t *)&tei[1];
tcp->tcp_urp_mp = mp1;
#ifdef DEBUG
- (void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE,
+ (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
"tcp_rput: allocated exdata_ind %s",
tcp_display(tcp, NULL,
DISP_PORT_ONLY));
@@ -14059,7 +13207,7 @@ ok:;
tcp->tcp_urp_mark_mp->b_flag |= MSGMARKNEXT;
}
#ifdef DEBUG
- (void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE,
+ (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
"tcp_rput: AT MARK, len %d, flags 0x%x, %s",
seg_len, flags,
tcp_display(tcp, NULL, DISP_PORT_ONLY));
@@ -14067,7 +13215,7 @@ ok:;
} else {
/* Data left until we hit mark */
#ifdef DEBUG
- (void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE,
+ (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
"tcp_rput: URP %d bytes left, %s",
urp - seg_len, tcp_display(tcp, NULL,
DISP_PORT_ONLY));
@@ -14990,7 +14138,7 @@ est:
/* Ready for a new signal. */
tcp->tcp_urp_last_valid = B_FALSE;
#ifdef DEBUG
- (void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE,
+ (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
"tcp_rput: sending exdata_ind %s",
tcp_display(tcp, NULL, DISP_PORT_ONLY));
#endif /* DEBUG */
@@ -15026,7 +14174,7 @@ est:
tcp->tcp_fused_sigurg);
if (flags & TH_MARKNEXT_NEEDED) {
#ifdef DEBUG
- (void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE,
+ (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
"tcp_rput: sending MSGMARKNEXT %s",
tcp_display(tcp, NULL,
DISP_PORT_ONLY));
@@ -15167,7 +14315,7 @@ ack_check:
mp1 = tcp->tcp_urp_mark_mp;
tcp->tcp_urp_mark_mp = NULL;
#ifdef DEBUG
- (void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE,
+ (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
"tcp_rput: sending zero-length %s %s",
((mp1->b_flag & MSGMARKNEXT) ? "MSGMARKNEXT" :
"MSGNOTMARKNEXT"),
@@ -15853,7 +15001,7 @@ tcp_rput_other(tcp_t *tcp, mblk_t *mp)
return;
case T_ERROR_ACK:
if (tcp->tcp_debug) {
- (void) strlog(TCP_MODULE_ID, 0, 1,
+ (void) strlog(TCP_MOD_ID, 0, 1,
SL_TRACE|SL_ERROR,
"tcp_rput_other: case T_ERROR_ACK, "
"ERROR_prim == %d",
@@ -15984,11 +15132,20 @@ tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2)
ASSERT(tcp->tcp_connp->conn_sqp ==
peer_tcp->tcp_connp->conn_sqp);
+ /*
+ * Normally we would not get backenabled in synchronous
+ * streams mode, but in case this happens, we need to stop
+ * synchronous streams temporarily to prevent a race with
+ * tcp_fuse_rrw() or tcp_fuse_rinfop(). It is safe to access
+ * tcp_rcv_list here because those entry points will return
+ * right away when synchronous streams is stopped.
+ */
+ TCP_FUSE_SYNCSTR_STOP(tcp);
if (tcp->tcp_rcv_list != NULL)
(void) tcp_rcv_drain(tcp->tcp_rq, tcp);
tcp_clrqfull(peer_tcp);
- peer_tcp->tcp_flow_stopped = B_FALSE;
+ TCP_FUSE_SYNCSTR_RESUME(tcp);
TCP_STAT(tcp_fusion_backenabled);
return;
}
@@ -16118,6 +15275,30 @@ tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd)
uint32_t max_transmittable_rwnd;
boolean_t tcp_detached = TCP_IS_DETACHED(tcp);
+ if (tcp->tcp_fused) {
+ size_t sth_hiwat;
+ tcp_t *peer_tcp = tcp->tcp_loopback_peer;
+
+ ASSERT(peer_tcp != NULL);
+ /*
+ * Record the stream head's high water mark for
+ * this endpoint; this is used for flow-control
+ * purposes in tcp_fuse_output().
+ */
+ sth_hiwat = tcp_fuse_set_rcv_hiwat(tcp, rwnd);
+ if (!tcp_detached)
+ (void) mi_set_sth_hiwat(tcp->tcp_rq, sth_hiwat);
+
+ /*
+ * In the fusion case, the maxpsz stream head value of
+ * our peer is set according to its send buffer size
+ * and our receive buffer size; since the latter may
+ * have changed we need to update the peer's maxpsz.
+ */
+ (void) tcp_maxpsz_set(peer_tcp, B_TRUE);
+ return (rwnd);
+ }
+
if (tcp_detached)
old_max_rwnd = tcp->tcp_rwnd;
else
@@ -16196,23 +15377,16 @@ tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd)
* Set the Stream head high water mark. This doesn't have to be
* here, since we are simply using default values, but we would
* prefer to choose these values algorithmically, with a likely
- * relationship to rwnd. For fused loopback tcp, we double the
- * amount of buffer in order to simulate the normal tcp case.
+ * relationship to rwnd.
*/
- if (tcp->tcp_fused) {
- (void) mi_set_sth_hiwat(tcp->tcp_rq, MAX(rwnd << 1,
- tcp_sth_rcv_hiwat));
- } else {
- (void) mi_set_sth_hiwat(tcp->tcp_rq, MAX(rwnd,
- tcp_sth_rcv_hiwat));
- }
+ (void) mi_set_sth_hiwat(tcp->tcp_rq, MAX(rwnd, tcp_sth_rcv_hiwat));
return (rwnd);
}
/*
* Return SNMP stuff in buffer in mpdata.
*/
-static int
+int
tcp_snmp_get(queue_t *q, mblk_t *mpctl)
{
mblk_t *mpdata;
@@ -16261,7 +15435,8 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl)
connp = NULL;
- while ((connp = tcp_get_next_conn(connfp, connp))) {
+ while ((connp =
+ ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) {
tcp_t *tcp;
if (connp->conn_zoneid != zoneid)
@@ -16406,7 +15581,7 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl)
/* Return 0 if invalid set request, 1 otherwise, including non-tcp requests */
/* ARGSUSED */
-static int
+int
tcp_snmp_set(queue_t *q, int level, int name, uchar_t *ptr, int len)
{
mib2_tcpConnEntry_t *tce = (mib2_tcpConnEntry_t *)ptr;
@@ -16627,7 +15802,8 @@ tcp_status_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
connp = NULL;
- while ((connp = tcp_get_next_conn(connfp, connp))) {
+ while ((connp =
+ ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) {
tcp = connp->conn_tcp;
if (zoneid != GLOBAL_ZONEID &&
zoneid != connp->conn_zoneid)
@@ -16723,7 +15899,8 @@ tcp_listen_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
for (i = 0; i < ipcl_bind_fanout_size; i++) {
connfp = &ipcl_bind_fanout[i];
connp = NULL;
- while ((connp = tcp_get_next_conn(connfp, connp))) {
+ while ((connp =
+ ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) {
tcp = connp->conn_tcp;
if (zoneid != GLOBAL_ZONEID &&
zoneid != connp->conn_zoneid)
@@ -16770,7 +15947,8 @@ tcp_conn_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
for (i = 0; i < ipcl_conn_fanout_size; i++) {
connfp = &ipcl_conn_fanout[i];
connp = NULL;
- while ((connp = tcp_get_next_conn(connfp, connp))) {
+ while ((connp =
+ ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) {
tcp = connp->conn_tcp;
if (zoneid != GLOBAL_ZONEID &&
zoneid != connp->conn_zoneid)
@@ -16927,7 +16105,7 @@ tcp_timer(void *arg)
*/
if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) {
if (tcp->tcp_debug) {
- (void) strlog(TCP_MODULE_ID, 0, 1,
+ (void) strlog(TCP_MOD_ID, 0, 1,
SL_TRACE, "tcp_timer: zero win");
}
} else {
@@ -17040,7 +16218,7 @@ tcp_timer(void *arg)
return;
default:
if (tcp->tcp_debug) {
- (void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE|SL_ERROR,
+ (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
"tcp_timer: strange state (%d) %s",
tcp->tcp_state, tcp_display(tcp, NULL,
DISP_PORT_ONLY));
@@ -17372,52 +16550,6 @@ tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2)
}
/*
- * Write side put procedure for TCP module instance.
- * TCP as a module is only used for MIB browsers that push TCP over IP or
- * ARP. The only supported primitives are T_SVR4_OPTMGMT_REQ and
- * T_OPTMGMT_REQ. M_FLUSH messages are only passed downstream; we don't flush
- * our queues as we never enqueue messages there. All ioctls are NAKed and
- * everything else is freed.
- */
-static void
-tcp_wput_mod(queue_t *q, mblk_t *mp)
-{
- switch (DB_TYPE(mp)) {
- case M_PROTO:
- case M_PCPROTO:
- if ((MBLKL(mp) >= sizeof (t_scalar_t)) &&
- ((((union T_primitives *)mp->b_rptr)->type ==
- T_SVR4_OPTMGMT_REQ) ||
- (((union T_primitives *)mp->b_rptr)->type ==
- T_OPTMGMT_REQ))) {
- /*
- * This is the only TPI primitive supported. Its
- * handling does not require tcp_t, but it does require
- * conn_t to check permissions.
- */
- cred_t *cr = DB_CREDDEF(mp, Q_TO_CONN(q)->conn_cred);
- if (!snmpcom_req(q, mp, tcp_snmp_set,
- tcp_snmp_get, cr)) {
- freemsg(mp);
- return;
- }
- } else if ((mp = mi_tpi_err_ack_alloc(mp, TPROTO, ENOTSUP))
- != NULL)
- qreply(q, mp);
- break;
- case M_FLUSH:
- putnext(q, mp);
- break;
- case M_IOCTL:
- miocnak(q, mp, 0, ENOTSUP);
- break;
- default:
- freemsg(mp);
- break;
- }
-}
-
-/*
* The TCP fast path write put procedure.
* NOTE: the logic of the fast path is duplicated from tcp_wput_data()
*/
@@ -17441,6 +16573,7 @@ tcp_output(void *arg, mblk_t *mp, void *arg2)
int usable;
conn_t *connp = (conn_t *)arg;
tcp_t *tcp = connp->conn_tcp;
+ uint32_t msize;
/*
* Try and ASSERT the minimum possible references on the
@@ -17455,8 +16588,15 @@ tcp_output(void *arg, mblk_t *mp, void *arg2)
(connp->conn_fanout == NULL && connp->conn_ref >= 3));
/* Bypass tcp protocol for fused tcp loopback */
- if (tcp->tcp_fused && tcp_fuse_output(tcp, mp))
- return;
+ if (tcp->tcp_fused) {
+ msize = msgdsize(mp);
+ mutex_enter(&connp->conn_lock);
+ tcp->tcp_squeue_bytes -= msize;
+ mutex_exit(&connp->conn_lock);
+
+ if (tcp_fuse_output(tcp, mp, msize))
+ return;
+ }
mss = tcp->tcp_mss;
if (tcp->tcp_xmit_zc_clean)
@@ -17482,6 +16622,11 @@ tcp_output(void *arg, mblk_t *mp, void *arg2)
(len == 0) ||
(len > mss) ||
(tcp->tcp_valid_bits != 0)) {
+ msize = msgdsize(mp);
+ mutex_enter(&connp->conn_lock);
+ tcp->tcp_squeue_bytes -= msize;
+ mutex_exit(&connp->conn_lock);
+
tcp_wput_data(tcp, mp, B_FALSE);
return;
}
@@ -17489,6 +16634,10 @@ tcp_output(void *arg, mblk_t *mp, void *arg2)
ASSERT(tcp->tcp_xmit_tail_unsent == 0);
ASSERT(tcp->tcp_fin_sent == 0);
+ mutex_enter(&connp->conn_lock);
+ tcp->tcp_squeue_bytes -= len;
+ mutex_exit(&connp->conn_lock);
+
/* queue new packet onto retransmission queue */
if (tcp->tcp_xmit_head == NULL) {
tcp->tcp_xmit_head = mp;
@@ -17536,6 +16685,11 @@ tcp_output(void *arg, mblk_t *mp, void *arg2)
goto slow;
}
+ if (tcp->tcp_flow_stopped &&
+ TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) {
+ tcp_clrqfull(tcp);
+ }
+
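Together with the tcp_setqfull() call added to tcp_wput() later in this patch, this forms a simple hysteresis on the connection's unsent byte count (sketch; TCP_UNSENT_BYTES is assumed to cover both the xmit list and squeue-pending bytes):

	/* producer side, on enqueue in tcp_wput() */
	if (TCP_UNSENT_BYTES(tcp) > tcp->tcp_xmit_hiwater)
		tcp_setqfull(tcp);
	/* consumer side, once data has been transmitted */
	if (tcp->tcp_flow_stopped &&
	    TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater)
		tcp_clrqfull(tcp);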
/*
* determine if anything to send (Nagle).
*
@@ -17789,6 +16943,13 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2)
mp = NULL;
/*
+ * For a loopback connection with tcp_direct_sockfs on, note that
+ * we don't have to protect tcp_rcv_list yet because synchronous
+ * streams has not yet been enabled and tcp_fuse_rrw() cannot
+ * possibly race with us.
+ */
+
+ /*
* Set the max window size (tcp_rq->q_hiwat) of the acceptor
* properly. This is the first time we know of the acceptor's
* queue. So we do it here.
@@ -17828,9 +16989,8 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2)
/* Allocate room for SACK options if needed. */
stropt->so_flags |= SO_WROFF;
if (tcp->tcp_fused) {
- size_t sth_hiwat;
-
ASSERT(tcp->tcp_loopback);
+ ASSERT(tcp->tcp_loopback_peer != NULL);
/*
* For fused tcp loopback, set the stream head's write
* offset value to zero since we won't be needing any room
@@ -17839,16 +16999,16 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2)
* Non-fused tcp loopback case is handled separately below.
*/
stropt->so_wroff = 0;
-
/*
- * Override q_hiwat and set it to be twice that of the
- * previous value; this is to simulate non-fusion case.
+ * Record the stream head's high water mark for this endpoint;
+ * this is used for flow-control purposes in tcp_fuse_output().
*/
- sth_hiwat = q->q_hiwat << 1;
- if (sth_hiwat > tcp_max_buf)
- sth_hiwat = tcp_max_buf;
-
- stropt->so_hiwat = MAX(sth_hiwat, tcp_sth_rcv_hiwat);
+ stropt->so_hiwat = tcp_fuse_set_rcv_hiwat(tcp, q->q_hiwat);
+ /*
+ * Update the peer's transmit parameters according to
+ * our recently calculated high water mark value.
+ */
+ (void) tcp_maxpsz_set(tcp->tcp_loopback_peer, B_TRUE);
} else if (tcp->tcp_snd_sack_ok) {
stropt->so_wroff = tcp->tcp_hdr_len + TCPOPT_MAX_SACK_LEN +
(tcp->tcp_loopback ? 0 : tcp_wroff_xtra);
@@ -17857,15 +17017,6 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2)
tcp_wroff_xtra);
}
- /*
- * If loopback, set COPYCACHED option to make sure NOT to use
- * non-temporal access.
- */
- if (tcp->tcp_loopback) {
- stropt->so_flags |= SO_COPYOPT;
- stropt->so_copyopt = COPYCACHED;
- }
-
/* Send the options up */
putnext(q, stropt_mp);
@@ -17909,7 +17060,6 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2)
ASSERT(peer_tcp->tcp_fused);
tcp_clrqfull(peer_tcp);
- peer_tcp->tcp_flow_stopped = B_FALSE;
TCP_STAT(tcp_fusion_backenabled);
}
}
@@ -17924,11 +17074,9 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2)
* tcp_clean_death was deferred
* for T_ORDREL_IND - do it now
*/
- (void) tcp_clean_death(
- tcp,
- tcp->tcp_client_errno, 21);
- tcp->tcp_deferred_clean_death =
- B_FALSE;
+ (void) tcp_clean_death(tcp,
+ tcp->tcp_client_errno, 21);
+ tcp->tcp_deferred_clean_death = B_FALSE;
}
} else {
/*
@@ -17942,8 +17090,14 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2)
tcp->tcp_hard_binding = B_FALSE;
tcp->tcp_hard_bound = B_TRUE;
}
+
tcp->tcp_detached = B_FALSE;
+ /* We can enable synchronous streams now */
+ if (tcp->tcp_fused) {
+ tcp_fuse_syncstr_enable_pair(tcp);
+ }
+
if (tcp->tcp_ka_enabled) {
tcp->tcp_ka_last_intrvl = 0;
tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer,
@@ -18236,7 +17390,7 @@ tcp_wput_accept(queue_t *q, mblk_t *mp)
}
}
-static void
+void
tcp_wput(queue_t *q, mblk_t *mp)
{
conn_t *connp = Q_TO_CONN(q);
@@ -18245,12 +17399,27 @@ tcp_wput(queue_t *q, mblk_t *mp)
t_scalar_t type;
uchar_t *rptr;
struct iocblk *iocp;
+ uint32_t msize;
ASSERT(connp->conn_ref >= 2);
switch (DB_TYPE(mp)) {
case M_DATA:
- CONN_INC_REF(connp);
+ tcp = connp->conn_tcp;
+ ASSERT(tcp != NULL);
+
+ msize = msgdsize(mp);
+
+ mutex_enter(&connp->conn_lock);
+ CONN_INC_REF_LOCKED(connp);
+
+ tcp->tcp_squeue_bytes += msize;
+ if (TCP_UNSENT_BYTES(tcp) > tcp->tcp_xmit_hiwater) {
+ mutex_exit(&connp->conn_lock);
+ tcp_setqfull(tcp);
+ } else
+ mutex_exit(&connp->conn_lock);
+
(*tcp_squeue_wput_proc)(connp->conn_sqp, mp,
tcp_output, connp, SQTAG_TCP_OUTPUT);
return;
@@ -18265,7 +17434,7 @@ tcp_wput(queue_t *q, mblk_t *mp)
type = ((union T_primitives *)rptr)->type;
} else {
if (tcp->tcp_debug) {
- (void) strlog(TCP_MODULE_ID, 0, 1,
+ (void) strlog(TCP_MOD_ID, 0, 1,
SL_ERROR|SL_TRACE,
"tcp_wput_proto, dropping one...");
}
@@ -18292,7 +17461,7 @@ tcp_wput(queue_t *q, mblk_t *mp)
/*
* Most ioctls can be processed right away without going via
* squeues - process them right here. Those that do require
- * squeue (currently TCP_IOC_DEFAULT_Q and SIOCPOPSOCKFS)
+ * squeue (currently TCP_IOC_DEFAULT_Q and _SIOCSOCKFALLBACK)
* are processed by tcp_wput_ioctl().
*/
iocp = (struct iocblk *)mp->b_rptr;
@@ -18372,7 +17541,7 @@ tcp_wput_sock(queue_t *wq, mblk_t *mp)
ASSERT(wq->q_qinfo == &tcp_sock_winit);
wq->q_qinfo = &tcp_winit;
- ASSERT(IS_TCP_CONN(connp));
+ ASSERT(IPCL_IS_TCP(connp));
ASSERT(TCP_IS_SOCKET(tcp));
if (DB_TYPE(mp) == M_PCPROTO &&
@@ -18540,7 +17709,6 @@ tcp_zcopy_notify(tcp_t *tcp)
mutex_exit(&stp->sd_lock);
}
-
static void
tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp)
{
@@ -18555,7 +17723,6 @@ tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp)
uint32_t hcksum_txflags = 0;
mblk_t *ire_fp_mp;
uint_t ire_fp_mp_len;
- ill_poll_capab_t *ill_poll;
ASSERT(DB_TYPE(mp) == M_DATA);
@@ -18699,7 +17866,7 @@ tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp)
*/
}
- if ((ill->ill_capabilities & ILL_CAPAB_HCKSUM) && dohwcksum) {
+ if (ILL_HCKSUM_CAPABLE(ill) && dohwcksum) {
ASSERT(ill->ill_hcksum_capab != NULL);
hcksum_txflags = ill->ill_hcksum_capab->ill_hcksum_txflags;
}
@@ -18710,53 +17877,21 @@ tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp)
ASSERT(ipha->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION);
up = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
- /*
- * Underlying interface supports hardware checksum offload for
- * the tcp payload, along with M_DATA fast path; leave the payload
- * checksum for the hardware to calculate.
- *
- * N.B: We only need to set up checksum info on the first mblk.
- */
- if (hcksum_txflags & HCKSUM_INET_FULL_V4) {
- /*
- * Hardware calculates pseudo-header, header and payload
- * checksums, so clear checksum field in TCP header.
- */
- *up = 0;
- mp->b_datap->db_struioun.cksum.flags |= HCK_FULLCKSUM;
- } else if (hcksum_txflags & HCKSUM_INET_PARTIAL) {
- uint32_t sum;
- /*
- * Partial checksum offload has been enabled. Fill the
- * checksum field in the TCP header with the pseudo-header
- * checksum value.
- */
- sum = *up + cksum + IP_TCP_CSUM_COMP;
- sum = (sum & 0xFFFF) + (sum >> 16);
- *up = (sum & 0xFFFF) + (sum >> 16);
- mp->b_datap->db_cksumstart = IP_SIMPLE_HDR_LENGTH;
- mp->b_datap->db_cksumstuff = IP_SIMPLE_HDR_LENGTH + 16;
- mp->b_datap->db_cksumend = ntohs(ipha->ipha_length);
- mp->b_datap->db_struioun.cksum.flags |= HCK_PARTIALCKSUM;
- } else {
- /* software checksumming */
+ IP_CKSUM_XMIT_FAST(ire->ire_ipversion, hcksum_txflags, mp, ipha, up,
+ IPPROTO_TCP, IP_SIMPLE_HDR_LENGTH, ntohs(ipha->ipha_length), cksum);
+
+ /* Software checksum? */
+ if (DB_CKSUMFLAGS(mp) == 0) {
TCP_STAT(tcp_out_sw_cksum);
- *up = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH,
- cksum + IP_TCP_CSUM_COMP);
- mp->b_datap->db_struioun.cksum.flags = 0;
+ TCP_STAT_UPDATE(tcp_out_sw_cksum_bytes,
+ ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH);
}
ipha->ipha_fragment_offset_and_flags |=
(uint32_t)htons(ire->ire_frag_flag);
- /*
- * Hardware supports IP header checksum offload; clear contents
- * of IP header checksum field. Otherwise we calculate it.
- */
- if (hcksum_txflags & HCKSUM_IPHDRCKSUM) {
- ipha->ipha_hdr_checksum = 0;
- mp->b_datap->db_struioun.cksum.flags |= HCK_IPV4_HDRCKSUM;
- } else {
+ /* Calculate IP header checksum if hardware isn't capable */
+ if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) {
IP_HDR_CKSUM(ipha, cksum, ((uint32_t *)ipha)[0],
((uint16_t *)ipha)[4]);
}
@@ -18769,13 +17904,13 @@ tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp)
ire->ire_last_used_time = lbolt;
BUMP_MIB(&ip_mib, ipOutRequests);
- if (ill->ill_capabilities & ILL_CAPAB_POLL) {
- ill_poll = ill->ill_poll_capab;
- ASSERT(ill_poll != NULL);
- ASSERT(ill_poll->ill_tx != NULL);
- ASSERT(ill_poll->ill_tx_handle != NULL);
-
- ill_poll->ill_tx(ill_poll->ill_tx_handle, mp);
+ if (ILL_POLL_CAPABLE(ill)) {
+ /*
+ * Send the packet directly to DLD, where it may be queued
+ * depending on the availability of transmit resources at
+ * the media layer.
+ */
+ IP_POLL_ILL_TX(ill, mp);
} else {
putnext(ire->ire_stq, mp);
}
@@ -18876,7 +18011,7 @@ tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent)
DISP_ADDR_AND_PORT));
#else
if (tcp->tcp_debug) {
- (void) strlog(TCP_MODULE_ID, 0, 1,
+ (void) strlog(TCP_MOD_ID, 0, 1,
SL_TRACE|SL_ERROR,
"tcp_wput_data: data after ordrel, %s\n",
tcp_display(tcp, NULL,
@@ -18888,6 +18023,10 @@ tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent)
(mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) != 0)
tcp_zcopy_notify(tcp);
freemsg(mp);
+ if (tcp->tcp_flow_stopped &&
+ TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) {
+ tcp_clrqfull(tcp);
+ }
return;
}
@@ -19214,15 +18353,12 @@ done:;
TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
}
/* Note that len is the amount we just sent but with a negative sign */
- len += tcp->tcp_unsent;
- tcp->tcp_unsent = len;
+ tcp->tcp_unsent += len;
if (tcp->tcp_flow_stopped) {
- if (len <= tcp->tcp_xmit_lowater) {
- tcp->tcp_flow_stopped = B_FALSE;
+ if (TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) {
tcp_clrqfull(tcp);
}
- } else if (len >= tcp->tcp_xmit_hiwater) {
- tcp->tcp_flow_stopped = B_TRUE;
+ } else if (TCP_UNSENT_BYTES(tcp) >= tcp->tcp_xmit_hiwater) {
tcp_setqfull(tcp);
}
}
@@ -19361,6 +18497,12 @@ tcp_mdt_add_attrs(multidata_t *mmd, const mblk_t *dlmp, const boolean_t hwcksum,
}
/*
+ * Smaller and private version of pdescinfo_t used specifically for TCP,
+ * which allows for only two payload spans per packet.
+ */
+typedef struct tcp_pdescinfo_s PDESCINFO_STRUCT(2) tcp_pdescinfo_t;
+
+/*
* tcp_multisend() is called by tcp_wput_data() for Multidata Transmit
* scheme, and returns one the following:
*
@@ -19404,9 +18546,6 @@ tcp_multisend(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len,
#define IPVER(ip6h) ((((uint32_t *)ip6h)[0] >> 4) & 0x7)
#endif
-#define TCP_CSUM_OFFSET 16
-#define TCP_CSUM_SIZE 2
-
#define PREP_NEW_MULTIDATA() { \
mmd = NULL; \
md_mp = md_hbuf = NULL; \
@@ -19542,8 +18681,7 @@ tcp_multisend(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len,
ill = ire_to_ill(ire);
ASSERT(ill != NULL);
- ASSERT((ill->ill_capabilities & ILL_CAPAB_MDT) == 0 ||
- ill->ill_mdt_capab != NULL);
+ ASSERT(!ILL_MDT_CAPABLE(ill) || ill->ill_mdt_capab != NULL);
if (!tcp->tcp_ire_ill_check_done) {
tcp_ire_ill_check(tcp, ire, ill, B_TRUE);
@@ -19576,16 +18714,16 @@ tcp_multisend(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len,
/* does the interface support hardware checksum offload? */
hwcksum_flags = 0;
- if ((ill->ill_capabilities & ILL_CAPAB_HCKSUM) &&
+ if (ILL_HCKSUM_CAPABLE(ill) &&
(ill->ill_hcksum_capab->ill_hcksum_txflags &
- (HCKSUM_INET_FULL_V4 | HCKSUM_INET_PARTIAL | HCKSUM_IPHDRCKSUM)) &&
- dohwcksum) {
+ (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6 | HCKSUM_INET_PARTIAL |
+ HCKSUM_IPHDRCKSUM)) && dohwcksum) {
if (ill->ill_hcksum_capab->ill_hcksum_txflags &
HCKSUM_IPHDRCKSUM)
hwcksum_flags = HCK_IPV4_HDRCKSUM;
if (ill->ill_hcksum_capab->ill_hcksum_txflags &
- HCKSUM_INET_FULL_V4)
+ (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6))
hwcksum_flags |= HCK_FULLCKSUM;
else if (ill->ill_hcksum_capab->ill_hcksum_txflags &
HCKSUM_INET_PARTIAL)
@@ -19726,10 +18864,16 @@ tcp_multisend(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len,
* checksum offload; these are currently for IPv4.
* For full checksum offload, they are set to zero.
*/
- if (af == AF_INET &&
- (hwcksum_flags & HCK_PARTIALCKSUM)) {
- start = IP_SIMPLE_HDR_LENGTH;
- stuff = IP_SIMPLE_HDR_LENGTH + TCP_CSUM_OFFSET;
+ if ((hwcksum_flags & HCK_PARTIALCKSUM)) {
+ if (af == AF_INET) {
+ start = IP_SIMPLE_HDR_LENGTH;
+ stuff = IP_SIMPLE_HDR_LENGTH +
+ TCP_CHECKSUM_OFFSET;
+ } else {
+ start = IPV6_HDR_LEN;
+ stuff = IPV6_HDR_LEN +
+ TCP_CHECKSUM_OFFSET;
+ }
} else {
start = stuff = 0;
}
@@ -19748,8 +18892,8 @@ tcp_multisend(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len,
/* fastpath mblk */
(af == AF_INET) ? ire->ire_dlureq_mp :
ire->ire_nce->nce_res_mp,
- /* hardware checksum enabled (IPv4 only) */
- (af == AF_INET && hwcksum_flags != 0),
+ /* hardware checksum enabled */
+ (hwcksum_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)),
/* hardware checksum offsets */
start, stuff, 0,
/* hardware checksum flag */
@@ -20224,8 +19368,8 @@ legacy_send_no_md:
ASSERT(IPVER(ip6h) == IPV6_VERSION);
ASSERT(ip6h->ip6_nxt == IPPROTO_TCP);
ASSERT(PDESC_HDRL(pkt_info) >=
- (IPV6_HDR_LEN + TCP_CSUM_OFFSET +
- TCP_CSUM_SIZE));
+ (IPV6_HDR_LEN + TCP_CHECKSUM_OFFSET +
+ TCP_CHECKSUM_SIZE));
ASSERT(tcp->tcp_ipversion == IPV6_VERSION);
if (tcp->tcp_ip_forward_progress) {
@@ -20273,29 +19417,45 @@ legacy_send_no_md:
/* offset for TCP header checksum */
up = IPH_TCPH_CHECKSUMP(ipha,
IP_SIMPLE_HDR_LENGTH);
+ } else {
+ up = (uint16_t *)&ip6h->ip6_src;
- if (hwcksum_flags & HCK_FULLCKSUM) {
- /*
- * Hardware calculates pseudo-header,
- * header and payload checksums, so
- * zero out this field.
- */
- *up = 0;
- } else if (hwcksum_flags & HCK_PARTIALCKSUM) {
- uint32_t sum;
-
- /* pseudo-header checksumming */
- sum = *up + cksum + IP_TCP_CSUM_COMP;
- sum = (sum & 0xFFFF) + (sum >> 16);
- *up = (sum & 0xFFFF) + (sum >> 16);
- } else {
- /* software checksumming */
- TCP_STAT(tcp_out_sw_cksum);
- *up = IP_MD_CSUM(pkt,
- IP_SIMPLE_HDR_LENGTH,
- cksum + IP_TCP_CSUM_COMP);
- }
+ /* calculate pseudo-header checksum */
+ cksum = up[0] + up[1] + up[2] + up[3] +
+ up[4] + up[5] + up[6] + up[7] +
+ up[8] + up[9] + up[10] + up[11] +
+ up[12] + up[13] + up[14] + up[15];
+
+ /* Fold the initial sum */
+ cksum = (cksum & 0xffff) + (cksum >> 16);
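+				/*
+				 * This initial fold only bounds the
+				 * running sum; the consumers below
+				 * either clear the field (full offload)
+				 * or fold again after adding in the
+				 * remaining terms.
+				 */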
+
+ up = (uint16_t *)(((uchar_t *)ip6h) +
+ IPV6_HDR_LEN + TCP_CHECKSUM_OFFSET);
+ }
+ if (hwcksum_flags & HCK_FULLCKSUM) {
+ /* clear checksum field for hardware */
+ *up = 0;
+ } else if (hwcksum_flags & HCK_PARTIALCKSUM) {
+ uint32_t sum;
+
+ /* pseudo-header checksumming */
+ sum = *up + cksum + IP_TCP_CSUM_COMP;
+ sum = (sum & 0xFFFF) + (sum >> 16);
+ *up = (sum & 0xFFFF) + (sum >> 16);
+ } else {
+ /* software checksumming */
+ TCP_STAT(tcp_out_sw_cksum);
+ TCP_STAT_UPDATE(tcp_out_sw_cksum_bytes,
+ tcp->tcp_hdr_len + tcp->tcp_last_sent_len);
+ *up = IP_MD_CSUM(pkt, tcp->tcp_ip_hdr_len,
+ cksum + IP_TCP_CSUM_COMP);
+ if (*up == 0)
+ *up = 0xFFFF;
+ }
+
+ /* IPv4 header checksum */
+ if (af == AF_INET) {
ipha->ipha_fragment_offset_and_flags |=
(uint32_t)htons(ire->ire_frag_flag);
@@ -20306,19 +19466,6 @@ legacy_send_no_md:
((uint32_t *)ipha)[0],
((uint16_t *)ipha)[4]);
}
- } else {
- up = (uint16_t *)(((uchar_t *)ip6h) +
- IPV6_HDR_LEN + TCP_CSUM_OFFSET);
-
- /*
- * Software checksumming (hardware checksum
- * offload for IPv6 will hopefully be
- * implemented one day).
- */
- TCP_STAT(tcp_out_sw_cksum);
- *up = IP_MD_CSUM(pkt,
- IPV6_HDR_LEN - 2 * sizeof (in6_addr_t),
- htons(IPPROTO_TCP));
}
/* advance header offset */
@@ -20373,8 +19520,6 @@ legacy_send_no_md:
#undef PREP_NEW_MULTIDATA
#undef PREP_NEW_PBUF
#undef IPVER
-#undef TCP_CSUM_OFFSET
-#undef TCP_CSUM_SIZE
IRE_REFRELE(ire);
return (0);
@@ -20999,7 +20144,7 @@ tcp_ire_ill_check(tcp_t *tcp, ire_t *ire, ill_t *ill, boolean_t check_mdt)
*/
if (ip_multidata_outbound && check_mdt &&
!(ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) &&
- ill != NULL && (ill->ill_capabilities & ILL_CAPAB_MDT) &&
+ ill != NULL && ILL_MDT_CAPABLE(ill) &&
!CONN_IPSEC_OUT_ENCAPSULATED(connp) &&
!(ire->ire_flags & RTF_MULTIRT) &&
!IPP_ENABLED(IPP_LOCAL_OUT) &&
@@ -21112,7 +20257,6 @@ tcp_wput_flush(tcp_t *tcp, mblk_t *mp)
* tcp_xmit_lowater, so re-enable flow.
*/
if (tcp->tcp_flow_stopped) {
- tcp->tcp_flow_stopped = B_FALSE;
tcp_clrqfull(tcp);
}
}
@@ -21305,26 +20449,47 @@ tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2)
}
tcp_def_q_set(tcp, mp);
return;
- case SIOCPOPSOCKFS:
+ case _SIOCSOCKFALLBACK:
/*
- * sockfs is being I_POP'ed, reset the flag
- * indicating this
- */
- tcp->tcp_issocket = B_FALSE;
-
- /*
- * Insert this socket into the acceptor hash.
- * We might need it for T_CONN_RES message
+ * Either sockmod is about to be popped and the socket
+ * would then be treated as a plain stream, or a module
+ * is about to be pushed, in which case we can no longer
+ * use read-side synchronous streams for fused loopback
+ * tcp. Drain any queued data and disable the direct
+ * sockfs interface from now on.
*/
+ if (!tcp->tcp_issocket) {
+ DB_TYPE(mp) = M_IOCNAK;
+ iocp->ioc_error = EINVAL;
+ } else {
#ifdef _ILP32
- tcp->tcp_acceptor_id = (t_uscalar_t)RD(q);
+ tcp->tcp_acceptor_id = (t_uscalar_t)RD(q);
#else
- tcp->tcp_acceptor_id = tcp->tcp_connp->conn_dev;
+ tcp->tcp_acceptor_id = tcp->tcp_connp->conn_dev;
#endif
- tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp);
- mp->b_datap->db_type = M_IOCACK;
+ /*
+ * Insert this socket into the acceptor hash.
+ * We might need it for T_CONN_RES message
+ */
+ tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp);
+
+ if (tcp->tcp_fused) {
+ /*
+ * This is a fused loopback tcp; disable
+ * read-side synchronous streams interface
+ * and drain any queued data. It is okay
+ * to do this for non-synchronous streams
+ * fused tcp as well.
+ */
+ tcp_fuse_disable_pair(tcp, B_FALSE);
+ }
+ tcp->tcp_issocket = B_FALSE;
+ TCP_STAT(tcp_sock_fallback);
+
+ DB_TYPE(mp) = M_IOCACK;
+ iocp->ioc_error = 0;
+ }
iocp->ioc_count = 0;
- iocp->ioc_error = 0;
iocp->ioc_rval = 0;
qreply(q, mp);
return;
@@ -21364,7 +20529,9 @@ tcp_wput_proto(void *arg, mblk_t *mp, void *arg2)
if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) {
type = ((union T_primitives *)rptr)->type;
if (type == T_EXDATA_REQ) {
- len = msgdsize(mp->b_cont) - 1;
+ uint32_t msize = msgdsize(mp->b_cont);
+
+ len = msize - 1;
if (len < 0) {
freemsg(mp);
return;
@@ -21381,7 +20548,7 @@ tcp_wput_proto(void *arg, mblk_t *mp, void *arg2)
tcp->tcp_valid_bits |= TCP_URG_VALID;
/* Bypass tcp protocol for fused tcp loopback */
- if (tcp->tcp_fused && tcp_fuse_output(tcp, mp))
+ if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize))
return;
} else if (type != T_DATA_REQ) {
goto non_urgent_data;
@@ -21393,7 +20560,7 @@ tcp_wput_proto(void *arg, mblk_t *mp, void *arg2)
return;
} else {
if (tcp->tcp_debug) {
- (void) strlog(TCP_MODULE_ID, 0, 1, SL_ERROR|SL_TRACE,
+ (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
"tcp_wput_proto, dropping one...");
}
freemsg(mp);
@@ -21454,7 +20621,7 @@ non_urgent_data:
* the other side. Just ignore it.
*/
if (tcp->tcp_debug) {
- (void) strlog(TCP_MODULE_ID, 0, 1,
+ (void) strlog(TCP_MOD_ID, 0, 1,
SL_ERROR|SL_TRACE,
"tcp_wput_proto, T_ORDREL_REQ out of "
"state %s",
@@ -21468,7 +20635,7 @@ non_urgent_data:
break;
default:
if (tcp->tcp_debug) {
- (void) strlog(TCP_MODULE_ID, 0, 1, SL_ERROR|SL_TRACE,
+ (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
"tcp_wput_proto, bogus TPI msg, type %d",
tprim->type);
}
@@ -21530,7 +20697,7 @@ tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, uint32_t ack, int ctl)
/* If a text string is passed in with the request, pass it to strlog. */
if (str != NULL && tcp->tcp_debug) {
- (void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE,
+ (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
"tcp_xmit_ctl: '%s', seq 0x%x, ack 0x%x, ctl 0x%x",
str, seq, ack, ctl);
}
@@ -21737,7 +20904,7 @@ tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq,
}
if (str && q && tcp_dbg) {
- (void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE,
+ (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
"tcp_xmit_early_reset: '%s', seq 0x%x, ack 0x%x, "
"flags 0x%x",
str, seq, ack, ctl);
@@ -22478,7 +21645,7 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset,
}
/* This function handles the push timeout. */
-static void
+void
tcp_push_timer(void *arg)
{
conn_t *connp = (conn_t *)arg;
@@ -22488,10 +21655,18 @@ tcp_push_timer(void *arg)
ASSERT(tcp->tcp_listener == NULL);
+ /*
+ * We need to stop synchronous streams temporarily to prevent a race
+ * with tcp_fuse_rrw() or tcp_fuse_rinfop(). It is safe to access
+ * tcp_rcv_list here because those entry points will return right
+ * away when synchronous streams is stopped.
+ */
+ TCP_FUSE_SYNCSTR_STOP(tcp);
tcp->tcp_push_tid = 0;
if ((tcp->tcp_rcv_list != NULL) &&
(tcp_rcv_drain(tcp->tcp_rq, tcp) == TH_ACK_NEEDED))
tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
+ TCP_FUSE_SYNCSTR_RESUME(tcp);
}
/*
@@ -24059,15 +23234,14 @@ tcp_ddi_init(void)
tcp_iss_key_init((uint8_t *)&tcp_g_t_info_ack,
sizeof (tcp_g_t_info_ack));
-#if TCP_COUNTERS || TCP_DEBUG_COUNTER
- if ((tcp_kstat = kstat_create("tcp", 0, "tcpstat",
+ if ((tcp_kstat = kstat_create(TCP_MOD_NAME, 0, "tcpstat",
"net", KSTAT_TYPE_NAMED,
sizeof (tcp_statistics) / sizeof (kstat_named_t),
KSTAT_FLAG_VIRTUAL)) != NULL) {
tcp_kstat->ks_data = &tcp_statistics;
kstat_install(tcp_kstat);
}
-#endif
+
tcp_kstat_init();
}
@@ -24181,7 +23355,8 @@ cl_tcp_walk_list(int (*callback)(cl_tcp_info_t *, void *), void *arg)
connfp = &ipcl_globalhash_fanout[i];
connp = NULL;
- while ((connp = tcp_get_next_conn(connfp, connp))) {
+ while ((connp =
+ ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) {
tcp = connp->conn_tcp;
cl_tcpi.cl_tcpi_version = CL_TCPI_V1;
@@ -24373,7 +23548,7 @@ tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *acp)
*/
if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES)
logflags |= SL_CONSOLE;
- (void) strlog(TCP_MODULE_ID, 0, 1, logflags,
+ (void) strlog(TCP_MOD_ID, 0, 1, logflags,
"TCP_IOC_ABORT_CONN: local = %s:%d, remote = %s:%d, "
"start = %d, end = %d\n", lbuf, lport, rbuf, rport,
acp->ac_start, acp->ac_end);
@@ -24529,7 +23704,7 @@ tcp_ioctl_abort(tcp_ioc_abort_conn_t *acp)
*/
if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES)
logflags |= SL_CONSOLE;
- (void) strlog(TCP_MODULE_ID, 0, 1, logflags, "TCP_IOC_ABORT_CONN: "
+ (void) strlog(TCP_MOD_ID, 0, 1, logflags, "TCP_IOC_ABORT_CONN: "
"aborted %d connection%c\n", count, ((count > 1) ? 's' : ' '));
if (err == 0 && count == 0)
err = ENOENT;
@@ -24846,7 +24021,7 @@ process_ack:
}
done:
if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) {
- mp->b_datap->db_cksumstart = 0;
+ DB_CKSUMSTART(mp) = 0;
mp->b_datap->db_struioflag &= ~STRUIO_EAGER;
TCP_STAT(tcp_time_wait_syn_fail);
}
@@ -24965,7 +24140,7 @@ tcp_setsockopt_mp(int level, int cmd, char *opt, int optlen)
/*
* TCP Timers Implementation.
*/
-static timeout_id_t
+timeout_id_t
tcp_timeout(conn_t *connp, void (*f)(void *), clock_t tim)
{
mblk_t *mp;
@@ -25038,7 +24213,7 @@ tcp_timer_handler(void *arg, mblk_t *mp, void *arg2)
* it. But since both should execute on the same squeue, this race should not
* occur.
*/
-static clock_t
+clock_t
tcp_timeout_cancel(conn_t *connp, timeout_id_t id)
{
mblk_t *mp = (mblk_t *)id;
@@ -25165,30 +24340,48 @@ tcp_timer_free(tcp_t *tcp, mblk_t *mp)
* End of TCP Timers implementation.
*/
-static void
+/*
+ * tcp_{set,clr}qfull() functions are used to either set or clear QFULL
+ * on the specified backing STREAMS q. Note that the caller may base
+ * the decision to call on the tcp_t.tcp_flow_stopped value, which
+ * when checked outside the q's lock is only an advisory check.
+ */
+
+void
tcp_setqfull(tcp_t *tcp)
{
queue_t *q = tcp->tcp_wq;
if (!(q->q_flag & QFULL)) {
- TCP_STAT(tcp_flwctl_on);
mutex_enter(QLOCK(q));
- q->q_flag |= QFULL;
- mutex_exit(QLOCK(q));
+ if (!(q->q_flag & QFULL)) {
+ /* still need to set QFULL */
+ q->q_flag |= QFULL;
+ tcp->tcp_flow_stopped = B_TRUE;
+ mutex_exit(QLOCK(q));
+ TCP_STAT(tcp_flwctl_on);
+ } else {
+ mutex_exit(QLOCK(q));
+ }
}
}
-static void
+void
tcp_clrqfull(tcp_t *tcp)
{
queue_t *q = tcp->tcp_wq;
if (q->q_flag & QFULL) {
mutex_enter(QLOCK(q));
- q->q_flag &= ~QFULL;
- mutex_exit(QLOCK(q));
- if (q->q_flag & QWANTW)
- qbackenable(q, 0);
+ if (q->q_flag & QFULL) {
+ q->q_flag &= ~QFULL;
+ tcp->tcp_flow_stopped = B_FALSE;
+ mutex_exit(QLOCK(q));
+ if (q->q_flag & QWANTW)
+ qbackenable(q, 0);
+ } else {
+ mutex_exit(QLOCK(q));
+ }
}
}
@@ -25254,8 +24447,8 @@ tcp_kstat_init(void)
{ "connTableSize6", KSTAT_DATA_INT32, 0 }
};
- tcp_mibkp = kstat_create("tcp", 0, "tcp", "mib2", KSTAT_TYPE_NAMED,
- NUM_OF_FIELDS(tcp_named_kstat_t), 0);
+ tcp_mibkp = kstat_create(TCP_MOD_NAME, 0, TCP_MOD_NAME,
+ "mib2", KSTAT_TYPE_NAMED, NUM_OF_FIELDS(tcp_named_kstat_t), 0);
if (tcp_mibkp == NULL)
return;
@@ -25304,7 +24497,8 @@ tcp_kstat_update(kstat_t *kp, int rw)
for (i = 0; i < CONN_G_HASH_SIZE; i++) {
connfp = &ipcl_globalhash_fanout[i];
connp = NULL;
- while ((connp = tcp_get_next_conn(connfp, connp))) {
+ while ((connp =
+ ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) {
tcp = connp->conn_tcp;
switch (tcp_snmp_state(tcp)) {
case MIB2_TCP_established:
@@ -25401,7 +24595,7 @@ tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp)
tcph = (tcph_t *)&mp->b_rptr[hdr_len];
if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) {
mp->b_datap->db_struioflag |= STRUIO_EAGER;
- mp->b_datap->db_cksumstart = (intptr_t)sqp;
+ DB_CKSUMSTART(mp) = (intptr_t)sqp;
}
squeue_fill(connp->conn_sqp, mp, connp->conn_recv, connp,
diff --git a/usr/src/uts/common/inet/tcp/tcp6ddi.c b/usr/src/uts/common/inet/tcp/tcp6ddi.c
index c055414f0a..3ccef00029 100644
--- a/usr/src/uts/common/inet/tcp/tcp6ddi.c
+++ b/usr/src/uts/common/inet/tcp/tcp6ddi.c
@@ -20,7 +20,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -37,7 +37,13 @@
#define INET_DEVDESC "TCP6 STREAMS driver %I%"
#define INET_MODDESC "TCP6 STREAMS module %I%"
#define INET_DEVMINOR TCP_MINOR6
-#define INET_DEVMTFLAGS D_MP
+/*
+ * Note that unlike UDP, TCP uses synchronous STREAMS only
+ * for TCP Fusion (loopback); this is why we don't define
+ * D_SYNCSTR here. Since TCP as a module is used only for
+ * SNMP purposes, we define _D_DIRECT only for the device instance.
+ */
+#define INET_DEVMTFLAGS (D_MP|_D_DIRECT)
#define INET_MODMTFLAGS D_MP
#include "../inetddi.c"
diff --git a/usr/src/uts/common/inet/tcp/tcp_fusion.c b/usr/src/uts/common/inet/tcp/tcp_fusion.c
new file mode 100644
index 0000000000..31d54d6f95
--- /dev/null
+++ b/usr/src/uts/common/inet/tcp/tcp_fusion.c
@@ -0,0 +1,1087 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+#include <sys/stream.h>
+#include <sys/strsun.h>
+#include <sys/strsubr.h>
+#include <sys/debug.h>
+#include <sys/cmn_err.h>
+#include <sys/tihdr.h>
+
+#include <inet/common.h>
+#include <inet/ip.h>
+#include <inet/ip_impl.h>
+#include <inet/tcp.h>
+#include <inet/tcp_impl.h>
+#include <inet/ipsec_impl.h>
+#include <inet/ipclassifier.h>
+#include <inet/ipp_common.h>
+
+/*
+ * This file implements TCP fusion - a protocol-less data path for TCP
+ * loopback connections. The fusion of two local TCP endpoints occurs
+ * at connection establishment time. Various conditions (see details
+ * in tcp_fuse()) need to be met for fusion to be successful. If it
+ * fails, we fall back to the regular TCP data path; if it succeeds,
+ * both endpoints proceed to use tcp_fuse_output() as the transmit path.
+ * tcp_fuse_output() enqueues application data directly onto the peer's
+ * receive queue; no protocol processing is involved. After enqueueing
+ * the data, the sender either pushes (putnext) the data up the
+ * receiver's read queue, or simply returns and lets the receiver
+ * retrieve the enqueued data via the synchronous streams entry point
+ * tcp_fuse_rrw(). The latter path is taken if synchronous streams is
+ * enabled (the default). It is disabled if sockfs no longer resides
+ * directly on top of the tcp module due to a module insertion or removal.
+ * It also needs to be temporarily disabled when sending urgent data
+ * because the tcp_fuse_rrw() path bypasses the M_PROTO processing done
+ * by the strsock_proto() hook.
+ *
+ * Synchronization is handled by the squeue and the mutex tcp_fuse_lock.
+ * One of the requirements for fusion to succeed is that both endpoints
+ * need to be using the same squeue. This ensures that neither side
+ * can disappear while the other side is still sending data. By itself,
+ * the squeue is not sufficient for guaranteeing safety when synchronous
+ * streams is enabled. The reason is that tcp_fuse_rrw() doesn't enter
+ * the squeue, and its access to tcp_rcv_list and other fusion-related
+ * fields needs to be synchronized with the sender. tcp_fuse_lock is
+ * used for this purpose. When there is urgent data, the sender needs
+ * to push the data up the receiver's streams read queue. In order to
+ * avoid holding the tcp_fuse_lock across putnext(), the sender sets
+ * the peer tcp's tcp_fuse_syncstr_stopped bit and releases tcp_fuse_lock
+ * (see macro TCP_FUSE_SYNCSTR_STOP()). If tcp_fuse_rrw() enters after
+ * this point, it will see that synchronous streams is temporarily
+ * stopped and it will immediately return EBUSY without accessing the
+ * tcp_rcv_list or other fields protected by the tcp_fuse_lock. This
+ * will result in strget() calling getq_noenab() to dequeue data from
+ * the stream head instead. After the sender has finished pushing up
+ * all urgent data, it will clear the tcp_fuse_syncstr_stopped bit using
+ * TCP_FUSE_SYNCSTR_RESUME and the receiver may then resume using
+ * tcp_fuse_rrw() to retrieve data from tcp_rcv_list.
+ *
+ * The following note applies only to the synchronous streams mode.
+ *
+ * Flow control is done by checking the size of the receive buffer and
+ * the number of data blocks, each against its own limit. This is
+ * different from regular streams flow control, where the cumulative
+ * size check dominates the block count check -- a streams queue high
+ * water mark typically represents bytes. Each enqueue triggers
+ * notifications to the receiving process; a build-up of data blocks
+ * indicates a slow receiver and the sender should be blocked or
+ * informed at the earliest moment instead of further wasting system
+ * resources. In effect, this is equivalent to limiting the number
+ * of outstanding segments in flight.
+ */
+
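+/*
+ * For illustration (numbers hypothetical): with a receive high water
+ * mark of 48K and an unread data block limit of 8, a sender issuing
+ * sixteen 1K writes would be flow-controlled after the eighth write;
+ * the block count limit trips long before the 48K byte limit and thus
+ * flags a slow reader early.
+ */
+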
+/*
+ * Macros that determine whether or not IP processing is needed for TCP.
+ */
+#define TCP_IPOPT_POLICY_V4(tcp) \
+ ((tcp)->tcp_ipversion == IPV4_VERSION && \
+ ((tcp)->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH || \
+ CONN_OUTBOUND_POLICY_PRESENT((tcp)->tcp_connp) || \
+ CONN_INBOUND_POLICY_PRESENT((tcp)->tcp_connp)))
+
+#define TCP_IPOPT_POLICY_V6(tcp) \
+ ((tcp)->tcp_ipversion == IPV6_VERSION && \
+ ((tcp)->tcp_ip_hdr_len != IPV6_HDR_LEN || \
+ CONN_OUTBOUND_POLICY_PRESENT_V6((tcp)->tcp_connp) || \
+ CONN_INBOUND_POLICY_PRESENT_V6((tcp)->tcp_connp)))
+
+#define TCP_LOOPBACK_IP(tcp) \
+ (TCP_IPOPT_POLICY_V4(tcp) || TCP_IPOPT_POLICY_V6(tcp) || \
+ !CONN_IS_MD_FASTPATH((tcp)->tcp_connp))
+
+/*
+ * Setting this to false means we disable fusion altogether and
+ * loopback connections would go through the protocol paths.
+ */
+boolean_t do_tcp_fusion = B_TRUE;
+
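+/*
+ * For example, fusion could be disabled persistently by placing the
+ * following line in /etc/system (tcp is built as part of the ip
+ * module):
+ *
+ *	set ip:do_tcp_fusion = 0
+ */
+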
+/*
+ * Enabling this flag allows sockfs to retrieve data directly
+ * from a fused tcp endpoint using synchronous streams interface.
+ */
+boolean_t do_tcp_direct_sockfs = B_TRUE;
+
+/*
+ * This is the minimum number of outstanding writes allowed on
+ * a synchronous streams-enabled receiving endpoint before the
+ * sender gets flow-controlled. Setting this value to 0 means
+ * that the data block limit is equivalent to the byte count
+ * limit, which essentially disables the check.
+ */
+#define TCP_FUSION_RCV_UNREAD_MIN 8
+uint_t tcp_fusion_rcv_unread_min = TCP_FUSION_RCV_UNREAD_MIN;
+
+static void tcp_fuse_syncstr_enable(tcp_t *);
+static void tcp_fuse_syncstr_disable(tcp_t *);
+static void strrput_sig(queue_t *, boolean_t);
+
+/*
+ * This routine gets called by the eager tcp upon changing state from
+ * SYN_RCVD to ESTABLISHED. It fuses a direct path between itself
+ * and the active connect tcp such that regular tcp processing
+ * may be bypassed under allowable circumstances. Because the fusion
+ * requires both endpoints to be in the same squeue, it does not work
+ * for simultaneous active connects, as there is no easy way to
+ * switch from one squeue to another once the connection is created.
+ * This is different from the eager tcp case where we assign it the
+ * same squeue as the one given to the active connect tcp during open.
+ */
+void
+tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph)
+{
+ conn_t *peer_connp, *connp = tcp->tcp_connp;
+ tcp_t *peer_tcp;
+
+ ASSERT(!tcp->tcp_fused);
+ ASSERT(tcp->tcp_loopback);
+ ASSERT(tcp->tcp_loopback_peer == NULL);
+ /*
+ * We need to inherit q_hiwat of the listener tcp, but we can't
+ * really use tcp_listener since we get here after sending up
+ * T_CONN_IND and tcp_wput_accept() may be called independently,
+ * at which point tcp_listener is cleared; this is why we use
+ * tcp_saved_listener. The listener itself is guaranteed to be
+ * around until tcp_accept_finish() is called on this eager --
+ * this won't happen until we're done since we're inside the
+ * eager's perimeter now.
+ */
+ ASSERT(tcp->tcp_saved_listener != NULL);
+
+ /*
+ * Lookup peer endpoint; search for the remote endpoint having
+ * the reversed address-port quadruplet in ESTABLISHED state,
+ * which is guaranteed to be unique in the system. Zone check
+ * is applied accordingly for loopback address, but not for
+ * local address since we want fusion to happen across Zones.
+ */
+ if (tcp->tcp_ipversion == IPV4_VERSION) {
+ peer_connp = ipcl_conn_tcp_lookup_reversed_ipv4(connp,
+ (ipha_t *)iphdr, tcph);
+ } else {
+ peer_connp = ipcl_conn_tcp_lookup_reversed_ipv6(connp,
+ (ip6_t *)iphdr, tcph);
+ }
+
+ /*
+ * We can only proceed if the peer exists, resides on the same squeue
+ * as our conn and is not a raw socket. The squeue assignment of
+ * this eager tcp was done earlier at the time of SYN processing
+ * in ip_fanout_tcp{_v6}. Note that a shared squeue by itself
+ * doesn't guarantee a safe condition to fuse, hence we perform
+ * additional tests below.
+ */
+ ASSERT(peer_connp == NULL || peer_connp != connp);
+ if (peer_connp == NULL || peer_connp->conn_sqp != connp->conn_sqp ||
+ !IPCL_IS_TCP(peer_connp)) {
+ if (peer_connp != NULL) {
+ TCP_STAT(tcp_fusion_unqualified);
+ CONN_DEC_REF(peer_connp);
+ }
+ return;
+ }
+ peer_tcp = peer_connp->conn_tcp; /* active connect tcp */
+
+ ASSERT(peer_tcp != NULL && peer_tcp != tcp && !peer_tcp->tcp_fused);
+ ASSERT(peer_tcp->tcp_loopback && peer_tcp->tcp_loopback_peer == NULL);
+ ASSERT(peer_connp->conn_sqp == connp->conn_sqp);
+
+ /*
+ * Fuse the endpoints; we perform further checks against both
+ * tcp endpoints to ensure that a fusion is allowed to happen.
+ * In particular we bail out for non-simple TCP/IP or if IPsec/
+ * IPQoS policy exists.
+ */
+ if (!tcp->tcp_unfusable && !peer_tcp->tcp_unfusable &&
+ !TCP_LOOPBACK_IP(tcp) && !TCP_LOOPBACK_IP(peer_tcp) &&
+ !IPP_ENABLED(IPP_LOCAL_OUT|IPP_LOCAL_IN)) {
+ mblk_t *mp;
+ struct stroptions *stropt;
+ queue_t *peer_rq = peer_tcp->tcp_rq;
+
+ ASSERT(!TCP_IS_DETACHED(peer_tcp) && peer_rq != NULL);
+ ASSERT(tcp->tcp_fused_sigurg_mp == NULL);
+ ASSERT(peer_tcp->tcp_fused_sigurg_mp == NULL);
+
+ /*
+ * We need to drain data on both endpoints during unfuse.
+ * If we need to send up SIGURG at the time of draining,
+ * we want to be sure that an mblk is readily available.
+ * This is why we pre-allocate the M_PCSIG mblks for both
+ * endpoints which will only be used during/after unfuse.
+ */
+ if ((mp = allocb(1, BPRI_HI)) == NULL)
+ goto failed;
+
+ tcp->tcp_fused_sigurg_mp = mp;
+
+ if ((mp = allocb(1, BPRI_HI)) == NULL)
+ goto failed;
+
+ peer_tcp->tcp_fused_sigurg_mp = mp;
+
+ /* Allocate M_SETOPTS mblk */
+ if ((mp = allocb(sizeof (*stropt), BPRI_HI)) == NULL)
+ goto failed;
+
+ /* Fuse both endpoints */
+ peer_tcp->tcp_loopback_peer = tcp;
+ tcp->tcp_loopback_peer = peer_tcp;
+ peer_tcp->tcp_fused = tcp->tcp_fused = B_TRUE;
+
+ /*
+ * We never use regular tcp paths in fusion and should
+ * therefore clear tcp_unsent on both endpoints. Having
+ * them set to non-zero values means asking for trouble
+ * especially after unfuse, where we may end up sending
+ * through regular tcp paths which expect xmit_list and
+ * friends to be correctly setup.
+ */
+ peer_tcp->tcp_unsent = tcp->tcp_unsent = 0;
+
+ tcp_timers_stop(tcp);
+ tcp_timers_stop(peer_tcp);
+
+ /*
+ * At this point we are a detached eager tcp and therefore
+ * don't have a queue assigned to us until accept happens.
+ * In the meantime the peer endpoint may immediately send
+ * us data as soon as fusion is finished, and we need to be
+ * able to flow control it in case it sends down a huge amount
+ * of data while we're still detached. To prevent that we
+ * inherit the listener's q_hiwat value; this is temporary
+ * since we'll repeat the process in tcp_accept_finish().
+ */
+ (void) tcp_fuse_set_rcv_hiwat(tcp,
+ tcp->tcp_saved_listener->tcp_rq->q_hiwat);
+
+ /*
+ * Set the stream head's write offset value to zero since we
+ * won't be needing any room for TCP/IP headers; tell it to
+ * not break up the writes (this would reduce the amount of
+ * work done by kmem); and configure our receive buffer.
+ * Note that we can only do this for the active connect tcp
+ * since our eager is still detached; it will be dealt with
+ * later in tcp_accept_finish().
+ */
+ DB_TYPE(mp) = M_SETOPTS;
+ mp->b_wptr += sizeof (*stropt);
+
+ stropt = (struct stroptions *)mp->b_rptr;
+ stropt->so_flags = SO_MAXBLK | SO_WROFF | SO_HIWAT;
+ stropt->so_maxblk = tcp_maxpsz_set(peer_tcp, B_FALSE);
+ stropt->so_wroff = 0;
+
+ /*
+ * Record the stream head's high water mark for
+ * peer endpoint; this is used for flow-control
+ * purposes in tcp_fuse_output().
+ */
+ stropt->so_hiwat = tcp_fuse_set_rcv_hiwat(peer_tcp,
+ peer_rq->q_hiwat);
+
+ /* Send the options up */
+ putnext(peer_rq, mp);
+ } else {
+ TCP_STAT(tcp_fusion_unqualified);
+ }
+ CONN_DEC_REF(peer_connp);
+ return;
+
+failed:
+ if (tcp->tcp_fused_sigurg_mp != NULL) {
+ freeb(tcp->tcp_fused_sigurg_mp);
+ tcp->tcp_fused_sigurg_mp = NULL;
+ }
+ if (peer_tcp->tcp_fused_sigurg_mp != NULL) {
+ freeb(peer_tcp->tcp_fused_sigurg_mp);
+ peer_tcp->tcp_fused_sigurg_mp = NULL;
+ }
+ CONN_DEC_REF(peer_connp);
+}
+
+/*
+ * Unfuse a previously-fused pair of tcp loopback endpoints.
+ */
+void
+tcp_unfuse(tcp_t *tcp)
+{
+ tcp_t *peer_tcp = tcp->tcp_loopback_peer;
+
+ ASSERT(tcp->tcp_fused && peer_tcp != NULL);
+ ASSERT(peer_tcp->tcp_fused && peer_tcp->tcp_loopback_peer == tcp);
+ ASSERT(tcp->tcp_connp->conn_sqp == peer_tcp->tcp_connp->conn_sqp);
+ ASSERT(tcp->tcp_unsent == 0 && peer_tcp->tcp_unsent == 0);
+ ASSERT(tcp->tcp_fused_sigurg_mp != NULL);
+ ASSERT(peer_tcp->tcp_fused_sigurg_mp != NULL);
+
+ /*
+ * We disable synchronous streams, drain any queued data and
+ * clear tcp_direct_sockfs. The synchronous streams entry
+ * points will become no-ops after this point.
+ */
+ tcp_fuse_disable_pair(tcp, B_TRUE);
+
+ /*
+ * Update th_seq and th_ack in the header template
+ */
+ U32_TO_ABE32(tcp->tcp_snxt, tcp->tcp_tcph->th_seq);
+ U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack);
+ U32_TO_ABE32(peer_tcp->tcp_snxt, peer_tcp->tcp_tcph->th_seq);
+ U32_TO_ABE32(peer_tcp->tcp_rnxt, peer_tcp->tcp_tcph->th_ack);
+
+ /* Unfuse the endpoints */
+ peer_tcp->tcp_fused = tcp->tcp_fused = B_FALSE;
+ peer_tcp->tcp_loopback_peer = tcp->tcp_loopback_peer = NULL;
+}
+
+/*
+ * Fusion output routine for urgent data. This routine is called by
+ * tcp_fuse_output() for handling non-M_DATA mblks.
+ */
+void
+tcp_fuse_output_urg(tcp_t *tcp, mblk_t *mp)
+{
+ mblk_t *mp1;
+ struct T_exdata_ind *tei;
+ tcp_t *peer_tcp = tcp->tcp_loopback_peer;
+ mblk_t *head, *prev_head = NULL;
+
+ ASSERT(tcp->tcp_fused);
+ ASSERT(peer_tcp != NULL && peer_tcp->tcp_loopback_peer == tcp);
+ ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
+ ASSERT(mp->b_cont != NULL && DB_TYPE(mp->b_cont) == M_DATA);
+ ASSERT(MBLKL(mp) >= sizeof (*tei) && MBLKL(mp->b_cont) > 0);
+
+ /*
+ * Urgent data arrives in the form of T_EXDATA_REQ from above.
+ * Each occurrence denotes a new urgent pointer. For each new
+ * urgent pointer we signal (SIGURG) the receiving app to indicate
+ * that it needs to go into urgent mode. This is similar to the
+ * urgent data handling in the regular tcp. We don't need to keep
+ * track of where the urgent pointer is, because each T_EXDATA_REQ
+ * "advances" the urgent pointer for us.
+ *
+ * The actual urgent data carried by T_EXDATA_REQ is then prepended
+ * by a T_EXDATA_IND before being enqueued behind any existing data
+ * destined for the receiving app. There is only a single urgent
+ * pointer (out-of-band mark) for a given tcp. If the new urgent
+ * data arrives before the receiving app reads some existing urgent
+ * data, the previous marker is lost. This behavior is emulated
+ * accordingly below, by removing any existing T_EXDATA_IND messages
+ * and essentially converting old urgent data into non-urgent.
+ */
+ ASSERT(tcp->tcp_valid_bits & TCP_URG_VALID);
+ /* Let sender get out of urgent mode */
+ tcp->tcp_valid_bits &= ~TCP_URG_VALID;
+
+ /*
+ * This flag indicates that a signal needs to be sent up.
+ * This flag will only get cleared once SIGURG is delivered and
+ * is not affected by the tcp_fused flag -- delivery will still
+ * happen even after an endpoint is unfused, to handle the case
+ * where the sending endpoint immediately closes/unfuses after
+ * sending urgent data and the accept is not yet finished.
+ */
+ peer_tcp->tcp_fused_sigurg = B_TRUE;
+
+ /* Reuse T_EXDATA_REQ mblk for T_EXDATA_IND */
+ DB_TYPE(mp) = M_PROTO;
+ tei = (struct T_exdata_ind *)mp->b_rptr;
+ tei->PRIM_type = T_EXDATA_IND;
+ tei->MORE_flag = 0;
+ mp->b_wptr = (uchar_t *)&tei[1];
+
+ TCP_STAT(tcp_fusion_urg);
+ BUMP_MIB(&tcp_mib, tcpOutUrg);
+
+ head = peer_tcp->tcp_rcv_list;
+ while (head != NULL) {
+ /*
+ * Remove existing T_EXDATA_IND, keep the data which follows
+ * it and relink our list. Note that we don't modify the
+ * tcp_rcv_last_tail since it never points to T_EXDATA_IND.
+ */
+ if (DB_TYPE(head) != M_DATA) {
+ mp1 = head;
+
+ ASSERT(DB_TYPE(mp1->b_cont) == M_DATA);
+ head = mp1->b_cont;
+ mp1->b_cont = NULL;
+ head->b_next = mp1->b_next;
+ mp1->b_next = NULL;
+ if (prev_head != NULL)
+ prev_head->b_next = head;
+ if (peer_tcp->tcp_rcv_list == mp1)
+ peer_tcp->tcp_rcv_list = head;
+ if (peer_tcp->tcp_rcv_last_head == mp1)
+ peer_tcp->tcp_rcv_last_head = head;
+ freeb(mp1);
+ }
+ prev_head = head;
+ head = head->b_next;
+ }
+}
+
+/*
+ * Fusion output routine, called by tcp_output() and tcp_wput_proto().
+ */
+boolean_t
+tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size)
+{
+ tcp_t *peer_tcp = tcp->tcp_loopback_peer;
+ queue_t *peer_rq;
+ uint_t max_unread;
+ boolean_t flow_stopped;
+ boolean_t urgent = (DB_TYPE(mp) != M_DATA);
+
+ ASSERT(tcp->tcp_fused);
+ ASSERT(peer_tcp != NULL && peer_tcp->tcp_loopback_peer == tcp);
+ ASSERT(tcp->tcp_connp->conn_sqp == peer_tcp->tcp_connp->conn_sqp);
+ ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_PROTO ||
+ DB_TYPE(mp) == M_PCPROTO);
+
+ peer_rq = peer_tcp->tcp_rq;
+ max_unread = peer_tcp->tcp_fuse_rcv_unread_hiwater;
+
+ /* If this connection requires IP, unfuse and use regular path */
+ if (TCP_LOOPBACK_IP(tcp) || TCP_LOOPBACK_IP(peer_tcp) ||
+ IPP_ENABLED(IPP_LOCAL_OUT|IPP_LOCAL_IN)) {
+ TCP_STAT(tcp_fusion_aborted);
+ tcp_unfuse(tcp);
+ return (B_FALSE);
+ }
+
+ if (send_size == 0) {
+ freemsg(mp);
+ return (B_TRUE);
+ }
+
+ /*
+ * Handle urgent data; we either send up SIGURG to the peer now
+ * or do it later when we drain, in case the peer is detached
+ * or if we're short of memory for M_PCSIG mblk.
+ */
+ if (urgent) {
+ /*
+ * We stop synchronous streams when we have urgent data
+ * queued to prevent tcp_fuse_rrw() from pulling it. If
+ * for some reason the urgent data can't be delivered
+ * below, synchronous streams will remain stopped until
+ * someone drains the tcp_rcv_list.
+ */
+ TCP_FUSE_SYNCSTR_STOP(peer_tcp);
+ tcp_fuse_output_urg(tcp, mp);
+ }
+
+ mutex_enter(&peer_tcp->tcp_fuse_lock);
+ /*
+ * Wake up and signal the peer; it is okay to do this before
+ * enqueueing because we are holding the lock. One of the
+ * advantages of synchronous streams is the ability for us to
+ * find out when the application performs a read on the socket,
+ * by way of the tcp_fuse_rrw() entry point being called. Every
+ * message that gets enqueued onto the receiver is treated as if
+ * it has arrived at the receiving endpoint, thus generating
+ * SIGPOLL/SIGIO for an asynchronous socket just as in the strrput()
+ * case. However, we only wake up the application when necessary,
+ * i.e. during the first enqueue. When tcp_fuse_rrw() is called
+ * it will send everything upstream.
+ */
+ if (peer_tcp->tcp_direct_sockfs && !urgent &&
+ !TCP_IS_DETACHED(peer_tcp)) {
+ if (peer_tcp->tcp_rcv_list == NULL)
+ STR_WAKEUP_SET(STREAM(peer_tcp->tcp_rq));
+ /* Update poll events and send SIGPOLL/SIGIO if necessary */
+ STR_SENDSIG(STREAM(peer_tcp->tcp_rq));
+ }
+
+ /*
+ * Enqueue data into the peer's receive list; we may or may not
+ * drain the contents depending on the conditions below.
+ */
+ tcp_rcv_enqueue(peer_tcp, mp, send_size);
+
+ /* In case it wrapped around and also to keep it constant */
+ peer_tcp->tcp_rwnd += send_size;
+
+ /*
+ * Exercise flow-control when needed; we will get back-enabled
+ * in either tcp_accept_finish(), tcp_unfuse(), or tcp_fuse_rrw().
+ * If tcp_direct_sockfs is on or if the peer endpoint is detached,
+ * we emulate streams flow control by checking the peer's queue
+ * size and high water mark; otherwise we simply use canputnext()
+ * to decide if we need to stop our flow.
+ *
+ * The outstanding unread data block check does not apply to a
+ * detached receiver; this is to avoid unnecessary blocking of the
+ * sender while the accept is currently in progress and is quite
+ * similar to the regular tcp.
+ */
+ if (TCP_IS_DETACHED(peer_tcp) || max_unread == 0)
+ max_unread = UINT_MAX;
+
+ flow_stopped = tcp->tcp_flow_stopped;
+ if (!flow_stopped &&
+ (((peer_tcp->tcp_direct_sockfs || TCP_IS_DETACHED(peer_tcp)) &&
+ (peer_tcp->tcp_rcv_cnt >= peer_tcp->tcp_fuse_rcv_hiwater ||
+ ++peer_tcp->tcp_fuse_rcv_unread_cnt >= max_unread)) ||
+ (!peer_tcp->tcp_direct_sockfs &&
+ !TCP_IS_DETACHED(peer_tcp) && !canputnext(peer_tcp->tcp_rq)))) {
+ tcp_setqfull(tcp);
+ flow_stopped = B_TRUE;
+ TCP_STAT(tcp_fusion_flowctl);
+ DTRACE_PROBE4(tcp__fuse__output__flowctl, tcp_t *, tcp,
+ uint_t, send_size, uint_t, peer_tcp->tcp_rcv_cnt,
+ uint_t, peer_tcp->tcp_fuse_rcv_unread_cnt);
+ } else if (flow_stopped &&
+ TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) {
+ tcp_clrqfull(tcp);
+ }
+
+ loopback_packets++;
+ tcp->tcp_last_sent_len = send_size;
+
+ /* Need to adjust the following SNMP MIB-related variables */
+ tcp->tcp_snxt += send_size;
+ tcp->tcp_suna = tcp->tcp_snxt;
+ peer_tcp->tcp_rnxt += send_size;
+ peer_tcp->tcp_rack = peer_tcp->tcp_rnxt;
+
+ BUMP_MIB(&tcp_mib, tcpOutDataSegs);
+ UPDATE_MIB(&tcp_mib, tcpOutDataBytes, send_size);
+
+ BUMP_MIB(&tcp_mib, tcpInSegs);
+ BUMP_MIB(&tcp_mib, tcpInDataInorderSegs);
+ UPDATE_MIB(&tcp_mib, tcpInDataInorderBytes, send_size);
+
+ BUMP_LOCAL(tcp->tcp_obsegs);
+ BUMP_LOCAL(peer_tcp->tcp_ibsegs);
+
+ mutex_exit(&peer_tcp->tcp_fuse_lock);
+
+ DTRACE_PROBE2(tcp__fuse__output, tcp_t *, tcp, uint_t, send_size);
+
+ if (!TCP_IS_DETACHED(peer_tcp)) {
+ /*
+ * Drain the peer's receive queue if it has urgent data or if
+ * we're not flow-controlled. There is no need for draining
+ * normal data when tcp_direct_sockfs is on because the peer
+ * will pull the data via tcp_fuse_rrw().
+ */
+ if (urgent || (!flow_stopped && !peer_tcp->tcp_direct_sockfs)) {
+ ASSERT(peer_tcp->tcp_rcv_list != NULL);
+ (void) tcp_fuse_rcv_drain(peer_rq, peer_tcp, NULL);
+ /*
+ * If synchronous streams was stopped above due
+ * to the presence of urgent data, re-enable it.
+ */
+ if (urgent)
+ TCP_FUSE_SYNCSTR_RESUME(peer_tcp);
+ }
+ }
+ return (B_TRUE);
+}
+
+/*
+ * This routine gets called to deliver data upstream on a fused or
+ * previously fused tcp loopback endpoint; the latter happens only
+ * when there is a pending SIGURG signal plus urgent data that couldn't
+ * be sent upstream in the past.
+ */
+boolean_t
+tcp_fuse_rcv_drain(queue_t *q, tcp_t *tcp, mblk_t **sigurg_mpp)
+{
+ mblk_t *mp;
+#ifdef DEBUG
+ uint_t cnt = 0;
+#endif
+
+ ASSERT(tcp->tcp_loopback);
+ ASSERT(tcp->tcp_fused || tcp->tcp_fused_sigurg);
+ ASSERT(!tcp->tcp_fused || tcp->tcp_loopback_peer != NULL);
+ ASSERT(sigurg_mpp != NULL || tcp->tcp_fused);
+
+ /* No need for the push timer now, in case it was scheduled */
+ if (tcp->tcp_push_tid != 0) {
+ (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid);
+ tcp->tcp_push_tid = 0;
+ }
+ /*
+ * If there's urgent data sitting in receive list and we didn't
+ * get a chance to send up a SIGURG signal, make sure we send
+ * it first before draining in order to ensure that SIOCATMARK
+ * works properly.
+ */
+ if (tcp->tcp_fused_sigurg) {
+ /*
+ * sigurg_mpp is normally NULL, i.e. when we're still
+ * fused and didn't get here because of tcp_unfuse().
+ * In this case try hard to allocate the M_PCSIG mblk.
+ */
+ if (sigurg_mpp == NULL &&
+ (mp = allocb(1, BPRI_HI)) == NULL &&
+ (mp = allocb_tryhard(1)) == NULL) {
+ /* Alloc failed; try again next time */
+ tcp->tcp_push_tid = TCP_TIMER(tcp, tcp_push_timer,
+ MSEC_TO_TICK(tcp_push_timer_interval));
+ return (B_TRUE);
+ } else if (sigurg_mpp != NULL) {
+ /*
+ * Use the supplied M_PCSIG mblk; it means we're
+ * either unfused or in the process of unfusing,
+ * and the drain must happen now.
+ */
+ mp = *sigurg_mpp;
+ *sigurg_mpp = NULL;
+ }
+ ASSERT(mp != NULL);
+
+ tcp->tcp_fused_sigurg = B_FALSE;
+ /* Send up the signal */
+ DB_TYPE(mp) = M_PCSIG;
+ *mp->b_wptr++ = (uchar_t)SIGURG;
+ putnext(q, mp);
+ /*
+ * Let the regular tcp_rcv_drain() path handle
+ * draining the data if we're no longer fused.
+ */
+ if (!tcp->tcp_fused)
+ return (B_FALSE);
+ }
+
+ /*
+ * In the synchronous streams case, we generate SIGPOLL/SIGIO for
+ * each M_DATA that gets enqueued onto the receiver. At this point
+ * we are about to drain any queued data via putnext(). In order
+ * to avoid extraneous signal generation from strrput(), we set
+ * STRGETINPROG flag at the stream head prior to the draining and
+ * restore it afterwards. This masks out signal generation only
+ * for M_DATA messages and does not affect urgent data.
+ */
+ if (tcp->tcp_direct_sockfs)
+ strrput_sig(q, B_FALSE);
+
+ /* Drain the data */
+ while ((mp = tcp->tcp_rcv_list) != NULL) {
+ tcp->tcp_rcv_list = mp->b_next;
+ mp->b_next = NULL;
+#ifdef DEBUG
+ cnt += msgdsize(mp);
+#endif
+ putnext(q, mp);
+ TCP_STAT(tcp_fusion_putnext);
+ }
+
+ if (tcp->tcp_direct_sockfs)
+ strrput_sig(q, B_TRUE);
+
+ ASSERT(cnt == tcp->tcp_rcv_cnt);
+ tcp->tcp_rcv_last_head = NULL;
+ tcp->tcp_rcv_last_tail = NULL;
+ tcp->tcp_rcv_cnt = 0;
+ tcp->tcp_fuse_rcv_unread_cnt = 0;
+ tcp->tcp_rwnd = q->q_hiwat;
+
+ return (B_TRUE);
+}
+
+/*
+ * Synchronous stream entry point for sockfs to retrieve
+ * data directly from tcp_rcv_list.
+ */
+int
+tcp_fuse_rrw(queue_t *q, struiod_t *dp)
+{
+ tcp_t *tcp = Q_TO_CONN(q)->conn_tcp;
+ mblk_t *mp;
+
+ mutex_enter(&tcp->tcp_fuse_lock);
+ /*
+ * If someone had turned off tcp_direct_sockfs or if synchronous
+ * streams is temporarily disabled, we return EBUSY. This causes
+ * strget() to dequeue data from the stream head instead.
+ */
+ if (!tcp->tcp_direct_sockfs || tcp->tcp_fuse_syncstr_stopped) {
+ mutex_exit(&tcp->tcp_fuse_lock);
+ TCP_STAT(tcp_fusion_rrw_busy);
+ return (EBUSY);
+ }
+
+ if ((mp = tcp->tcp_rcv_list) != NULL) {
+ tcp_t *peer_tcp = tcp->tcp_loopback_peer;
+
+ DTRACE_PROBE3(tcp__fuse__rrw, tcp_t *, tcp,
+ uint32_t, tcp->tcp_rcv_cnt, ssize_t, dp->d_uio.uio_resid);
+
+ tcp->tcp_rcv_list = NULL;
+ TCP_STAT(tcp_fusion_rrw_msgcnt);
+
+ /*
+ * At this point nothing should be left in tcp_rcv_list.
+ * The only possible case where we would have a chain of
+ * b_next-linked messages is urgent data, but we wouldn't
+ * be here if that's true since urgent data is delivered
+ * via putnext() and synchronous streams is stopped until
+ * tcp_fuse_rcv_drain() is finished.
+ */
+ ASSERT(DB_TYPE(mp) == M_DATA && mp->b_next == NULL);
+
+ tcp->tcp_rcv_last_head = NULL;
+ tcp->tcp_rcv_last_tail = NULL;
+ tcp->tcp_rcv_cnt = 0;
+ tcp->tcp_fuse_rcv_unread_cnt = 0;
+
+ if (peer_tcp->tcp_flow_stopped) {
+ tcp_clrqfull(peer_tcp);
+ TCP_STAT(tcp_fusion_backenabled);
+ }
+ }
+
+ /*
+ * Either we just dequeued everything or we get here from sockfs
+ * and have nothing to return; in this case clear RSLEEP.
+ */
+ ASSERT(tcp->tcp_rcv_last_head == NULL);
+ ASSERT(tcp->tcp_rcv_last_tail == NULL);
+ ASSERT(tcp->tcp_rcv_cnt == 0);
+ ASSERT(tcp->tcp_fuse_rcv_unread_cnt == 0);
+ STR_WAKEUP_CLEAR(STREAM(q));
+
+ mutex_exit(&tcp->tcp_fuse_lock);
+ dp->d_mp = mp;
+ return (0);
+}
+
+/*
+ * Synchronous stream entry point used by certain ioctls to retrieve
+ * information about or peek into the tcp_rcv_list.
+ */
+int
+tcp_fuse_rinfop(queue_t *q, infod_t *dp)
+{
+ tcp_t *tcp = Q_TO_CONN(q)->conn_tcp;
+ mblk_t *mp;
+ uint_t cmd = dp->d_cmd;
+ int res = 0;
+ int error = 0;
+ struct stdata *stp = STREAM(q);
+
+ mutex_enter(&tcp->tcp_fuse_lock);
+ /* If shutdown on read has happened, return nothing */
+ mutex_enter(&stp->sd_lock);
+ if (stp->sd_flag & STREOF) {
+ mutex_exit(&stp->sd_lock);
+ goto done;
+ }
+ mutex_exit(&stp->sd_lock);
+
+ /*
+ * It is OK not to return an answer if tcp_rcv_list is
+ * currently not accessible.
+ */
+ if (!tcp->tcp_direct_sockfs || tcp->tcp_fuse_syncstr_stopped ||
+ (mp = tcp->tcp_rcv_list) == NULL)
+ goto done;
+
+ if (cmd & INFOD_COUNT) {
+ /*
+ * We have at least one message and
+ * could return only one at a time.
+ */
+ dp->d_count++;
+ res |= INFOD_COUNT;
+ }
+ if (cmd & INFOD_BYTES) {
+ /*
+ * Return size of all data messages.
+ */
+ dp->d_bytes += tcp->tcp_rcv_cnt;
+ res |= INFOD_BYTES;
+ }
+ if (cmd & INFOD_FIRSTBYTES) {
+ /*
+ * Return size of first data message.
+ */
+ dp->d_bytes = msgdsize(mp);
+ res |= INFOD_FIRSTBYTES;
+ dp->d_cmd &= ~INFOD_FIRSTBYTES;
+ }
+ if (cmd & INFOD_COPYOUT) {
+ mblk_t *mp1;
+ int n;
+
+ if (DB_TYPE(mp) == M_DATA) {
+ mp1 = mp;
+ } else {
+ mp1 = mp->b_cont;
+ ASSERT(mp1 != NULL);
+ }
+
+ /*
+ * Return data contents of first message.
+ */
+ ASSERT(DB_TYPE(mp1) == M_DATA);
+ while (mp1 != NULL && dp->d_uiop->uio_resid > 0) {
+ n = MIN(dp->d_uiop->uio_resid, MBLKL(mp1));
+ if (n != 0 && (error = uiomove((char *)mp1->b_rptr, n,
+ UIO_READ, dp->d_uiop)) != 0) {
+ goto done;
+ }
+ mp1 = mp1->b_cont;
+ }
+ res |= INFOD_COPYOUT;
+ dp->d_cmd &= ~INFOD_COPYOUT;
+ }
+done:
+ mutex_exit(&tcp->tcp_fuse_lock);
+
+ dp->d_res |= res;
+
+ return (error);
+}
+
+/*
+ * Enable synchronous streams on a fused tcp loopback endpoint.
+ */
+static void
+tcp_fuse_syncstr_enable(tcp_t *tcp)
+{
+ queue_t *rq = tcp->tcp_rq;
+ struct stdata *stp = STREAM(rq);
+
+ /* We can only enable synchronous streams for sockfs mode */
+ tcp->tcp_direct_sockfs = tcp->tcp_issocket && do_tcp_direct_sockfs;
+
+ if (!tcp->tcp_direct_sockfs)
+ return;
+
+ mutex_enter(&stp->sd_lock);
+ mutex_enter(QLOCK(rq));
+
+ /*
+ * We replace our q_qinfo with one that has the qi_rwp entry point.
+ * Clear SR_SIGALLDATA because we generate the equivalent signal(s)
+ * for each enqueued message in tcp_fuse_output().
+ */
+ rq->q_qinfo = &tcp_loopback_rinit;
+ rq->q_struiot = tcp_loopback_rinit.qi_struiot;
+ stp->sd_struiordq = rq;
+ stp->sd_rput_opt &= ~SR_SIGALLDATA;
+
+ mutex_exit(QLOCK(rq));
+ mutex_exit(&stp->sd_lock);
+}
+
+/*
+ * Disable synchronous streams on a fused tcp loopback endpoint.
+ */
+static void
+tcp_fuse_syncstr_disable(tcp_t *tcp)
+{
+ queue_t *rq = tcp->tcp_rq;
+ struct stdata *stp = STREAM(rq);
+
+ if (!tcp->tcp_direct_sockfs)
+ return;
+
+ mutex_enter(&stp->sd_lock);
+ mutex_enter(QLOCK(rq));
+
+ /*
+ * Reset q_qinfo to point to the default tcp entry points.
+ * Also restore SR_SIGALLDATA so that strrput() can generate
+ * the signals again for future M_DATA messages.
+ */
+ rq->q_qinfo = &tcp_rinit;
+ rq->q_struiot = tcp_rinit.qi_struiot;
+ stp->sd_struiordq = NULL;
+ stp->sd_rput_opt |= SR_SIGALLDATA;
+ tcp->tcp_direct_sockfs = B_FALSE;
+
+ mutex_exit(QLOCK(rq));
+ mutex_exit(&stp->sd_lock);
+}
+
+/*
+ * Enable synchronous streams on a pair of fused tcp endpoints.
+ */
+void
+tcp_fuse_syncstr_enable_pair(tcp_t *tcp)
+{
+ tcp_t *peer_tcp = tcp->tcp_loopback_peer;
+
+ ASSERT(tcp->tcp_fused);
+ ASSERT(peer_tcp != NULL);
+
+ tcp_fuse_syncstr_enable(tcp);
+ tcp_fuse_syncstr_enable(peer_tcp);
+}
+
+/*
+ * Allow or disallow signals to be generated by strrput().
+ */
+static void
+strrput_sig(queue_t *q, boolean_t on)
+{
+ struct stdata *stp = STREAM(q);
+
+ mutex_enter(&stp->sd_lock);
+ if (on)
+ stp->sd_flag &= ~STRGETINPROG;
+ else
+ stp->sd_flag |= STRGETINPROG;
+ mutex_exit(&stp->sd_lock);
+}
+
+/*
+ * Disable synchronous streams on a pair of fused tcp endpoints and drain
+ * any queued data; called either during unfuse or upon transitioning from
+ * a socket to a stream endpoint due to _SIOCSOCKFALLBACK.
+ */
+void
+tcp_fuse_disable_pair(tcp_t *tcp, boolean_t unfusing)
+{
+ tcp_t *peer_tcp = tcp->tcp_loopback_peer;
+
+ ASSERT(tcp->tcp_fused);
+ ASSERT(peer_tcp != NULL);
+
+ /*
+ * We need to prevent tcp_fuse_rrw() from entering before
+ * we can disable synchronous streams.
+ */
+ TCP_FUSE_SYNCSTR_STOP(tcp);
+ TCP_FUSE_SYNCSTR_STOP(peer_tcp);
+
+ /*
+ * Drain any pending data; the detached check is needed because
+ * we may be called as a result of a tcp_unfuse() triggered by
+ * tcp_fuse_output(). Note that in case of a detached tcp, the
+ * draining will happen later after the tcp is unfused. For non-
+ * urgent data, this can be handled by the regular tcp_rcv_drain().
+ * If we have urgent data sitting in the receive list, we will
+ * need to send up a SIGURG signal first before draining the data.
+ * All of these will be handled by the code in tcp_fuse_rcv_drain()
+ * when called from tcp_rcv_drain().
+ */
+ if (!TCP_IS_DETACHED(tcp)) {
+ (void) tcp_fuse_rcv_drain(tcp->tcp_rq, tcp,
+ (unfusing ? &tcp->tcp_fused_sigurg_mp : NULL));
+ }
+ if (!TCP_IS_DETACHED(peer_tcp)) {
+ (void) tcp_fuse_rcv_drain(peer_tcp->tcp_rq, peer_tcp,
+ (unfusing ? &peer_tcp->tcp_fused_sigurg_mp : NULL));
+ }
+
+ /* Lift up any flow-control conditions */
+ if (tcp->tcp_flow_stopped) {
+ tcp_clrqfull(tcp);
+ TCP_STAT(tcp_fusion_backenabled);
+ }
+ if (peer_tcp->tcp_flow_stopped) {
+ tcp_clrqfull(peer_tcp);
+ TCP_STAT(tcp_fusion_backenabled);
+ }
+
+ /* Disable synchronous streams */
+ tcp_fuse_syncstr_disable(tcp);
+ tcp_fuse_syncstr_disable(peer_tcp);
+}
+
+/*
+ * Calculate the size of the receive buffer for a fused tcp endpoint.
+ */
+size_t
+tcp_fuse_set_rcv_hiwat(tcp_t *tcp, size_t rwnd)
+{
+ ASSERT(tcp->tcp_fused);
+
+ /* Ensure that value is within the maximum upper bound */
+ if (rwnd > tcp_max_buf)
+ rwnd = tcp_max_buf;
+
+ /* Obey the absolute minimum tcp receive high water mark */
+ if (rwnd < tcp_sth_rcv_hiwat)
+ rwnd = tcp_sth_rcv_hiwat;
+
+ /*
+ * Round up to system page size in case SO_RCVBUF is modified
+ * after SO_SNDBUF; the latter is also similarly rounded up.
+ */
+ rwnd = P2ROUNDUP_TYPED(rwnd, PAGESIZE, size_t);
+ tcp->tcp_fuse_rcv_hiwater = rwnd;
+ return (rwnd);
+}
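+
+/*
+ * For example (tunable values hypothetical): with tcp_sth_rcv_hiwat
+ * at 24576 and tcp_max_buf at 1048576 on a 4K-page system, a
+ * requested rwnd of 5000 is raised to 24576 while a requested
+ * 2097152 is clamped to 1048576; both happen to be page aligned
+ * already, otherwise P2ROUNDUP() would round them up.
+ */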
+
+/*
+ * Calculate maxpsz and the peer's limit on outstanding unread data
+ * blocks for a fused tcp endpoint.
+ */
+int
+tcp_fuse_maxpsz_set(tcp_t *tcp)
+{
+ tcp_t *peer_tcp = tcp->tcp_loopback_peer;
+ uint_t sndbuf = tcp->tcp_xmit_hiwater;
+ uint_t maxpsz = sndbuf;
+
+ ASSERT(tcp->tcp_fused);
+ ASSERT(peer_tcp != NULL);
+ ASSERT(peer_tcp->tcp_fuse_rcv_hiwater != 0);
+ /*
+ * In the fused loopback case, we want the stream head to split
+ * up larger writes into smaller chunks for a more accurate flow-
+ * control accounting. Our maxpsz is half of the sender's send
+ * buffer or the receiver's receive buffer, whichever is smaller.
+ * We round the buffer up to the system page size due to the lack of
+ * a TCP MSS concept in Fusion.
+ */
+ if (maxpsz > peer_tcp->tcp_fuse_rcv_hiwater)
+ maxpsz = peer_tcp->tcp_fuse_rcv_hiwater;
+ maxpsz = P2ROUNDUP_TYPED(maxpsz, PAGESIZE, uint_t) >> 1;
+
+ /*
+ * Calculate the peer's limit for the number of outstanding unread
+ * data blocks. This is the number of data blocks that are allowed
+ * to reside in the receiver's queue before the sender gets flow
+ * controlled. It is used only in the synchronous streams mode as
+ * a way to throttle the sender when it performs consecutive writes
+ * faster than can be read. The value is derived from SO_SNDBUF in
+ * order to give the sender some control; we divide it by a large
+ * value (16KB) to produce a fairly low initial limit.
+ */
+ if (tcp_fusion_rcv_unread_min == 0) {
+ /* A value of 0 means that we disable the check */
+ peer_tcp->tcp_fuse_rcv_unread_hiwater = 0;
+ } else {
+ peer_tcp->tcp_fuse_rcv_unread_hiwater =
+ MAX(sndbuf >> 14, tcp_fusion_rcv_unread_min);
+ }
+ return (maxpsz);
+}
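+
+/*
+ * A worked example (values hypothetical): with a 48K send buffer and
+ * a 32K peer receive buffer on a 4K-page system, maxpsz works out to
+ * P2ROUNDUP(32768, 4096) / 2 = 16384 bytes per write chunk, and the
+ * peer's unread data block limit becomes MAX(49152 >> 14, 8) = 8.
+ */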
diff --git a/usr/src/uts/common/inet/tcp/tcpddi.c b/usr/src/uts/common/inet/tcp/tcpddi.c
index d6d21f16b5..391fc3e65d 100644
--- a/usr/src/uts/common/inet/tcp/tcpddi.c
+++ b/usr/src/uts/common/inet/tcp/tcpddi.c
@@ -20,7 +20,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -38,7 +38,13 @@
#define INET_DEVDESC "TCP STREAMS driver %I%"
#define INET_MODDESC "TCP STREAMS module %I%"
#define INET_DEVMINOR TCP_MINOR
-#define INET_DEVMTFLAGS D_MP
+/*
+ * Note that unlike UDP, TCP uses synchronous STREAMS only
+ * for TCP Fusion (loopback); this is why we don't define
+ * D_SYNCSTR here. Since TCP as a module is used only for
+ * SNMP purposes, we define _D_DIRECT for the device instance.
+ */
+#define INET_DEVMTFLAGS (D_MP|_D_DIRECT)
#define INET_MODMTFLAGS D_MP
#include "../inetddi.c"
diff --git a/usr/src/uts/common/inet/tcp_impl.h b/usr/src/uts/common/inet/tcp_impl.h
new file mode 100644
index 0000000000..93c08cb144
--- /dev/null
+++ b/usr/src/uts/common/inet/tcp_impl.h
@@ -0,0 +1,332 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _INET_TCP_IMPL_H
+#define _INET_TCP_IMPL_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * TCP implementation private declarations. These interfaces are
+ * used to build the IP module and are not meant to be accessed
+ * by any modules except IP itself. They are undocumented and are
+ * subject to change without notice.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+#include <inet/tcp.h>
+
+#define TCP_MOD_ID 5105
+
+/*
+ * Was this tcp created via socket() interface?
+ */
+#define TCP_IS_SOCKET(tcp) ((tcp)->tcp_issocket)
+
+/*
+ * Is this tcp not attached to any upper client?
+ */
+#define TCP_IS_DETACHED(tcp) ((tcp)->tcp_detached)
+
+#define TCP_TIMER(tcp, f, tim) \
+ tcp_timeout(tcp->tcp_connp, f, tim)
+#define TCP_TIMER_CANCEL(tcp, id) \
+ tcp_timeout_cancel(tcp->tcp_connp, id)
+
+/*
+ * To restart the TCP retransmission timer.
+ */
+#define TCP_TIMER_RESTART(tcp, intvl) { \
+ if ((tcp)->tcp_timer_tid != 0) \
+ (void) TCP_TIMER_CANCEL((tcp), (tcp)->tcp_timer_tid); \
+ (tcp)->tcp_timer_tid = TCP_TIMER((tcp), tcp_timer, \
+ MSEC_TO_TICK(intvl)); \
+}
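
For instance, the retransmit path rearms the timer after recomputing the
timeout; tcp_rto here is the tcp_t field holding the current retransmission
timeout in milliseconds:

    TCP_TIMER_RESTART(tcp, tcp->tcp_rto);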
+
+/*
+ * This stops synchronous streams for a fused tcp endpoint
+ * and prevents tcp_rrw() from pulling data from it.
+ */
+#define TCP_FUSE_SYNCSTR_STOP(tcp) { \
+ if ((tcp)->tcp_direct_sockfs) { \
+ mutex_enter(&(tcp)->tcp_fuse_lock); \
+ (tcp)->tcp_fuse_syncstr_stopped = B_TRUE; \
+ mutex_exit(&(tcp)->tcp_fuse_lock); \
+ } \
+}
+
+/*
+ * This resumes synchronous streams for this fused tcp endpoint
+ * and allows tcp_rrw() to pull data from it again.
+ */
+#define TCP_FUSE_SYNCSTR_RESUME(tcp) { \
+ if ((tcp)->tcp_direct_sockfs) { \
+ mutex_enter(&(tcp)->tcp_fuse_lock); \
+ (tcp)->tcp_fuse_syncstr_stopped = B_FALSE; \
+ mutex_exit(&(tcp)->tcp_fuse_lock); \
+ } \
+}
+
+/*
+ * Write-side flow-control is implemented via the per instance STREAMS
+ * write-side Q by explicitly setting QFULL to stop the flow of mblk_t(s)
+ * and clearing QFULL and calling qbackenable() to restart the flow based
+ * on the number of TCP unsent bytes (i.e. those not on the wire waiting
+ * for a remote ACK).
+ *
+ * This is different from a standard STREAMS module, where the framework
+ * would automatically flow-control based on the defined hiwat/lowat
+ * values as mblk_t's are enqueued/dequeued on the STREAMS Q.
+ *
+ * As of FireEngine, TCP write-side flow-control needs to take into account
+ * both the unsent tcp_xmit list bytes and any squeue_t enqueued bytes
+ * (i.e. from tcp_wput() -> tcp_output()).
+ *
+ * This is accomplished by adding a new tcp_t field, tcp_squeue_bytes, to
+ * count the number of bytes enqueued by tcp_wput() and the number of bytes
+ * dequeued and processed by tcp_output().
+ *
+ * So, the total number of bytes unsent is (squeue_bytes + unsent) with all
+ * flow-control uses of unsent replaced with the macro TCP_UNSENT_BYTES.
+ */
+extern void tcp_clrqfull(tcp_t *);
+extern void tcp_setqfull(tcp_t *);
+
+#define TCP_UNSENT_BYTES(tcp) \
+ ((tcp)->tcp_squeue_bytes + (tcp)->tcp_unsent)
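
A minimal sketch (not the actual tcp.c logic) of how the combined count is
meant to drive the QFULL transitions described above, assuming a
tcp_xmit_lowater low-water field alongside tcp_xmit_hiwater:

    static void
    tcp_flow_check(tcp_t *tcp)
    {
	    if (!tcp->tcp_flow_stopped &&
		TCP_UNSENT_BYTES(tcp) >= tcp->tcp_xmit_hiwater) {
		    tcp_setqfull(tcp);	/* set QFULL, stop the mblk flow */
	    } else if (tcp->tcp_flow_stopped &&
		TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) {
		    tcp_clrqfull(tcp);	/* clear QFULL and qbackenable() */
	    }
    }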
+
+/* Named Dispatch Parameter Management Structure */
+typedef struct tcpparam_s {
+ uint32_t tcp_param_min;
+ uint32_t tcp_param_max;
+ uint32_t tcp_param_val;
+ char *tcp_param_name;
+} tcpparam_t;
+
+extern tcpparam_t tcp_param_arr[];
+
+#define tcp_time_wait_interval tcp_param_arr[0].tcp_param_val
+#define tcp_conn_req_max_q tcp_param_arr[1].tcp_param_val
+#define tcp_conn_req_max_q0 tcp_param_arr[2].tcp_param_val
+#define tcp_conn_req_min tcp_param_arr[3].tcp_param_val
+#define tcp_conn_grace_period tcp_param_arr[4].tcp_param_val
+#define tcp_cwnd_max_ tcp_param_arr[5].tcp_param_val
+#define tcp_dbg tcp_param_arr[6].tcp_param_val
+#define tcp_smallest_nonpriv_port tcp_param_arr[7].tcp_param_val
+#define tcp_ip_abort_cinterval tcp_param_arr[8].tcp_param_val
+#define tcp_ip_abort_linterval tcp_param_arr[9].tcp_param_val
+#define tcp_ip_abort_interval tcp_param_arr[10].tcp_param_val
+#define tcp_ip_notify_cinterval tcp_param_arr[11].tcp_param_val
+#define tcp_ip_notify_interval tcp_param_arr[12].tcp_param_val
+#define tcp_ipv4_ttl tcp_param_arr[13].tcp_param_val
+#define tcp_keepalive_interval_high tcp_param_arr[14].tcp_param_max
+#define tcp_keepalive_interval tcp_param_arr[14].tcp_param_val
+#define tcp_keepalive_interval_low tcp_param_arr[14].tcp_param_min
+#define tcp_maxpsz_multiplier tcp_param_arr[15].tcp_param_val
+#define tcp_mss_def_ipv4 tcp_param_arr[16].tcp_param_val
+#define tcp_mss_max_ipv4 tcp_param_arr[17].tcp_param_val
+#define tcp_mss_min tcp_param_arr[18].tcp_param_val
+#define tcp_naglim_def tcp_param_arr[19].tcp_param_val
+#define tcp_rexmit_interval_initial tcp_param_arr[20].tcp_param_val
+#define tcp_rexmit_interval_max tcp_param_arr[21].tcp_param_val
+#define tcp_rexmit_interval_min tcp_param_arr[22].tcp_param_val
+#define tcp_deferred_ack_interval tcp_param_arr[23].tcp_param_val
+#define tcp_snd_lowat_fraction tcp_param_arr[24].tcp_param_val
+#define tcp_sth_rcv_hiwat tcp_param_arr[25].tcp_param_val
+#define tcp_sth_rcv_lowat tcp_param_arr[26].tcp_param_val
+#define tcp_dupack_fast_retransmit tcp_param_arr[27].tcp_param_val
+#define tcp_ignore_path_mtu tcp_param_arr[28].tcp_param_val
+#define tcp_smallest_anon_port tcp_param_arr[29].tcp_param_val
+#define tcp_largest_anon_port tcp_param_arr[30].tcp_param_val
+#define tcp_xmit_hiwat tcp_param_arr[31].tcp_param_val
+#define tcp_xmit_lowat tcp_param_arr[32].tcp_param_val
+#define tcp_recv_hiwat tcp_param_arr[33].tcp_param_val
+#define tcp_recv_hiwat_minmss tcp_param_arr[34].tcp_param_val
+#define tcp_fin_wait_2_flush_interval tcp_param_arr[35].tcp_param_val
+#define tcp_co_min tcp_param_arr[36].tcp_param_val
+#define tcp_max_buf tcp_param_arr[37].tcp_param_val
+#define tcp_strong_iss tcp_param_arr[38].tcp_param_val
+#define tcp_rtt_updates tcp_param_arr[39].tcp_param_val
+#define tcp_wscale_always tcp_param_arr[40].tcp_param_val
+#define tcp_tstamp_always tcp_param_arr[41].tcp_param_val
+#define tcp_tstamp_if_wscale tcp_param_arr[42].tcp_param_val
+#define tcp_rexmit_interval_extra tcp_param_arr[43].tcp_param_val
+#define tcp_deferred_acks_max tcp_param_arr[44].tcp_param_val
+#define tcp_slow_start_after_idle tcp_param_arr[45].tcp_param_val
+#define tcp_slow_start_initial tcp_param_arr[46].tcp_param_val
+#define tcp_co_timer_interval tcp_param_arr[47].tcp_param_val
+#define tcp_sack_permitted tcp_param_arr[48].tcp_param_val
+#define tcp_trace tcp_param_arr[49].tcp_param_val
+#define tcp_compression_enabled tcp_param_arr[50].tcp_param_val
+#define tcp_ipv6_hoplimit tcp_param_arr[51].tcp_param_val
+#define tcp_mss_def_ipv6 tcp_param_arr[52].tcp_param_val
+#define tcp_mss_max_ipv6 tcp_param_arr[53].tcp_param_val
+#define tcp_rev_src_routes tcp_param_arr[54].tcp_param_val
+#define tcp_local_dack_interval tcp_param_arr[55].tcp_param_val
+#define tcp_ndd_get_info_interval tcp_param_arr[56].tcp_param_val
+#define tcp_local_dacks_max tcp_param_arr[57].tcp_param_val
+#define tcp_ecn_permitted tcp_param_arr[58].tcp_param_val
+#define tcp_rst_sent_rate_enabled tcp_param_arr[59].tcp_param_val
+#define tcp_rst_sent_rate tcp_param_arr[60].tcp_param_val
+#define tcp_push_timer_interval tcp_param_arr[61].tcp_param_val
+#define tcp_use_smss_as_mss_opt tcp_param_arr[62].tcp_param_val
+#define tcp_keepalive_abort_interval_high tcp_param_arr[63].tcp_param_max
+#define tcp_keepalive_abort_interval tcp_param_arr[63].tcp_param_val
+#define tcp_keepalive_abort_interval_low tcp_param_arr[63].tcp_param_min
+
+/* Kstats */
+typedef struct tcp_stat {
+ kstat_named_t tcp_time_wait;
+ kstat_named_t tcp_time_wait_syn;
+ kstat_named_t tcp_time_wait_syn_success;
+ kstat_named_t tcp_time_wait_syn_fail;
+ kstat_named_t tcp_reinput_syn;
+ kstat_named_t tcp_ip_output;
+ kstat_named_t tcp_detach_non_time_wait;
+ kstat_named_t tcp_detach_time_wait;
+ kstat_named_t tcp_time_wait_reap;
+ kstat_named_t tcp_clean_death_nondetached;
+ kstat_named_t tcp_reinit_calls;
+ kstat_named_t tcp_eager_err1;
+ kstat_named_t tcp_eager_err2;
+ kstat_named_t tcp_eager_blowoff_calls;
+ kstat_named_t tcp_eager_blowoff_q;
+ kstat_named_t tcp_eager_blowoff_q0;
+ kstat_named_t tcp_not_hard_bound;
+ kstat_named_t tcp_no_listener;
+ kstat_named_t tcp_found_eager;
+ kstat_named_t tcp_wrong_queue;
+ kstat_named_t tcp_found_eager_binding1;
+ kstat_named_t tcp_found_eager_bound1;
+ kstat_named_t tcp_eager_has_listener1;
+ kstat_named_t tcp_open_alloc;
+ kstat_named_t tcp_open_detached_alloc;
+ kstat_named_t tcp_rput_time_wait;
+ kstat_named_t tcp_listendrop;
+ kstat_named_t tcp_listendropq0;
+ kstat_named_t tcp_wrong_rq;
+ kstat_named_t tcp_rsrv_calls;
+ kstat_named_t tcp_eagerfree2;
+ kstat_named_t tcp_eagerfree3;
+ kstat_named_t tcp_eagerfree4;
+ kstat_named_t tcp_eagerfree5;
+ kstat_named_t tcp_timewait_syn_fail;
+ kstat_named_t tcp_listen_badflags;
+ kstat_named_t tcp_timeout_calls;
+ kstat_named_t tcp_timeout_cached_alloc;
+ kstat_named_t tcp_timeout_cancel_reqs;
+ kstat_named_t tcp_timeout_canceled;
+ kstat_named_t tcp_timermp_alloced;
+ kstat_named_t tcp_timermp_freed;
+ kstat_named_t tcp_timermp_allocfail;
+ kstat_named_t tcp_timermp_allocdblfail;
+ kstat_named_t tcp_push_timer_cnt;
+ kstat_named_t tcp_ack_timer_cnt;
+ kstat_named_t tcp_ire_null1;
+ kstat_named_t tcp_ire_null;
+ kstat_named_t tcp_ip_send;
+ kstat_named_t tcp_ip_ire_send;
+ kstat_named_t tcp_wsrv_called;
+ kstat_named_t tcp_flwctl_on;
+ kstat_named_t tcp_timer_fire_early;
+ kstat_named_t tcp_timer_fire_miss;
+ kstat_named_t tcp_freelist_cleanup;
+ kstat_named_t tcp_rput_v6_error;
+ kstat_named_t tcp_out_sw_cksum;
+ kstat_named_t tcp_out_sw_cksum_bytes;
+ kstat_named_t tcp_zcopy_on;
+ kstat_named_t tcp_zcopy_off;
+ kstat_named_t tcp_zcopy_backoff;
+ kstat_named_t tcp_zcopy_disable;
+ kstat_named_t tcp_mdt_pkt_out;
+ kstat_named_t tcp_mdt_pkt_out_v4;
+ kstat_named_t tcp_mdt_pkt_out_v6;
+ kstat_named_t tcp_mdt_discarded;
+ kstat_named_t tcp_mdt_conn_halted1;
+ kstat_named_t tcp_mdt_conn_halted2;
+ kstat_named_t tcp_mdt_conn_halted3;
+ kstat_named_t tcp_mdt_conn_resumed1;
+ kstat_named_t tcp_mdt_conn_resumed2;
+ kstat_named_t tcp_mdt_legacy_small;
+ kstat_named_t tcp_mdt_legacy_all;
+ kstat_named_t tcp_mdt_legacy_ret;
+ kstat_named_t tcp_mdt_allocfail;
+ kstat_named_t tcp_mdt_addpdescfail;
+ kstat_named_t tcp_mdt_allocd;
+ kstat_named_t tcp_mdt_linked;
+ kstat_named_t tcp_fusion_flowctl;
+ kstat_named_t tcp_fusion_backenabled;
+ kstat_named_t tcp_fusion_urg;
+ kstat_named_t tcp_fusion_putnext;
+ kstat_named_t tcp_fusion_unfusable;
+ kstat_named_t tcp_fusion_aborted;
+ kstat_named_t tcp_fusion_unqualified;
+ kstat_named_t tcp_fusion_rrw_busy;
+ kstat_named_t tcp_fusion_rrw_msgcnt;
+ kstat_named_t tcp_in_ack_unsent_drop;
+ kstat_named_t tcp_sock_fallback;
+} tcp_stat_t;
+
+extern tcp_stat_t tcp_statistics;
+
+#define TCP_STAT(x) (tcp_statistics.x.value.ui64++)
+#define TCP_STAT_UPDATE(x, n) (tcp_statistics.x.value.ui64 += (n))
+#define TCP_STAT_SET(x, n) (tcp_statistics.x.value.ui64 = (n))
+
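The kstat pattern above boils down to named 64-bit counters bumped through a
macro. A tiny runnable userland model of the same idiom, with hypothetical
names (this is not the kernel kstat API):

    #include <stdio.h>
    #include <stdint.h>

    typedef struct { const char *name; uint64_t value; } stat_t;

    static struct {
	    stat_t fusion_backenabled;
	    stat_t out_sw_cksum_bytes;
    } stats = {
	    { "tcp_fusion_backenabled", 0 },
	    { "tcp_out_sw_cksum_bytes", 0 },
    };

    #define STAT(x)		(stats.x.value++)	/* cf. TCP_STAT() */
    #define STAT_UPDATE(x, n)	(stats.x.value += (n))	/* cf. TCP_STAT_UPDATE() */

    int
    main(void)
    {
	    STAT(fusion_backenabled);
	    STAT_UPDATE(out_sw_cksum_bytes, 512);
	    printf("%s = %llu\n", stats.out_sw_cksum_bytes.name,
		(unsigned long long)stats.out_sw_cksum_bytes.value);
	    return (0);
    }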
+extern struct qinit tcp_loopback_rinit, tcp_rinit;
+extern boolean_t do_tcp_fusion;
+
+extern int tcp_maxpsz_set(tcp_t *, boolean_t);
+extern void tcp_timers_stop(tcp_t *);
+extern void tcp_rcv_enqueue(tcp_t *, mblk_t *, uint_t);
+extern void tcp_push_timer(void *);
+extern timeout_id_t tcp_timeout(conn_t *, void (*)(void *), clock_t);
+extern clock_t tcp_timeout_cancel(conn_t *, timeout_id_t);
+
+extern void tcp_fuse(tcp_t *, uchar_t *, tcph_t *);
+extern void tcp_unfuse(tcp_t *);
+extern boolean_t tcp_fuse_output(tcp_t *, mblk_t *, uint32_t);
+extern void tcp_fuse_output_urg(tcp_t *, mblk_t *);
+extern boolean_t tcp_fuse_rcv_drain(queue_t *, tcp_t *, mblk_t **);
+extern void tcp_fuse_syncstr_enable_pair(tcp_t *);
+extern void tcp_fuse_disable_pair(tcp_t *, boolean_t);
+extern int tcp_fuse_rrw(queue_t *, struiod_t *);
+extern int tcp_fuse_rinfop(queue_t *, infod_t *);
+extern size_t tcp_fuse_set_rcv_hiwat(tcp_t *, size_t);
+extern int tcp_fuse_maxpsz_set(tcp_t *);
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _INET_TCP_IMPL_H */
diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c
index 5bed5bf992..d804018911 100644
--- a/usr/src/uts/common/inet/udp/udp.c
+++ b/usr/src/uts/common/inet/udp/udp.c
@@ -31,6 +31,8 @@ const char udp_version[] = "%Z%%M% %I% %E% SMI";
#include <sys/types.h>
#include <sys/stream.h>
+#include <sys/dlpi.h>
+#include <sys/pattr.h>
#include <sys/stropts.h>
#include <sys/strlog.h>
#include <sys/strsun.h>
@@ -50,6 +52,7 @@ const char udp_version[] = "%Z%%M% %I% %E% SMI";
#include <sys/zone.h>
#include <sys/socket.h>
+#include <sys/sockio.h>
#include <sys/vtrace.h>
#include <sys/debug.h>
#include <sys/isa_defs.h>
@@ -59,11 +62,15 @@ const char udp_version[] = "%Z%%M% %I% %E% SMI";
#include <netinet/icmp6.h>
#include <netinet/udp.h>
#include <net/if.h>
+#include <net/route.h>
#include <inet/common.h>
#include <inet/ip.h>
+#include <inet/ip_impl.h>
#include <inet/ip6.h>
#include <inet/ip_ire.h>
+#include <inet/ip_if.h>
+#include <inet/ip_multi.h>
#include <inet/mi.h>
#include <inet/mib2.h>
#include <inet/nd.h>
@@ -71,9 +78,12 @@ const char udp_version[] = "%Z%%M% %I% %E% SMI";
#include <inet/snmpcom.h>
#include <inet/kstatcom.h>
#include <inet/udp_impl.h>
+#include <inet/ipclassifier.h>
+#include <inet/ipsec_impl.h>
+#include <inet/ipp_common.h>
/*
- * The ipsec_info.h header file is here since it has the defination for the
+ * The ipsec_info.h header file is here since it has the definition for the
* M_CTL message types used by IP to convey information to the ULP. The
 * ipsec_info.h needs the pfkeyv2.h, hence the latter's presence.
*/
@@ -81,40 +91,138 @@ const char udp_version[] = "%Z%%M% %I% %E% SMI";
#include <inet/ipsec_info.h>
/*
- * Object to represent database of options to search passed to
- * {sock,tpi}optcom_req() interface routine to take care of option
- * management and associated methods.
- * XXX. These and other externs should really move to a udp header file.
- */
-extern optdb_obj_t udp_opt_obj;
-extern uint_t udp_max_optsize;
-
-
-/*
* Synchronization notes:
*
- * UDP uses a combination of the queue-pair STREAMS perimeter, a global
- * lock and a set of bind hash locks to protect its data structures.
+ * UDP uses a combination of its internal perimeter, a global lock and
+ * a set of bind hash locks to protect its data structures. Please see
+ * the note above udp_mode_assertions for details about the internal
+ * perimeter.
*
- * The queue-pair perimeter is not acquired exclusively in the put
- * procedures thus when udp_rput or udp_wput needs exclusive access to
- * the udp_t instance structure it will use qwriter(..., PERIM_INNER) to
- * asynchronously acquire exclusive access to the udp_t instance.
- *
- * When UDP global data needs to be modified the udp_g_lock mutex is acquired.
- * Currently, udp_g_head and udp_g_epriv_ports[] are protected by it.
- *
- * When an UDP endpoint is bound to a local port, it is inserted into
+ * When a UDP endpoint is bound to a local port, it is inserted into
* a bind hash list. The list consists of an array of udp_fanout_t buckets.
* The size of the array is controlled by the udp_bind_fanout_size variable.
* This variable can be changed in /etc/system if the default value is
- * not large enough. Each bind hash bucket is protected by a per bucket lock.
- * It protects the udp_bind_hash and udp_ptpbhn fields in the udp_t
+ * not large enough. Each bind hash bucket is protected by a per bucket
+ * lock. It protects the udp_bind_hash and udp_ptpbhn fields in the udp_t
* structure. An UDP endpoint is removed from the bind hash list only
* when it is being unbound or being closed. The per bucket lock also
- * protects an UDP endpoint's state changes.
+ * protects a UDP endpoint's state changes.
+ *
+ * Plumbing notes:
+ *
+ * Both udp and ip are merged, but the streams plumbing is kept unchanged
+ * in that udp is always pushed atop /dev/ip. This is done to preserve
+ * backwards compatibility for certain applications which rely on such
+ * plumbing geometry to do things such as issuing I_POP on the stream
+ * in order to obtain direct access to /dev/ip, etc.
+ *
+ * All UDP processing happens in the /dev/ip instance; the udp module
+ * instance does not possess any state about the endpoint, and merely
+ * acts as a dummy module whose presence keeps the streams plumbing
+ * appearance unchanged. At open time /dev/ip allocates a conn_t that
+ * happens to embed a udp_t. This stays dormant until the time udp is
+ * pushed, which indicates to /dev/ip that it must convert itself from
+ * an IP to a UDP endpoint.
+ *
+ * We only allow for the following plumbing cases:
+ *
+ * Normal:
+ * /dev/ip is first opened and later udp is pushed directly on top.
+ * This is the default action that happens when a udp socket or
+ * /dev/udp is opened. The conn_t created by /dev/ip instance is
+ * now shared and is marked with IPCL_UDP.
+ *
+ * SNMP-only:
+ * udp is pushed on top of a module other than /dev/ip. When this
+ * happens it will support only SNMP semantics. A new conn_t is
+ * allocated and marked with IPCL_UDPMOD.
+ *
+ * The above cases imply that we don't support any intermediate module
+ * residing between /dev/ip and udp -- in fact, we have never supported
+ * such a scenario in the past, as the inter-layer communication semantics
+ * have always been private. Also note that the normal case allows for SNMP
+ * requests to be processed in addition to the rest of UDP operations.
+ *
+ * The normal case plumbing is depicted by the following diagram:
+ *
+ * +---------------+---------------+
+ * | | | udp
+ * | udp_wq | udp_rq |
+ * | | UDP_RD |
+ * | | |
+ * +---------------+---------------+
+ * | ^
+ * v |
+ * +---------------+---------------+
+ * | | | /dev/ip
+ * | ip_wq | ip_rq | conn_t
+ * | UDP_WR | |
+ * | | |
+ * +---------------+---------------+
+ *
+ * Messages arriving at udp_wq from above will end up in ip_wq before
+ * they get processed, i.e. udp write entry points will advance udp_wq
+ * and use its q_next value as ip_wq in order to use the conn_t that
+ * is stored in its q_ptr. Likewise, messages generated by ip to the
+ * module above udp will appear as if they originated from udp_rq,
+ * i.e. putnext() calls to the module above udp are done using the
+ * udp_rq instead of ip_rq in order to avoid udp_rput(), which does
+ * nothing more than calling putnext().
+ *
+ * The above implies the following rule of thumb:
+ *
+ * 1. udp_t is obtained from conn_t, which is created by the /dev/ip
+ * instance and is stored in q_ptr of both ip_wq and ip_rq. There
+ * is no direct reference to conn_t from either udp_wq or udp_rq.
+ *
+ * 2. Write-side entry points of udp can obtain the conn_t via the
+ * Q_TO_CONN() macro, using the queue value obtained from UDP_WR().
+ *
+ * 3. While in /dev/ip context, putnext() to the module above udp can
+ * be done by supplying the queue value obtained from UDP_RD().
+ *
*/
+static queue_t *UDP_WR(queue_t *);
+static queue_t *UDP_RD(queue_t *);
+
+udp_stat_t udp_statistics = {
+ { "udp_ip_send", KSTAT_DATA_UINT64 },
+ { "udp_ip_ire_send", KSTAT_DATA_UINT64 },
+ { "udp_ire_null", KSTAT_DATA_UINT64 },
+ { "udp_drain", KSTAT_DATA_UINT64 },
+ { "udp_sock_fallback", KSTAT_DATA_UINT64 },
+ { "udp_rrw_busy", KSTAT_DATA_UINT64 },
+ { "udp_rrw_msgcnt", KSTAT_DATA_UINT64 },
+ { "udp_out_sw_cksum", KSTAT_DATA_UINT64 },
+ { "udp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 },
+ { "udp_out_opt", KSTAT_DATA_UINT64 },
+ { "udp_out_err_notconn", KSTAT_DATA_UINT64 },
+ { "udp_out_err_output", KSTAT_DATA_UINT64 },
+ { "udp_out_err_tudr", KSTAT_DATA_UINT64 },
+ { "udp_in_pktinfo", KSTAT_DATA_UINT64 },
+ { "udp_in_recvdstaddr", KSTAT_DATA_UINT64 },
+ { "udp_in_recvopts", KSTAT_DATA_UINT64 },
+ { "udp_in_recvif", KSTAT_DATA_UINT64 },
+ { "udp_in_recvslla", KSTAT_DATA_UINT64 },
+ { "udp_in_recvucred", KSTAT_DATA_UINT64 },
+ { "udp_in_recvttl", KSTAT_DATA_UINT64 },
+ { "udp_in_recvhopopts", KSTAT_DATA_UINT64 },
+ { "udp_in_recvhoplimit", KSTAT_DATA_UINT64 },
+ { "udp_in_recvdstopts", KSTAT_DATA_UINT64 },
+ { "udp_in_recvrtdstopts", KSTAT_DATA_UINT64 },
+ { "udp_in_recvrthdr", KSTAT_DATA_UINT64 },
+ { "udp_in_recvpktinfo", KSTAT_DATA_UINT64 },
+ { "udp_in_recvtclass", KSTAT_DATA_UINT64 },
+#ifdef DEBUG
+ { "udp_data_conn", KSTAT_DATA_UINT64 },
+ { "udp_data_notconn", KSTAT_DATA_UINT64 },
+#endif
+};
+
+static kstat_t *udp_ksp;
+struct kmem_cache *udp_cache;
+
/*
* Bind hash list size and hash function. It has to be a power of 2 for
* hashing.
@@ -151,14 +259,6 @@ static clock_t udp_last_ndd_get_info_time;
"later.\n"
#define NDD_OUT_OF_BUF_MSG "<< Out of buffer >>\n"
-/* Named Dispatch Parameter Management Structure */
-typedef struct udpparam_s {
- uint32_t udp_param_min;
- uint32_t udp_param_max;
- uint32_t udp_param_value;
- char *udp_param_name;
-} udpparam_t;
-
static void udp_addr_req(queue_t *q, mblk_t *mp);
static void udp_bind(queue_t *q, mblk_t *mp);
static void udp_bind_hash_insert(udp_fanout_t *uf, udp_t *udp);
@@ -188,15 +288,6 @@ static int udp_open(queue_t *q, dev_t *devp, int flag, int sflag,
static int udp_unitdata_opt_process(queue_t *q, mblk_t *mp,
int *errorp, void *thisdg_attrs);
static boolean_t udp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name);
-int udp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name,
- uchar_t *ptr);
-int udp_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name,
- uchar_t *ptr);
-int udp_opt_set(queue_t *q, uint_t optset_context,
- int level, int name,
- uint_t inlen, uchar_t *invalp,
- uint_t *outlenp, uchar_t *outvalp,
- void *thisdg_attrs, cred_t *cr, mblk_t *mblk);
static int udp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
static boolean_t udp_param_register(udpparam_t *udppa, int cnt);
static int udp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
@@ -205,62 +296,91 @@ static int udp_pkt_set(uchar_t *invalp, uint_t inlen, boolean_t sticky,
uchar_t **optbufp, uint_t *optlenp);
static void udp_report_item(mblk_t *mp, udp_t *udp);
static void udp_rput(queue_t *q, mblk_t *mp);
+static void udp_rput_other(queue_t *, mblk_t *);
+static int udp_rinfop(queue_t *q, infod_t *dp);
+static int udp_rrw(queue_t *q, struiod_t *dp);
static void udp_rput_bind_ack(queue_t *q, mblk_t *mp);
-static void udp_rput_other(queue_t *q, mblk_t *mp);
-static int udp_snmp_get(queue_t *q, mblk_t *mpctl);
-static int udp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
- uchar_t *ptr, int len);
static int udp_status_report(queue_t *q, mblk_t *mp, caddr_t cp,
cred_t *cr);
-static void udp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err);
+static void udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp, ipha_t *ipha);
+static void udp_ud_err(queue_t *q, mblk_t *mp, uchar_t *destaddr,
+ t_scalar_t destlen, t_scalar_t err);
static void udp_unbind(queue_t *q, mblk_t *mp);
static in_port_t udp_update_next_port(in_port_t port, boolean_t random);
static void udp_wput(queue_t *q, mblk_t *mp);
-static void udp_wput_ipv6(queue_t *q, mblk_t *mp, sin6_t *sin6,
- t_scalar_t tudr_optlen);
+static mblk_t *udp_output_v4(conn_t *, mblk_t *mp, ipaddr_t v4dst,
+ uint16_t port, uint_t srcid, int *error);
+static mblk_t *udp_output_v6(conn_t *connp, mblk_t *mp, sin6_t *sin6,
+ t_scalar_t tudr_optlen, int *error);
static void udp_wput_other(queue_t *q, mblk_t *mp);
static void udp_wput_iocdata(queue_t *q, mblk_t *mp);
+static void udp_output(conn_t *connp, mblk_t *mp, struct sockaddr *addr,
+ socklen_t addrlen);
+static size_t udp_set_rcv_hiwat(udp_t *udp, size_t size);
static void udp_kstat_init(void);
static void udp_kstat_fini(void);
static int udp_kstat_update(kstat_t *kp, int rw);
+static void udp_input_wrapper(void *arg, mblk_t *mp, void *arg2);
+static void udp_rput_other_wrapper(void *arg, mblk_t *mp, void *arg2);
+static void udp_wput_other_wrapper(void *arg, mblk_t *mp, void *arg2);
+static void udp_resume_bind_cb(void *arg, mblk_t *mp, void *arg2);
+
+static void udp_rcv_enqueue(queue_t *q, udp_t *udp, mblk_t *mp,
+ uint_t pkt_len);
+static void udp_rcv_drain(queue_t *q, udp_t *udp, boolean_t closing);
+static void udp_enter(conn_t *, mblk_t *, sqproc_t, uint8_t);
+static void udp_exit(conn_t *);
+static void udp_become_writer(conn_t *, mblk_t *, sqproc_t, uint8_t);
+#ifdef DEBUG
+static void udp_mode_assertions(udp_t *, int);
+#endif /* DEBUG */
major_t UDP6_MAJ;
-#define UDP6 "udp6"
+#define UDP6 "udp6"
+
+#define UDP_RECV_HIWATER (56 * 1024)
+#define UDP_RECV_LOWATER 128
+#define UDP_XMIT_HIWATER (56 * 1024)
+#define UDP_XMIT_LOWATER 1024
-#define UDP_MAXPACKET_IPV4 \
- (IP_MAXPACKET - UDPH_SIZE - IP_SIMPLE_HDR_LENGTH)
-#define UDP_MAXPACKET_IPV6 \
- (IP_MAXPACKET - UDPH_SIZE - IPV6_HDR_LEN)
+static struct module_info udp_info = {
+ UDP_MOD_ID, UDP_MOD_NAME, 1, INFPSZ, UDP_RECV_HIWATER, UDP_RECV_LOWATER
+};
+
+static struct qinit udp_rinit = {
+ (pfi_t)udp_rput, NULL, udp_open, udp_close, NULL,
+ &udp_info, NULL, udp_rrw, udp_rinfop, STRUIOT_STANDARD
+};
-static struct module_info info = {
- 5607, "udp", 1, INFPSZ, 512, 128
+static struct qinit udp_winit = {
+ (pfi_t)udp_wput, NULL, NULL, NULL, NULL,
+ &udp_info, NULL, NULL, NULL, STRUIOT_NONE
};
-static struct qinit rinit = {
- (pfi_t)udp_rput, NULL, udp_open, udp_close, NULL, &info
+/* Support for just SNMP if UDP is not pushed directly over device IP */
+struct qinit udp_snmp_rinit = {
+ (pfi_t)putnext, NULL, udp_open, ip_snmpmod_close, NULL,
+ &udp_info, NULL, NULL, NULL, STRUIOT_NONE
};
-static struct qinit winit = {
- (pfi_t)udp_wput, NULL, NULL, NULL, NULL, &info
+struct qinit udp_snmp_winit = {
+ (pfi_t)ip_snmpmod_wput, NULL, udp_open, ip_snmpmod_close, NULL,
+ &udp_info, NULL, NULL, NULL, STRUIOT_NONE
};
struct streamtab udpinfo = {
- &rinit, &winit
+ &udp_rinit, &udp_winit
};
static sin_t sin_null; /* Zero address for quick clears */
static sin6_t sin6_null; /* Zero address for quick clears */
-/* Protected by udp_g_lock */
-static void *udp_g_head; /* Head for list of open udp streams. */
-kmutex_t udp_g_lock; /* Protects the above variable */
-
/* Hint not protected by any lock */
static in_port_t udp_g_next_port_to_try;
/*
- * Extra privileged ports. In host byte order. Protected by udp_g_lock.
+ * Extra privileged ports. In host byte order.
*/
#define UDP_NUM_EPRIV_PORTS 64
static int udp_g_num_epriv_ports = UDP_NUM_EPRIV_PORTS;
@@ -273,6 +393,7 @@ static IDP udp_g_nd; /* Points to table of UDP ND variables. */
static mib2_udp_t udp_mib; /* SNMP fixed size info */
static kstat_t *udp_mibkp; /* kstat exporting udp_mib data */
+#define UDP_MAXPACKET_IPV4 (IP_MAXPACKET - UDPH_SIZE - IP_SIMPLE_HDR_LENGTH)
/* Default structure copied into T_INFO_ACK messages */
static struct T_info_ack udp_g_t_info_ack_ipv4 = {
@@ -289,6 +410,8 @@ static struct T_info_ack udp_g_t_info_ack_ipv4 = {
(XPG4_1|SENDZERO) /* PROVIDER_flag */
};
+#define UDP_MAXPACKET_IPV6 (IP_MAXPACKET - UDPH_SIZE - IPV6_HDR_LEN)
+
static struct T_info_ack udp_g_t_info_ack_ipv6 = {
T_INFO_ACK,
UDP_MAXPACKET_IPV6, /* TSDU_size. Excl. headers */
@@ -311,33 +434,23 @@ static struct T_info_ack udp_g_t_info_ack_ipv6 = {
* in udp_open.
* All of these are alterable, within the min/max values given, at run time.
*/
-static udpparam_t udp_param_arr[] = {
- /* min max value name */
- { 0L, 256, 32, "udp_wroff_extra" },
- { 1L, 255, 255, "udp_ipv4_ttl" },
- { 0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS, "udp_ipv6_hoplimit"},
- { 1024, (32 * 1024), 1024, "udp_smallest_nonpriv_port" },
- { 0, 1, 1, "udp_do_checksum" },
- { 1024, UDP_MAX_PORT, (32 * 1024), "udp_smallest_anon_port" },
- { 1024, UDP_MAX_PORT, UDP_MAX_PORT, "udp_largest_anon_port" },
- { 4096, 1024*1024, 56*1024, "udp_xmit_hiwat"},
- { 0, 1024*1024, 1024, "udp_xmit_lowat"},
- { 4096, 1024*1024, 56*1024, "udp_recv_hiwat"},
- { 65536, 1024*1024*1024, 2*1024*1024, "udp_max_buf"},
- { 100, 60000, 1000, "udp_ndd_get_info_interval"},
+/* BEGIN CSTYLED */
+udpparam_t udp_param_arr[] = {
+ /*min max value name */
+ { 0L, 256, 32, "udp_wroff_extra" },
+ { 1L, 255, 255, "udp_ipv4_ttl" },
+ { 0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS, "udp_ipv6_hoplimit"},
+ { 1024, (32 * 1024), 1024, "udp_smallest_nonpriv_port" },
+ { 0, 1, 1, "udp_do_checksum" },
+ { 1024, UDP_MAX_PORT, (32 * 1024), "udp_smallest_anon_port" },
+ { 1024, UDP_MAX_PORT, UDP_MAX_PORT, "udp_largest_anon_port" },
+ { UDP_XMIT_LOWATER, (1<<30), UDP_XMIT_HIWATER, "udp_xmit_hiwat"},
+ { 0, (1<<30), UDP_XMIT_LOWATER, "udp_xmit_lowat"},
+ { UDP_RECV_LOWATER, (1<<30), UDP_RECV_HIWATER, "udp_recv_hiwat"},
+ { 65536, (1<<30), 2*1024*1024, "udp_max_buf"},
+ { 100, 60000, 1000, "udp_ndd_get_info_interval"},
};
-#define udp_wroff_extra udp_param_arr[0].udp_param_value
-#define udp_ipv4_ttl udp_param_arr[1].udp_param_value
-#define udp_ipv6_hoplimit udp_param_arr[2].udp_param_value
-#define udp_smallest_nonpriv_port udp_param_arr[3].udp_param_value
-#define udp_do_checksum udp_param_arr[4].udp_param_value
-#define udp_smallest_anon_port udp_param_arr[5].udp_param_value
-#define udp_largest_anon_port udp_param_arr[6].udp_param_value
-#define udp_xmit_hiwat udp_param_arr[7].udp_param_value
-#define udp_xmit_lowat udp_param_arr[8].udp_param_value
-#define udp_recv_hiwat udp_param_arr[9].udp_param_value
-#define udp_max_buf udp_param_arr[10].udp_param_value
-#define udp_ndd_get_info_interval udp_param_arr[11].udp_param_value
+/* END CSTYLED */
/*
 * The smallest anonymous port in the privileged port range which UDP
@@ -354,9 +467,434 @@ uint32_t udp_random_anon_port = 1;
*/
void (*cl_inet_bind)(uchar_t protocol, sa_family_t addr_family,
- uint8_t *laddrp, in_port_t lport) = NULL;
+ uint8_t *laddrp, in_port_t lport) = NULL;
void (*cl_inet_unbind)(uint8_t protocol, sa_family_t addr_family,
- uint8_t *laddrp, in_port_t lport) = NULL;
+ uint8_t *laddrp, in_port_t lport) = NULL;
+
+typedef union T_primitives *t_primp_t;
+
+#define UDP_ENQUEUE_MP(udp, mp, proc, tag) { \
+ ASSERT((mp)->b_prev == NULL && (mp)->b_queue == NULL); \
+ ASSERT(MUTEX_HELD(&(udp)->udp_connp->conn_lock)); \
+ (mp)->b_queue = (queue_t *)((uintptr_t)tag); \
+ (mp)->b_prev = (mblk_t *)proc; \
+ if ((udp)->udp_mphead == NULL) \
+ (udp)->udp_mphead = (mp); \
+ else \
+ (udp)->udp_mptail->b_next = (mp); \
+ (udp)->udp_mptail = (mp); \
+ (udp)->udp_mpcount++; \
+}
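
The macro above stashes the handler and tag in mblk fields that are otherwise
unused while a message sits on the internal queue (hence the ASSERT that
b_prev and b_queue are NULL on entry); the dequeue side in
udp_switch_to_squeue() further below recovers them symmetrically:

    proc = (sqproc_t)mp->b_prev;
    tag = (uint8_t)((uintptr_t)mp->b_queue);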
+
+#define UDP_READERS_INCREF(udp) { \
+ ASSERT(MUTEX_HELD(&(udp)->udp_connp->conn_lock)); \
+ (udp)->udp_reader_count++; \
+}
+
+#define UDP_READERS_DECREF(udp) { \
+ ASSERT(MUTEX_HELD(&(udp)->udp_connp->conn_lock)); \
+ (udp)->udp_reader_count--; \
+ if ((udp)->udp_reader_count == 0) \
+ cv_broadcast(&(udp)->udp_connp->conn_cv); \
+}
+
+#define UDP_SQUEUE_DECREF(udp) { \
+ ASSERT(MUTEX_HELD(&(udp)->udp_connp->conn_lock)); \
+ (udp)->udp_squeue_count--; \
+ if ((udp)->udp_squeue_count == 0) \
+ cv_broadcast(&(udp)->udp_connp->conn_cv); \
+}
+
+/*
+ * Notes on UDP endpoint synchronization:
+ *
+ * UDP needs exclusive operation on a per endpoint basis when executing
+ * functions that modify the endpoint state. udp_rput_other() deals with
+ * packets with IP options, and processing these packets ends up having
+ * to update the endpoint's option related state. udp_wput_other() deals
+ * with control operations from the top, e.g. connect() that needs to
+ * update the endpoint state. These could be synchronized using locks,
+ * but the current version uses squeues for this purpose. squeues may
+ * give performance improvement for certain cases such as connected UDP
+ * sockets; thus the framework allows for using squeues.
+ *
+ * The perimeter routines are described as follows:
+ *
+ * udp_enter():
+ * Enter the UDP endpoint perimeter.
+ *
+ * udp_become_writer():
+ * Become exclusive on the UDP endpoint. Specifies a function
+ * that will be called exclusively either immediately or later
+ * when the perimeter is available exclusively.
+ *
+ * udp_exit():
+ * Exit the UDP perimeter.
+ *
+ * Entering UDP from the top or from the bottom must be done using
+ * udp_enter(). No lock must be held while attempting to enter the UDP
+ * perimeter. When finished, udp_exit() must be called to get out of
+ * the perimeter.
+ *
+ * UDP operates in either MT_HOT mode or in SQUEUE mode. In MT_HOT mode,
+ * multiple threads may enter a UDP endpoint concurrently. This is used
+ * for sending and/or receiving normal data. Control operations and other
+ * special cases call udp_become_writer() to become exclusive on a per
+ * endpoint basis and this results in transitioning to SQUEUE mode. squeue
+ * by definition serializes access to the conn_t. When there are no more
+ * pending messages on the squeue for the UDP connection, the endpoint
+ * reverts to MT_HOT mode. During the interregnum when not all MT threads
+ * of an endpoint have finished, messages are queued in the UDP endpoint
+ * and the UDP is in UDP_MT_QUEUED mode or UDP_QUEUED_SQUEUE mode.
+ *
+ * These modes have the following analogs:
+ *
+ * UDP_MT_HOT/udp_reader_count==0 none
+ * UDP_MT_HOT/udp_reader_count>0 RW_READ_LOCK
+ * UDP_MT_QUEUED RW_WRITE_WANTED
+ * UDP_SQUEUE or UDP_QUEUED_SQUEUE RW_WRITE_LOCKED
+ *
+ * Stable modes: UDP_MT_HOT, UDP_SQUEUE
+ * Transient modes: UDP_MT_QUEUED, UDP_QUEUED_SQUEUE
+ *
+ * While in stable modes, UDP keeps track of the number of threads
+ * operating on the endpoint. The udp_reader_count variable represents
+ * the number of threads entering the endpoint as readers while it is
+ * in UDP_MT_HOT mode. Transitioning to UDP_SQUEUE happens when there
+ * is only a single reader, i.e. when this counter drops to 1. Likewise,
+ * udp_squeue_count represents the number of threads operating on the
+ * endpoint's squeue while it is in UDP_SQUEUE mode. The mode transition
+ * to UDP_MT_HOT happens after the last thread exits the endpoint, i.e.
+ * when this counter drops to 0.
+ *
+ * The default mode is set to UDP_MT_HOT and UDP alternates between
+ * UDP_MT_HOT and UDP_SQUEUE as shown in the state transition below.
+ *
+ * Mode transition:
+ * ----------------------------------------------------------------
+ * old mode Event New mode
+ * ----------------------------------------------------------------
+ * UDP_MT_HOT Call to udp_become_writer() UDP_SQUEUE
+ * and udp_reader_count == 1
+ *
+ * UDP_MT_HOT Call to udp_become_writer() UDP_MT_QUEUED
+ * and udp_reader_count > 1
+ *
+ * UDP_MT_QUEUED udp_reader_count drops to zero UDP_QUEUED_SQUEUE
+ *
+ * UDP_QUEUED_SQUEUE All messages enqueued on the UDP_SQUEUE
+ * internal UDP queue successfully
+ * moved to squeue AND udp_squeue_count != 0
+ *
+ * UDP_QUEUED_SQUEUE All messages enqueued on the UDP_MT_HOT
+ * internal UDP queue successfully
+ * moved to squeue AND udp_squeue_count
+ * drops to zero
+ *
+ * UDP_SQUEUE udp_squeue_count drops to zero UDP_MT_HOT
+ * ----------------------------------------------------------------
+ */
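
The analogy table reads as a hand-rolled reader/writer lock. A conventional
rwlock rendering of the same policy, purely illustrative (the note above
explains why squeues were chosen over plain locks), would be:

    static krwlock_t endpoint_rwlock;	/* hypothetical */

    static void
    endpoint_data_op(void)
    {
	    rw_enter(&endpoint_rwlock, RW_READER);	/* ~ udp_enter(), UDP_MT_HOT */
	    /* ... send/receive normal data, many threads at once ... */
	    rw_exit(&endpoint_rwlock);			/* ~ udp_exit() */
    }

    static void
    endpoint_ctl_op(void)
    {
	    rw_enter(&endpoint_rwlock, RW_WRITER);	/* ~ udp_become_writer() */
	    /* ... connect()/option processing, fully serialized ... */
	    rw_exit(&endpoint_rwlock);
    }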
+
+static queue_t *
+UDP_WR(queue_t *q)
+{
+ ASSERT(q->q_ptr == NULL && _OTHERQ(q)->q_ptr == NULL);
+ ASSERT(WR(q)->q_next != NULL && WR(q)->q_next->q_ptr != NULL);
+ ASSERT(IPCL_IS_UDP(Q_TO_CONN(WR(q)->q_next)));
+
+ return (_WR(q)->q_next);
+}
+
+static queue_t *
+UDP_RD(queue_t *q)
+{
+ ASSERT(q->q_ptr != NULL && _OTHERQ(q)->q_ptr != NULL);
+ ASSERT(IPCL_IS_UDP(Q_TO_CONN(q)));
+ ASSERT(RD(q)->q_next != NULL && RD(q)->q_next->q_ptr == NULL);
+
+ return (_RD(q)->q_next);
+}
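
A hypothetical helper showing rules 1 and 3 of the plumbing notes in action:
the conn_t is taken from the ip-side queue, and a reply is sent upstream from
/dev/ip context through the udp-side read queue, mirroring how udp_err_ack()
below replies with putnext(UDP_RD(q), mp):

    static void
    udp_example_reply(queue_t *ip_q, mblk_t *mp)
    {
	    conn_t *connp = Q_TO_CONN(ip_q);	/* rule 1: conn_t lives on ip_q */

	    /* ... consult or update connp->conn_udp ... */
	    putnext(UDP_RD(ip_q), mp);	/* rule 3: appears to come from udp_rq */
    }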
+
+#ifdef DEBUG
+#define UDP_MODE_ASSERTIONS(udp, caller) udp_mode_assertions(udp, caller)
+#else
+#define UDP_MODE_ASSERTIONS(udp, caller)
+#endif
+
+/* Invariants */
+#ifdef DEBUG
+
+uint32_t udp_count[4];
+
+/* Context of udp_mode_assertions */
+#define UDP_ENTER 1
+#define UDP_BECOME_WRITER 2
+#define UDP_EXIT 3
+
+static void
+udp_mode_assertions(udp_t *udp, int caller)
+{
+ ASSERT(MUTEX_HELD(&udp->udp_connp->conn_lock));
+
+ switch (udp->udp_mode) {
+ case UDP_MT_HOT:
+ /*
+ * Messages have not yet been enqueued on the internal queue,
+ * otherwise we would have switched to UDP_MT_QUEUED. Likewise
+ * by definition, there can't be any messages enqueued on the
+ * squeue. The UDP could be quiescent, so udp_reader_count
+ * could be zero at entry.
+ */
+ ASSERT(udp->udp_mphead == NULL && udp->udp_mpcount == 0 &&
+ udp->udp_squeue_count == 0);
+ ASSERT(caller == UDP_ENTER || udp->udp_reader_count != 0);
+ udp_count[0]++;
+ break;
+
+ case UDP_MT_QUEUED:
+ /*
+ * The last MT thread to exit the udp perimeter empties the
+ * internal queue and then switches the UDP to
+ * UDP_QUEUED_SQUEUE mode. Since we are still in UDP_MT_QUEUED
+ * mode, it means there must be at least 1 MT thread still in
+ * the perimeter and at least 1 message on the internal queue.
+ */
+ ASSERT(udp->udp_reader_count >= 1 && udp->udp_mphead != NULL &&
+ udp->udp_mpcount != 0 && udp->udp_squeue_count == 0);
+ udp_count[1]++;
+ break;
+
+ case UDP_QUEUED_SQUEUE:
+ /*
+		 * The switch has happened from MT to SQUEUE. So there can't be
+		 * any MT threads. Messages could still pile up on the internal
+ * queue until the transition is complete and we move to
+ * UDP_SQUEUE mode. We can't assert on nonzero udp_squeue_count
+ * since the squeue could drain any time.
+ */
+ ASSERT(udp->udp_reader_count == 0);
+ udp_count[2]++;
+ break;
+
+ case UDP_SQUEUE:
+ /*
+		 * The transition is complete. There can't be any messages on
+		 * the internal queue. The udp could be quiescent or the squeue
+		 * could drain any time, so we can't assert on nonzero
+		 * udp_squeue_count during entry. Nor can we assert that
+		 * udp_reader_count is zero, since a reader thread could have
+		 * become writer directly in line by calling udp_become_writer
+		 * without going through the queued states.
+ */
+ ASSERT(udp->udp_mphead == NULL && udp->udp_mpcount == 0);
+ ASSERT(caller == UDP_ENTER || udp->udp_squeue_count != 0);
+ udp_count[3]++;
+ break;
+ }
+}
+#endif
+
+#define _UDP_ENTER(connp, mp, proc, tag) { \
+ udp_t *_udp = (connp)->conn_udp; \
+ \
+ mutex_enter(&(connp)->conn_lock); \
+ if ((connp)->conn_state_flags & CONN_CLOSING) { \
+ mutex_exit(&(connp)->conn_lock); \
+ freemsg(mp); \
+ } else { \
+ UDP_MODE_ASSERTIONS(_udp, UDP_ENTER); \
+ \
+ switch (_udp->udp_mode) { \
+ case UDP_MT_HOT: \
+ /* We can execute as reader right away. */ \
+ UDP_READERS_INCREF(_udp); \
+ mutex_exit(&(connp)->conn_lock); \
+ (*(proc))(connp, mp, (connp)->conn_sqp); \
+ break; \
+ \
+ case UDP_SQUEUE: \
+ /* \
+ * We are in squeue mode, send the \
+ * packet to the squeue \
+ */ \
+ _udp->udp_squeue_count++; \
+ CONN_INC_REF_LOCKED(connp); \
+ mutex_exit(&(connp)->conn_lock); \
+ squeue_enter((connp)->conn_sqp, mp, proc, \
+ connp, tag); \
+ break; \
+ \
+ case UDP_MT_QUEUED: \
+ case UDP_QUEUED_SQUEUE: \
+ /* \
+ * Some messages may have been enqueued \
+ * ahead of us. Enqueue the new message \
+ * at the tail of the internal queue to \
+ * preserve message ordering. \
+ */ \
+ UDP_ENQUEUE_MP(_udp, mp, proc, tag); \
+ mutex_exit(&(connp)->conn_lock); \
+ break; \
+ } \
+ } \
+}
+
+static void
+udp_enter(conn_t *connp, mblk_t *mp, sqproc_t proc, uint8_t tag)
+{
+ _UDP_ENTER(connp, mp, proc, tag);
+}
+
+static void
+udp_become_writer(conn_t *connp, mblk_t *mp, sqproc_t proc, uint8_t tag)
+{
+ udp_t *udp;
+
+ udp = connp->conn_udp;
+
+ mutex_enter(&connp->conn_lock);
+
+ UDP_MODE_ASSERTIONS(udp, UDP_BECOME_WRITER);
+
+ switch (udp->udp_mode) {
+ case UDP_MT_HOT:
+ if (udp->udp_reader_count == 1) {
+ /*
+ * We are the only MT thread. Switch to squeue mode
+ * immediately.
+ */
+ udp->udp_mode = UDP_SQUEUE;
+ udp->udp_squeue_count = 1;
+ CONN_INC_REF_LOCKED(connp);
+ mutex_exit(&connp->conn_lock);
+ squeue_enter(connp->conn_sqp, mp, proc, connp, tag);
+ return;
+ }
+ /* FALLTHRU */
+
+ case UDP_MT_QUEUED:
+ /* Enqueue the packet internally in UDP */
+ udp->udp_mode = UDP_MT_QUEUED;
+ UDP_ENQUEUE_MP(udp, mp, proc, tag);
+ mutex_exit(&connp->conn_lock);
+ return;
+
+ case UDP_SQUEUE:
+ case UDP_QUEUED_SQUEUE:
+ /*
+		 * We are already exclusive, i.e. we are already the
+		 * writer. Simply call the desired function.
+ */
+ udp->udp_squeue_count++;
+ mutex_exit(&connp->conn_lock);
+ (*proc)(connp, mp, connp->conn_sqp);
+ return;
+ }
+}
+
+/*
+ * Transition from MT mode to SQUEUE mode, when the last MT thread
+ * is exiting the UDP perimeter. Move all messages from the internal
+ * udp queue to the squeue. A better way would be to move all the
+ * messages in one shot; that needs more support from the squeue framework.
+ */
+static void
+udp_switch_to_squeue(udp_t *udp)
+{
+ mblk_t *mp;
+ mblk_t *mp_next;
+ sqproc_t proc;
+ uint8_t tag;
+ conn_t *connp = udp->udp_connp;
+
+ ASSERT(MUTEX_HELD(&connp->conn_lock));
+ ASSERT(udp->udp_mode == UDP_MT_QUEUED);
+ while (udp->udp_mphead != NULL) {
+ mp = udp->udp_mphead;
+ udp->udp_mphead = NULL;
+ udp->udp_mptail = NULL;
+ udp->udp_mpcount = 0;
+ udp->udp_mode = UDP_QUEUED_SQUEUE;
+ mutex_exit(&connp->conn_lock);
+ /*
+ * It is best not to hold any locks across the calls
+ * to squeue functions. Since we drop the lock we
+		 * need to go back and check udp_mphead once again
+		 * after the squeue_fill, hence the while loop at
+		 * the top of this function.
+ */
+ for (; mp != NULL; mp = mp_next) {
+ mp_next = mp->b_next;
+ proc = (sqproc_t)mp->b_prev;
+ tag = (uint8_t)((uintptr_t)mp->b_queue);
+ mp->b_next = NULL;
+ mp->b_prev = NULL;
+ mp->b_queue = NULL;
+ CONN_INC_REF(connp);
+ udp->udp_squeue_count++;
+ squeue_fill(connp->conn_sqp, mp, proc, connp,
+ tag);
+ }
+ mutex_enter(&connp->conn_lock);
+ }
+ /*
+ * udp_squeue_count of zero implies that the squeue has drained
+ * even before we arrived here (i.e. after the squeue_fill above)
+ */
+ udp->udp_mode = (udp->udp_squeue_count != 0) ?
+ UDP_SQUEUE : UDP_MT_HOT;
+}
+
+#define _UDP_EXIT(connp) { \
+ udp_t *_udp = (connp)->conn_udp; \
+ \
+ mutex_enter(&(connp)->conn_lock); \
+ UDP_MODE_ASSERTIONS(_udp, UDP_EXIT); \
+ \
+ switch (_udp->udp_mode) { \
+ case UDP_MT_HOT: \
+ UDP_READERS_DECREF(_udp); \
+ mutex_exit(&(connp)->conn_lock); \
+ break; \
+ \
+ case UDP_SQUEUE: \
+ UDP_SQUEUE_DECREF(_udp); \
+ if (_udp->udp_squeue_count == 0) \
+ _udp->udp_mode = UDP_MT_HOT; \
+ mutex_exit(&(connp)->conn_lock); \
+ break; \
+ \
+ case UDP_MT_QUEUED: \
+ /* \
+ * If this is the last MT thread, we need to \
+ * switch to squeue mode \
+ */ \
+ UDP_READERS_DECREF(_udp); \
+ if (_udp->udp_reader_count == 0) \
+ udp_switch_to_squeue(_udp); \
+ mutex_exit(&(connp)->conn_lock); \
+ break; \
+ \
+ case UDP_QUEUED_SQUEUE: \
+ UDP_SQUEUE_DECREF(_udp); \
+ /* \
+ * Even if the udp_squeue_count drops to zero, we \
+ * don't want to change udp_mode to UDP_MT_HOT here. \
+ * The thread in udp_switch_to_squeue will take care \
+ * of the transition to UDP_MT_HOT, after emptying \
+ * any more new messages that have been enqueued in \
+ * udp_mphead. \
+ */ \
+ mutex_exit(&(connp)->conn_lock); \
+ break; \
+ } \
+}
+
+static void
+udp_exit(conn_t *connp)
+{
+ _UDP_EXIT(connp);
+}
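
The intended calling convention is visible in udp_resume_bind() and its
callback further below: an entry point hands the message to udp_enter() along
with an sqproc_t, and that callback calls udp_exit() once the work is done. A
minimal hypothetical handler following the same convention (the 0 tag is
illustrative, not an actual SQTAG_* constant):

    /* ARGSUSED */
    static void
    udp_example_cb(void *arg, mblk_t *mp, void *arg2)
    {
	    conn_t *connp = arg;

	    /* ... do the real work against connp->conn_udp ... */
	    freemsg(mp);		/* placeholder */
	    udp_exit(connp);		/* leave the perimeter */
    }

    static void
    udp_example_entry(conn_t *connp, mblk_t *mp)
    {
	    udp_enter(connp, mp, udp_example_cb, 0);
    }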
/*
 * Return the next anonymous port in the privileged port range for
@@ -379,9 +917,13 @@ static int
udp_bind_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
{
udp_fanout_t *udpf;
- udp_t *udp;
int i;
zoneid_t zoneid;
+ conn_t *connp;
+ udp_t *udp;
+
+ connp = Q_TO_CONN(q);
+ udp = connp->conn_udp;
/* Refer to comments in udp_status_report(). */
if (cr == NULL || secpolicy_net_config(cr, B_TRUE) != 0) {
@@ -403,8 +945,7 @@ udp_bind_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
" zone lport src addr dest addr port state");
/* 1234 12345 xxx.xxx.xxx.xxx xxx.xxx.xxx.xxx 12345 UNBOUND */
- udp = (udp_t *)q->q_ptr;
- zoneid = udp->udp_zoneid;
+ zoneid = connp->conn_zoneid;
for (i = 0; i < udp_bind_fanout_size; i++) {
udpf = &udp_bind_fanout[i];
@@ -415,7 +956,7 @@ udp_bind_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
if (zoneid != GLOBAL_ZONEID) {
/* skip to first entry in this zone; might be none */
while (udp != NULL &&
- udp->udp_zoneid != zoneid)
+ udp->udp_connp->conn_zoneid != zoneid)
udp = udp->udp_bind_hash;
}
if (udp != NULL) {
@@ -432,7 +973,7 @@ udp_bind_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
}
for (; udp != NULL; udp = udp->udp_bind_hash) {
if (zoneid == GLOBAL_ZONEID ||
- zoneid == udp->udp_zoneid)
+ zoneid == udp->udp_connp->conn_zoneid)
udp_report_item(mp->b_cont, udp);
}
}
@@ -542,7 +1083,6 @@ udp_bind(queue_t *q, mblk_t *mp)
in_port_t port; /* Host byte order */
in_port_t requested_port; /* Host byte order */
struct T_bind_req *tbr;
- udp_t *udp;
int count;
in6_addr_t v6src;
boolean_t bind_to_req_port_only;
@@ -550,8 +1090,11 @@ udp_bind(queue_t *q, mblk_t *mp)
udp_fanout_t *udpf;
in_port_t lport; /* Network byte order */
zoneid_t zoneid;
+ conn_t *connp;
+ udp_t *udp;
- udp = (udp_t *)q->q_ptr;
+ connp = Q_TO_CONN(q);
+ udp = connp->conn_udp;
if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
"udp_bind: bad req, len %u",
@@ -559,6 +1102,7 @@ udp_bind(queue_t *q, mblk_t *mp)
udp_err_ack(q, mp, TPROTO, 0);
return;
}
+
if (udp->udp_state != TS_UNBND) {
(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
"udp_bind: bad state, %u", udp->udp_state);
@@ -673,7 +1217,7 @@ udp_bind(queue_t *q, mblk_t *mp)
}
if (priv) {
- cred_t *cr = DB_CREDDEF(mp, udp->udp_credp);
+ cred_t *cr = DB_CREDDEF(mp, connp->conn_cred);
if (secpolicy_net_privaddr(cr, port) != 0) {
udp_err_ack(q, mp, TACCES, 0);
@@ -736,7 +1280,7 @@ udp_bind(queue_t *q, mblk_t *mp)
loopmax = udp_largest_anon_port - udp_smallest_anon_port + 1;
}
- zoneid = udp->udp_zoneid;
+ zoneid = connp->conn_zoneid;
for (;;) {
udp_t *udp1;
boolean_t is_inaddr_any;
@@ -753,7 +1297,7 @@ udp_bind(queue_t *q, mblk_t *mp)
for (udp1 = udpf->uf_udp; udp1 != NULL;
udp1 = udp1->udp_bind_hash) {
if (lport != udp1->udp_port ||
- zoneid != udp1->udp_zoneid)
+ zoneid != udp1->udp_connp->conn_zoneid)
continue;
/*
@@ -933,7 +1477,39 @@ udp_bind(queue_t *q, mblk_t *mp)
mp->b_cont->b_wptr += sizeof (ire_t);
mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE;
}
- putnext(q, mp);
+ if (udp->udp_family == AF_INET6)
+ mp = ip_bind_v6(q, mp, connp, NULL);
+ else
+ mp = ip_bind_v4(q, mp, connp);
+
+ if (mp != NULL)
+ udp_rput_other(_RD(q), mp);
+ else
+ CONN_INC_REF(connp);
+}
+
+
+void
+udp_resume_bind(conn_t *connp, mblk_t *mp)
+{
+ udp_enter(connp, mp, udp_resume_bind_cb, SQTAG_BIND_RETRY);
+}
+
+/*
+ * This is called from ip_wput_nondata to resume a deferred UDP bind.
+ */
+/* ARGSUSED */
+static void
+udp_resume_bind_cb(void *arg, mblk_t *mp, void *arg2)
+{
+ conn_t *connp = arg;
+
+ ASSERT(connp != NULL && IPCL_IS_UDP(connp));
+
+ udp_rput_other(connp->conn_rq, mp);
+
+ CONN_OPER_PENDING_DONE(connp);
+ udp_exit(connp);
}
/*
@@ -958,15 +1534,16 @@ udp_connect(queue_t *q, mblk_t *mp)
sin6_t *sin6;
sin_t *sin;
struct T_conn_req *tcr;
- udp_t *udp, *udp1;
in6_addr_t v6dst;
ipaddr_t v4dst;
uint16_t dstport;
uint32_t flowinfo;
mblk_t *mp1, *mp2;
udp_fanout_t *udpf;
+ udp_t *udp, *udp1;
+
+ udp = Q_TO_UDP(q);
- udp = (udp_t *)q->q_ptr;
tcr = (struct T_conn_req *)mp->b_rptr;
/* A bit of sanity checking */
@@ -987,6 +1564,7 @@ udp_connect(queue_t *q, mblk_t *mp)
ASSERT(udp->udp_port != 0 && udp->udp_ptpbhn != NULL);
udpf = &udp_bind_fanout[UDP_BIND_HASH(udp->udp_port)];
+
if (udp->udp_state == TS_DATA_XFER) {
/* Already connected - clear out state */
mutex_enter(&udpf->uf_lock);
@@ -1185,20 +1763,67 @@ bind_failed:
linkb(mp1, mp);
linkb(mp1, mp2);
- putnext(q, mp1);
+ if (udp->udp_family == AF_INET)
+ mp1 = ip_bind_v4(q, mp1, udp->udp_connp);
+ else
+ mp1 = ip_bind_v6(q, mp1, udp->udp_connp, NULL);
+
+ if (mp1 != NULL)
+ udp_rput_other(_RD(q), mp1);
+ else
+ CONN_INC_REF(udp->udp_connp);
}
-/* This is the close routine for udp. It frees the per-stream data. */
static int
udp_close(queue_t *q)
{
- udp_t *udp = (udp_t *)q->q_ptr;
+ conn_t *connp = Q_TO_CONN(UDP_WR(q));
+ udp_t *udp;
+ queue_t *ip_rq = RD(UDP_WR(q));
- TRACE_1(TR_FAC_UDP, TR_UDP_CLOSE,
- "udp_close: q %p", q);
+ ASSERT(connp != NULL && IPCL_IS_UDP(connp));
+ udp = connp->conn_udp;
+
+ ip_quiesce_conn(connp);
+ /*
+ * Disable read-side synchronous stream
+ * interface and drain any queued data.
+ */
+ udp_rcv_drain(q, udp, B_TRUE);
+ ASSERT(!udp->udp_direct_sockfs);
qprocsoff(q);
+ /* restore IP module's high and low water marks to default values */
+ ip_rq->q_hiwat = ip_rq->q_qinfo->qi_minfo->mi_hiwat;
+ WR(ip_rq)->q_hiwat = WR(ip_rq)->q_qinfo->qi_minfo->mi_hiwat;
+ WR(ip_rq)->q_lowat = WR(ip_rq)->q_qinfo->qi_minfo->mi_lowat;
+
+ ASSERT(udp->udp_rcv_cnt == 0);
+ ASSERT(udp->udp_rcv_msgcnt == 0);
+ ASSERT(udp->udp_rcv_list_head == NULL);
+ ASSERT(udp->udp_rcv_list_tail == NULL);
+
+ /* connp is now single threaded. */
+ udp_close_free(connp);
+ /*
+ * Restore connp as an IP endpoint. We don't need
+ * any locks since we are now single threaded
+ */
+ connp->conn_flags &= ~IPCL_UDP;
+ connp->conn_state_flags &=
+ ~(CONN_CLOSING | CONN_CONDEMNED | CONN_QUIESCED);
+ return (0);
+}
+
+/*
+ * Called in the close path from IP (ip_quiesce_conn) to quiesce the conn
+ */
+void
+udp_quiesce_conn(conn_t *connp)
+{
+ udp_t *udp = connp->conn_udp;
+
if (cl_inet_unbind != NULL && udp->udp_state == TS_IDLE) {
/*
* Running in cluster mode - register unbind information
@@ -1215,16 +1840,30 @@ udp_close(queue_t *q)
}
udp_bind_hash_remove(udp, B_FALSE);
- mutex_enter(&udp_g_lock);
- /* Unlink the udp structure and release the minor device number. */
- mi_close_unlink(&udp_g_head, (IDP)udp);
- mutex_exit(&udp_g_lock);
+
+ mutex_enter(&connp->conn_lock);
+ while (udp->udp_reader_count != 0 || udp->udp_squeue_count != 0 ||
+ udp->udp_mode != UDP_MT_HOT) {
+ cv_wait(&connp->conn_cv, &connp->conn_lock);
+ }
+ mutex_exit(&connp->conn_lock);
+}
+
+void
+udp_close_free(conn_t *connp)
+{
+ udp_t *udp = connp->conn_udp;
+
/* If there are any options associated with the stream, free them. */
- if (udp->udp_ip_snd_options)
+ if (udp->udp_ip_snd_options) {
mi_free((char *)udp->udp_ip_snd_options);
+ udp->udp_ip_snd_options = NULL;
+ }
- if (udp->udp_ip_rcv_options)
+ if (udp->udp_ip_rcv_options) {
mi_free((char *)udp->udp_ip_rcv_options);
+ udp->udp_ip_rcv_options = NULL;
+ }
/* Free memory associated with sticky options */
if (udp->udp_sticky_hdrs_len != 0) {
@@ -1233,30 +1872,33 @@ udp_close(queue_t *q)
udp->udp_sticky_hdrs = NULL;
udp->udp_sticky_hdrs_len = 0;
}
+
if (udp->udp_sticky_ipp.ipp_fields & IPPF_HOPOPTS) {
kmem_free(udp->udp_sticky_ipp.ipp_hopopts,
udp->udp_sticky_ipp.ipp_hopoptslen);
+ udp->udp_sticky_ipp.ipp_hopopts = NULL;
}
if (udp->udp_sticky_ipp.ipp_fields & IPPF_RTDSTOPTS) {
kmem_free(udp->udp_sticky_ipp.ipp_rtdstopts,
udp->udp_sticky_ipp.ipp_rtdstoptslen);
+ udp->udp_sticky_ipp.ipp_rtdstopts = NULL;
}
if (udp->udp_sticky_ipp.ipp_fields & IPPF_RTHDR) {
kmem_free(udp->udp_sticky_ipp.ipp_rthdr,
udp->udp_sticky_ipp.ipp_rthdrlen);
+ udp->udp_sticky_ipp.ipp_rthdr = NULL;
}
if (udp->udp_sticky_ipp.ipp_fields & IPPF_DSTOPTS) {
kmem_free(udp->udp_sticky_ipp.ipp_dstopts,
udp->udp_sticky_ipp.ipp_dstoptslen);
+ udp->udp_sticky_ipp.ipp_dstopts = NULL;
}
udp->udp_sticky_ipp.ipp_fields &=
~(IPPF_HOPOPTS|IPPF_RTDSTOPTS|IPPF_RTHDR|IPPF_DSTOPTS);
- crfree(udp->udp_credp);
- /* Free the data structure */
- mi_close_free((IDP)udp);
- q->q_ptr = WR(q)->q_ptr = NULL;
- return (0);
+ udp->udp_connp = NULL;
+ connp->conn_udp = NULL;
+ kmem_cache_free(udp_cache, udp);
}
/*
@@ -1277,12 +1919,10 @@ udp_close(queue_t *q)
static void
udp_disconnect(queue_t *q, mblk_t *mp)
{
- udp_t *udp;
+ udp_t *udp = Q_TO_UDP(q);
mblk_t *mp1;
udp_fanout_t *udpf;
- udp = (udp_t *)q->q_ptr;
-
if (udp->udp_state != TS_DATA_XFER) {
(void) mi_strlog(q, 1, SL_ERROR|SL_TRACE,
"udp_disconnect: bad state, %u", udp->udp_state);
@@ -1331,7 +1971,16 @@ udp_disconnect(queue_t *q, mblk_t *mp)
/* Append the T_OK_ACK to the T_BIND_REQ for udp_rput */
linkb(mp1, mp);
- putnext(q, mp1);
+
+ if (udp->udp_family == AF_INET6)
+ mp1 = ip_bind_v6(q, mp1, udp->udp_connp, NULL);
+ else
+ mp1 = ip_bind_v4(q, mp1, udp->udp_connp);
+
+ if (mp1 != NULL)
+ udp_rput_other(_RD(q), mp1);
+ else
+ CONN_INC_REF(udp->udp_connp);
}
/* This routine creates a T_ERROR_ACK message and passes it upstream. */
@@ -1339,7 +1988,7 @@ static void
udp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error)
{
if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
- qreply(q, mp);
+ putnext(UDP_RD(q), mp);
}
/* Shorthand to generate and send TPI error acks to our client */
@@ -1355,7 +2004,7 @@ udp_err_ack_prim(queue_t *q, mblk_t *mp, int primitive, t_scalar_t t_error,
teackp->ERROR_prim = primitive;
teackp->TLI_error = t_error;
teackp->UNIX_error = sys_error;
- qreply(q, mp);
+ putnext(UDP_RD(q), mp);
}
}
@@ -1372,10 +2021,6 @@ udp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
return (0);
}
-/*
- * Hold udp_g_lock to prevent multiple threads from changing udp_g_epriv_ports
- * at the same time.
- */
/* ARGSUSED */
static int
udp_extra_priv_ports_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
@@ -1393,11 +2038,9 @@ udp_extra_priv_ports_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
return (EINVAL);
}
- mutex_enter(&udp_g_lock);
/* Check if the value is already in the list */
for (i = 0; i < udp_g_num_epriv_ports; i++) {
if (new_value == udp_g_epriv_ports[i]) {
- mutex_exit(&udp_g_lock);
return (EEXIST);
}
}
@@ -1407,20 +2050,14 @@ udp_extra_priv_ports_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
break;
}
if (i == udp_g_num_epriv_ports) {
- mutex_exit(&udp_g_lock);
return (EOVERFLOW);
}
/* Set the new value */
udp_g_epriv_ports[i] = (in_port_t)new_value;
- mutex_exit(&udp_g_lock);
return (0);
}
-/*
- * Hold udp_g_lock to prevent multiple threads from changing udp_g_epriv_ports
- * at the same time.
- */
/* ARGSUSED */
static int
udp_extra_priv_ports_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
@@ -1438,20 +2075,17 @@ udp_extra_priv_ports_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
return (EINVAL);
}
- mutex_enter(&udp_g_lock);
/* Check that the value is already in the list */
for (i = 0; i < udp_g_num_epriv_ports; i++) {
if (udp_g_epriv_ports[i] == new_value)
break;
}
if (i == udp_g_num_epriv_ports) {
- mutex_exit(&udp_g_lock);
return (ESRCH);
}
/* Clear the value */
udp_g_epriv_ports[i] = 0;
- mutex_exit(&udp_g_lock);
return (0);
}
@@ -1478,8 +2112,8 @@ udp_icmp_error(queue_t *q, mblk_t *mp)
sin6_t sin6;
mblk_t *mp1;
int error = 0;
- udp_t *udp = (udp_t *)q->q_ptr;
size_t mp_size = MBLKL(mp);
+ udp_t *udp = Q_TO_UDP(q);
/*
* Assume IP provides aligned packets - otherwise toss
@@ -1495,7 +2129,7 @@ udp_icmp_error(queue_t *q, mblk_t *mp)
*/
if (!udp->udp_dgram_errind || mp_size < sizeof (ipha_t)) {
noticmpv4:
- putnext(q, mp);
+ putnext(UDP_RD(q), mp);
return;
}
@@ -1590,7 +2224,7 @@ noticmpv4:
break;
}
if (mp1)
- putnext(q, mp1);
+ putnext(UDP_RD(q), mp1);
freemsg(mp);
}
@@ -1609,7 +2243,6 @@ noticmpv4:
static void
udp_icmp_error_ipv6(queue_t *q, mblk_t *mp)
{
- udp_t *udp = (udp_t *)q->q_ptr;
icmp6_t *icmp6;
ip6_t *ip6h, *outer_ip6h;
uint16_t hdr_length;
@@ -1619,13 +2252,14 @@ udp_icmp_error_ipv6(queue_t *q, mblk_t *mp)
mblk_t *mp1;
int error = 0;
size_t mp_size = MBLKL(mp);
+ udp_t *udp = Q_TO_UDP(q);
/*
* Verify that we have a complete IP header. If not, send it upstream.
*/
if (mp_size < sizeof (ip6_t)) {
noticmpv6:
- putnext(q, mp);
+ putnext(UDP_RD(q), mp);
return;
}
@@ -1736,7 +2370,7 @@ noticmpv6:
* message. Free it, then send our empty message.
*/
freemsg(mp);
- putnext(q, newmp);
+ putnext(UDP_RD(q), newmp);
return;
}
case ICMP6_TIME_EXCEEDED:
@@ -1766,7 +2400,7 @@ noticmpv6:
mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t), NULL, 0,
error);
if (mp1)
- putnext(q, mp1);
+ putnext(UDP_RD(q), mp1);
freemsg(mp);
}
@@ -1780,11 +2414,11 @@ noticmpv6:
static void
udp_addr_req(queue_t *q, mblk_t *mp)
{
- udp_t *udp = (udp_t *)q->q_ptr;
sin_t *sin;
sin6_t *sin6;
mblk_t *ackmp;
struct T_addr_ack *taa;
+ udp_t *udp = Q_TO_UDP(q);
/* Make it large enough for worst case */
ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
@@ -1894,7 +2528,7 @@ udp_addr_req(queue_t *q, mblk_t *mp)
}
}
ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
- qreply(q, ackmp);
+ putnext(UDP_RD(q), ackmp);
}
static void
@@ -1918,9 +2552,9 @@ udp_copy_info(struct T_info_ack *tap, udp_t *udp)
static void
udp_capability_req(queue_t *q, mblk_t *mp)
{
- udp_t *udp = (udp_t *)q->q_ptr;
t_uscalar_t cap_bits1;
struct T_capability_ack *tcap;
+ udp_t *udp = Q_TO_UDP(q);
cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
@@ -1937,7 +2571,7 @@ udp_capability_req(queue_t *q, mblk_t *mp)
tcap->CAP_bits1 |= TC1_INFO;
}
- qreply(q, mp);
+ putnext(UDP_RD(q), mp);
}
/*
@@ -1948,7 +2582,7 @@ udp_capability_req(queue_t *q, mblk_t *mp)
static void
udp_info_req(queue_t *q, mblk_t *mp)
{
- udp_t *udp = (udp_t *)q->q_ptr;
+ udp_t *udp = Q_TO_UDP(q);
/* Create a T_INFO_ACK message. */
mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
@@ -1956,7 +2590,7 @@ udp_info_req(queue_t *q, mblk_t *mp)
if (!mp)
return;
udp_copy_info((struct T_info_ack *)mp->b_rptr, udp);
- qreply(q, mp);
+ putnext(UDP_RD(q), mp);
}
/*
@@ -2102,20 +2736,19 @@ udp_ip_bind_mp(udp_t *udp, t_scalar_t bind_prim, t_scalar_t addr_length)
* This is the open routine for udp. It allocates a udp_t structure for
* the stream and, on the first open of the module, creates an ND table.
*/
+/* ARGSUSED */
static int
udp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
{
int err;
udp_t *udp;
+ conn_t *connp;
+ zoneid_t zoneid = getzoneid();
+ queue_t *ip_wq;
+ char *name;
TRACE_1(TR_FAC_UDP, TR_UDP_OPEN, "udp_open: q %p", q);
- /*
- * Defer the qprocson until everything is initialized since
- * we are D_MTPERQ and after qprocson the rput routine can
- * run.
- */
-
/* If the stream is already open, return immediately. */
if (q->q_ptr != NULL)
return (0);
@@ -2124,85 +2757,110 @@ udp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
if (sflag != MODOPEN)
return (EINVAL);
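+	/* Seed the queue watermarks from the default tunables before qprocson() */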
+ q->q_hiwat = udp_recv_hiwat;
+ WR(q)->q_hiwat = udp_xmit_hiwat;
+ WR(q)->q_lowat = udp_xmit_lowat;
+
+ /* Insert ourselves in the stream since we're about to walk q_next */
+ qprocson(q);
+
+ udp = kmem_cache_alloc(udp_cache, KM_SLEEP);
+ bzero(udp, sizeof (*udp));
+
/*
- * Create and initialize a udp_t structure for this stream.
+ * UDP is supported only as a module and it has to be pushed directly
+ * above the device instance of IP. If UDP is pushed anywhere else
+ * on a stream, it will support just T_SVR4_OPTMGMT_REQ for the
+ * sake of MIB browsers and fail everything else.
*/
- udp = (udp_t *)mi_open_alloc_sleep(sizeof (udp_t));
+ ip_wq = WR(q)->q_next;
+ if (ip_wq->q_next != NULL ||
+ (name = ip_wq->q_qinfo->qi_minfo->mi_idname) == NULL ||
+ strcmp(name, IP_MOD_NAME) != 0 ||
+ ip_wq->q_qinfo->qi_minfo->mi_idnum != IP_MOD_ID) {
+ /* Support just SNMP for MIB browsers */
+ connp = ipcl_conn_create(IPCL_IPCCONN, KM_SLEEP);
+ connp->conn_rq = q;
+ connp->conn_wq = WR(q);
+ connp->conn_flags |= IPCL_UDPMOD;
+ connp->conn_cred = credp;
+ connp->conn_zoneid = zoneid;
+ connp->conn_udp = udp;
+ udp->udp_connp = connp;
+ q->q_ptr = WR(q)->q_ptr = connp;
+ crhold(credp);
+ q->q_qinfo = &udp_snmp_rinit;
+ WR(q)->q_qinfo = &udp_snmp_winit;
+ return (0);
+ }
+
+ /*
+ * Initialize the udp_t structure for this stream.
+ */
+ q = RD(ip_wq);
+ connp = Q_TO_CONN(q);
+ mutex_enter(&connp->conn_lock);
+ connp->conn_proto = IPPROTO_UDP;
+ connp->conn_flags |= IPCL_UDP;
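+	/* Pick an squeue for this conn; lbolt merely spreads the selection */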
+ connp->conn_sqp = IP_SQUEUE_GET(lbolt);
+ connp->conn_udp = udp;
/* Set the initial state of the stream and the privilege status. */
- q->q_ptr = WR(q)->q_ptr = udp;
+ udp->udp_connp = connp;
udp->udp_state = TS_UNBND;
+ udp->udp_mode = UDP_MT_HOT;
if (getmajor(*devp) == (major_t)UDP6_MAJ) {
udp->udp_family = AF_INET6;
udp->udp_ipversion = IPV6_VERSION;
udp->udp_max_hdr_len = IPV6_HDR_LEN + UDPH_SIZE;
udp->udp_ttl = udp_ipv6_hoplimit;
+ connp->conn_af_isv6 = B_TRUE;
+ connp->conn_flags |= IPCL_ISV6;
} else {
udp->udp_family = AF_INET;
udp->udp_ipversion = IPV4_VERSION;
udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + UDPH_SIZE;
udp->udp_ttl = udp_ipv4_ttl;
+ connp->conn_af_isv6 = B_FALSE;
+ connp->conn_flags &= ~IPCL_ISV6;
}
- /*
- * The receive hiwat is only looked at on the stream head queue.
- * Store in q_hiwat in order to return on SO_RCVBUF getsockopts.
- */
- q->q_hiwat = udp_recv_hiwat;
-
udp->udp_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
- udp->udp_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
- udp->udp_credp = credp;
- crhold(credp);
-
- udp->udp_zoneid = getzoneid();
-
- /*
- * Acquire the lock and link it into the list of open streams.
- */
- mutex_enter(&udp_g_lock);
- err = mi_open_link(&udp_g_head, (IDP)udp, devp, flag, sflag, credp);
- mutex_exit(&udp_g_lock);
- if (err != 0)
- goto error;
+ connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
+ connp->conn_zoneid = zoneid;
- qprocson(q);
+ if (connp->conn_flags & IPCL_SOCKET) {
+ udp->udp_issocket = B_TRUE;
+ udp->udp_direct_sockfs = B_TRUE;
+ }
+ mutex_exit(&connp->conn_lock);
/*
* The transmit hiwat/lowat is only looked at on IP's queue.
- * Store in q_hiwat in order to return on SO_SNDBUF
+ * Store in q_hiwat in order to return on SO_SNDBUF/SO_RCVBUF
* getsockopts.
*/
+ q->q_hiwat = udp_recv_hiwat;
WR(q)->q_hiwat = udp_xmit_hiwat;
- WR(q)->q_next->q_hiwat = WR(q)->q_hiwat;
WR(q)->q_lowat = udp_xmit_lowat;
- WR(q)->q_next->q_lowat = WR(q)->q_lowat;
if (udp->udp_family == AF_INET6) {
/* Build initial header template for transmit */
if ((err = udp_build_hdrs(q, udp)) != 0) {
- qprocsoff(q);
- /*
- * Unlink the udp structure and release
- * the minor device number.
- */
- mutex_enter(&udp_g_lock);
- mi_close_unlink(&udp_g_head, (IDP)udp);
- mutex_exit(&udp_g_lock);
- goto error;
+ qprocsoff(UDP_RD(q));
+ udp->udp_connp = NULL;
+ connp->conn_udp = NULL;
+ kmem_cache_free(udp_cache, udp);
+ return (err);
}
}
- /* Set the Stream head write offset. */
- (void) mi_set_sth_wroff(q, udp->udp_max_hdr_len + udp_wroff_extra);
- (void) mi_set_sth_hiwat(q, q->q_hiwat);
- return (0);
+ /* Set the Stream head write offset and high watermark. */
+ (void) mi_set_sth_wroff(UDP_RD(q),
+ udp->udp_max_hdr_len + udp_wroff_extra);
+ (void) mi_set_sth_hiwat(UDP_RD(q), udp_set_rcv_hiwat(udp, q->q_hiwat));
-error:
- q->q_ptr = WR(q)->q_ptr = NULL;
- crfree(credp);
- mi_close_free((IDP)udp);
- return (err);
+ return (0);
}
/*
@@ -2212,7 +2870,6 @@ error:
static boolean_t
udp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name)
{
-
return (B_TRUE);
}
@@ -2255,15 +2912,22 @@ udp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
}
/*
- * This routine retrieves the current status of socket options.
- * It returns the size of the option retrieved.
+ * This routine retrieves the current status of socket options
+ * and expects the caller to pass in the queue pointer of the
+ * upper instance. It returns the size of the option retrieved.
*/
int
udp_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
{
int *i1 = (int *)ptr;
- udp_t *udp = (udp_t *)q->q_ptr;
- ip6_pkt_t *ipp = &udp->udp_sticky_ipp;
+ conn_t *connp;
+ udp_t *udp;
+ ip6_pkt_t *ipp;
+
+ q = UDP_WR(q);
+ connp = Q_TO_CONN(q);
+ udp = connp->conn_udp;
+ ipp = &udp->udp_sticky_ipp;
switch (level) {
case SOL_SOCKET:
@@ -2333,7 +2997,7 @@ udp_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
*(uchar_t *)ptr = udp->udp_multicast_ttl;
return (sizeof (uchar_t));
case IP_MULTICAST_LOOP:
- *ptr = udp->udp_multicast_loop;
+ *ptr = connp->conn_multicast_loop;
return (sizeof (uint8_t));
case IP_RECVOPTS:
*i1 = udp->udp_recvopts;
@@ -2394,7 +3058,7 @@ udp_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
*i1 = udp->udp_multicast_ttl;
break; /* goto sizeof (int) option return */
case IPV6_MULTICAST_LOOP:
- *i1 = udp->udp_multicast_loop;
+ *i1 = connp->conn_multicast_loop;
break; /* goto sizeof (int) option return */
case IPV6_JOIN_GROUP:
case IPV6_LEAVE_GROUP:
@@ -2520,18 +3184,26 @@ udp_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
return (sizeof (int));
}
-/* This routine sets socket options. */
+/*
+ * This routine sets socket options; it expects the caller
+ * to pass in the queue pointer of the upper instance.
+ */
/* ARGSUSED */
int
udp_opt_set(queue_t *q, uint_t optset_context, int level,
int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk)
{
- udp_t *udp = (udp_t *)q->q_ptr;
int *i1 = (int *)invalp;
boolean_t onoff = (*i1 == 0) ? 0 : 1;
boolean_t checkonly;
int error;
+ conn_t *connp;
+ udp_t *udp;
+
+ q = UDP_WR(q);
+ connp = Q_TO_CONN(q);
+ udp = connp->conn_udp;
switch (optset_context) {
case SETFN_OPTCOM_CHECKONLY:
@@ -2619,7 +3291,7 @@ udp_opt_set(queue_t *q, uint_t optset_context, int level,
}
if (!checkonly) {
q->q_hiwat = *i1;
- q->q_next->q_hiwat = *i1;
+ WR(UDP_RD(q))->q_hiwat = *i1;
}
break;
case SO_RCVBUF:
@@ -2629,7 +3301,9 @@ udp_opt_set(queue_t *q, uint_t optset_context, int level,
}
if (!checkonly) {
RD(q)->q_hiwat = *i1;
- (void) mi_set_sth_hiwat(RD(q), *i1);
+ UDP_RD(q)->q_hiwat = *i1;
+ (void) mi_set_sth_hiwat(UDP_RD(q),
+ udp_set_rcv_hiwat(udp, *i1));
}
break;
case SO_DGRAM_ERRIND:
@@ -2709,7 +3383,7 @@ udp_opt_set(queue_t *q, uint_t optset_context, int level,
break;
case IP_MULTICAST_LOOP:
if (!checkonly)
- udp->udp_multicast_loop = *invalp;
+ connp->conn_multicast_loop = *invalp;
break;
case IP_RECVOPTS:
if (!checkonly)
@@ -2847,7 +3521,7 @@ udp_opt_set(queue_t *q, uint_t optset_context, int level,
return (EINVAL);
}
if (!checkonly)
- udp->udp_multicast_loop = *i1;
+ connp->conn_multicast_loop = *i1;
break;
case IPV6_JOIN_GROUP:
case IPV6_LEAVE_GROUP:
@@ -3093,6 +3767,7 @@ udp_opt_set(queue_t *q, uint_t optset_context, int level,
ipp->ipp_rtdstopts = NULL;
ipp->ipp_rtdstoptslen = 0;
}
+
ipp->ipp_fields &= ~IPPF_RTDSTOPTS;
ipp->ipp_sticky_ignored |= IPPF_RTDSTOPTS;
} else {
@@ -3447,12 +4122,13 @@ udp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr)
}
static void
-udp_rput(queue_t *q, mblk_t *mp)
+udp_input(conn_t *connp, mblk_t *mp)
{
struct T_unitdata_ind *tudi;
- uchar_t *rptr;
- int hdr_length;
+ uchar_t *rptr; /* Pointer to IP header */
+ int hdr_length; /* Length of IP+UDP headers */
int udi_size; /* Size of T_unitdata_ind */
+ int mp_len;
udp_t *udp;
udpha_t *udpha;
int ipversion;
@@ -3462,104 +4138,56 @@ udp_rput(queue_t *q, mblk_t *mp)
mblk_t *mp1;
mblk_t *options_mp = NULL;
in_pktinfo_t *pinfo = NULL;
- size_t mp_size = MBLKL(mp);
cred_t *cr = NULL;
+ queue_t *q = connp->conn_rq;
pid_t cpid;
TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_START,
"udp_rput_start: q %p mp %p", q, mp);
- udp = (udp_t *)q->q_ptr;
+ udp = connp->conn_udp;
rptr = mp->b_rptr;
+ ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_CTL);
+ ASSERT(OK_32PTR(rptr));
- switch (mp->b_datap->db_type) {
- case M_DATA:
- /*
- * M_DATA messages contain IP datagrams. They are handled
- * after this switch.
- */
- break;
- case M_PROTO:
- case M_PCPROTO:
- /* M_PROTO messages contain some type of TPI message. */
- if ((mp->b_wptr - rptr) < sizeof (t_scalar_t)) {
- freemsg(mp);
- TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END,
- "udp_rput_end: q %p (%S)", q, "protoshort");
- return;
- }
- qwriter(q, mp, udp_rput_other, PERIM_INNER);
- TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END,
- "udp_rput_end: q %p (%S)", q, "proto");
- return;
- case M_FLUSH:
- if (*mp->b_rptr & FLUSHR)
- flushq(q, FLUSHDATA);
- putnext(q, mp);
- TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END,
- "udp_rput_end: q %p (%S)", q, "flush");
- return;
- case M_CTL:
- if (udp->udp_recvif || udp->udp_recvslla ||
- udp->udp_ipv6_recvpktinfo) {
+ /*
+	 * IP should have prepended any options data in an M_CTL.
+	 * Check the M_CTL "type" to make sure we are not here
+	 * because of a valid ICMP message.
+ */
+ if (DB_TYPE(mp) == M_CTL) {
+ if (MBLKL(mp) == sizeof (in_pktinfo_t) &&
+ ((in_pktinfo_t *)mp->b_rptr)->in_pkt_ulp_type ==
+ IN_PKTINFO) {
/*
- * IP should have prepended the options data in an M_CTL
- * Check M_CTL "type" to make sure are not here bcos of
- * a valid ICMP message
+			 * IP_RECVIF or IP_RECVSLLA information has been
+			 * prepended to the packet by IP; extract the
+			 * data mblk and adjust the rptr.
*/
- if (mp_size == sizeof (in_pktinfo_t) &&
- ((in_pktinfo_t *)mp->b_rptr)->in_pkt_ulp_type ==
- IN_PKTINFO) {
- pinfo = (in_pktinfo_t *)mp->b_rptr;
- /*
- * Jump to normal data processing, this is not
- * an ICMP message
- */
- break;
- }
+ pinfo = (in_pktinfo_t *)mp->b_rptr;
+ options_mp = mp;
+ mp = mp->b_cont;
+ rptr = mp->b_rptr;
+ UDP_STAT(udp_in_pktinfo);
+ } else {
+ /*
+ * ICMP messages.
+ */
+ udp_icmp_error(q, mp);
+ TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END,
+ "udp_rput_end: q %p (%S)", q, "m_ctl");
+ return;
}
- /*
- * ICMP messages.
- */
- udp_icmp_error(q, mp);
- TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END,
- "udp_rput_end: q %p (%S)", q, "m_ctl");
- return;
- default:
- putnext(q, mp);
- TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END,
- "udp_rput_end: q %p (%S)", q, "default");
- return;
}
- /*
- * If we are here bcos the IP_RECVIF or IP_RECVSLLA then we need to
- * extract the mblk and adjust the rptr
- */
- if (pinfo != NULL) {
- ASSERT(mp->b_datap->db_type == M_CTL);
- options_mp = mp;
- mp = mp->b_cont;
- rptr = mp->b_rptr;
- mp_size = MBLKL(mp);
- }
+ mp_len = msgdsize(mp);
/*
* This is the inbound data path.
* First, we check to make sure the IP version number is correct,
* and then pull the IP and UDP headers into the first mblk.
- */
- /*
* Assume IP provides aligned packets - otherwise toss.
* Also, check if we have a complete IP header.
*/
- if (!OK_32PTR(rptr) || (mp_size < sizeof (ipha_t))) {
-tossit:
- freemsg(mp);
- if (options_mp != NULL)
- freeb(options_mp);
- BUMP_MIB(&udp_mib, udpInErrors);
- return;
- }
/* Initialize regardless if ipversion is IPv4 or IPv6 */
ipp.ipp_fields = 0;
@@ -3567,10 +4195,9 @@ tossit:
ipversion = IPH_HDR_VERSION(rptr);
switch (ipversion) {
case IPV4_VERSION:
+ ASSERT(MBLKL(mp) >= sizeof (ipha_t));
+ ASSERT(((ipha_t *)rptr)->ipha_protocol == IPPROTO_UDP);
hdr_length = IPH_HDR_LENGTH(rptr) + UDPH_SIZE;
- /* Verify this is a UDP packet */
- if (((ipha_t *)rptr)->ipha_protocol != IPPROTO_UDP)
- goto tossit;
if ((hdr_length > IP_SIMPLE_HDR_LENGTH + UDPH_SIZE) ||
(udp->udp_ip_rcv_options_len)) {
/*
@@ -3587,7 +4214,7 @@ tossit:
* the packet.
*/
udpha = (udpha_t *)(rptr + (hdr_length - UDPH_SIZE));
- if (msgdsize(mp) != (ntohs(udpha->uha_length) +
+ if (mp_len != (ntohs(udpha->uha_length) +
hdr_length - UDPH_SIZE)) {
goto tossit;
}
@@ -3597,14 +4224,16 @@ tossit:
*/
if (pinfo != NULL)
mp = options_mp;
- qwriter(q, mp, udp_rput_other, PERIM_INNER);
+ udp_become_writer(connp, mp, udp_rput_other_wrapper,
+ SQTAG_UDP_INPUT);
TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END,
"udp_rput_end: q %p (%S)", q, "end");
return;
}
/* Handle IPV6_RECVHOPLIMIT. */
- if ((udp->udp_family == AF_INET6) && (pinfo != NULL)) {
+ if ((udp->udp_family == AF_INET6) && (pinfo != NULL) &&
+ udp->udp_ipv6_recvpktinfo) {
if (pinfo->in_pkt_flags & IPF_RECVIF) {
ipp.ipp_fields |= IPPF_IFINDEX;
ipp.ipp_ifindex = pinfo->in_pkt_ifindex;
@@ -3620,8 +4249,7 @@ tossit:
ASSERT(udp->udp_family == AF_INET6);
ip6h = (ip6_t *)rptr;
- if ((uchar_t *)&ip6h[1] > mp->b_wptr)
- goto tossit;
+ ASSERT((uchar_t *)&ip6h[1] <= mp->b_wptr);
if (ip6h->ip6_nxt != IPPROTO_UDP) {
uint8_t nexthdrp;
@@ -3647,6 +4275,7 @@ tossit:
if (MBLKL(mp) < (IPV6_HDR_LEN + UDPH_SIZE))
goto tossit;
ip6h = (ip6_t *)rptr;
+ mp_len = msgdsize(mp);
}
/*
* Find any potentially interesting extension headers
@@ -3655,18 +4284,14 @@ tossit:
*/
hdr_length = ip_find_hdr_v6(mp, ip6h, &ipp, &nexthdrp) +
UDPH_SIZE;
- /* Verify this is a UDP packet */
- if (nexthdrp != IPPROTO_UDP)
- goto tossit;
+ ASSERT(nexthdrp == IPPROTO_UDP);
} else {
hdr_length = IPV6_HDR_LEN + UDPH_SIZE;
ip6i = NULL;
}
break;
default:
- TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END,
- "udp_rput_end: q %p (%S)", q, "Unknown IP version");
- goto tossit;
+ ASSERT(0);
}
/*
@@ -3677,14 +4302,15 @@ tossit:
*/
udpha = (udpha_t *)(rptr + (hdr_length - UDPH_SIZE));
if ((MBLKL(mp) < hdr_length) ||
- (msgdsize(mp) != (ntohs(udpha->uha_length) +
- hdr_length - UDPH_SIZE))) {
+ (mp_len != (ntohs(udpha->uha_length) + hdr_length - UDPH_SIZE))) {
goto tossit;
}
/* Walk past the headers. */
- if (!udp->udp_rcvhdr)
+ if (!udp->udp_rcvhdr) {
mp->b_rptr = rptr + hdr_length;
+ mp_len -= hdr_length;
+ }
/*
* This is the inbound data path. Packets are passed upstream as
@@ -3706,6 +4332,7 @@ tossit:
if (udp->udp_recvdstaddr) {
udi_size += sizeof (struct T_opthdr) +
sizeof (struct in_addr);
+ UDP_STAT(udp_in_recvdstaddr);
}
/*
@@ -3714,25 +4341,28 @@ tossit:
*/
if (udp->udp_recvif && (pinfo != NULL) &&
(pinfo->in_pkt_flags & IPF_RECVIF)) {
- udi_size += sizeof (struct T_opthdr) +
- sizeof (uint_t);
+ udi_size += sizeof (struct T_opthdr) + sizeof (uint_t);
+ UDP_STAT(udp_in_recvif);
}
if (udp->udp_recvslla && (pinfo != NULL) &&
(pinfo->in_pkt_flags & IPF_RECVSLLA)) {
udi_size += sizeof (struct T_opthdr) +
- sizeof (struct sockaddr_dl);
+ sizeof (struct sockaddr_dl);
+ UDP_STAT(udp_in_recvslla);
}
if (udp->udp_recvucred && (cr = DB_CRED(mp)) != NULL) {
udi_size += sizeof (struct T_opthdr) + ucredsize;
cpid = DB_CPID(mp);
+ UDP_STAT(udp_in_recvucred);
}
/*
* If IP_RECVTTL is set allocate the appropriate sized buffer
*/
if (udp->udp_recvttl) {
udi_size += sizeof (struct T_opthdr) + sizeof (uint8_t);
+ UDP_STAT(udp_in_recvttl);
}
ASSERT(IPH_HDR_LENGTH((ipha_t *)rptr) == IP_SIMPLE_HDR_LENGTH);
@@ -3889,12 +4519,14 @@ tossit:
(ipp.ipp_fields & IPPF_HOPOPTS)) {
udi_size += sizeof (struct T_opthdr) +
ipp.ipp_hopoptslen;
+ UDP_STAT(udp_in_recvhopopts);
}
if ((udp->udp_ipv6_recvdstopts ||
udp->udp_old_ipv6_recvdstopts) &&
(ipp.ipp_fields & IPPF_DSTOPTS)) {
udi_size += sizeof (struct T_opthdr) +
ipp.ipp_dstoptslen;
+ UDP_STAT(udp_in_recvdstopts);
}
if (((udp->udp_ipv6_recvdstopts &&
udp->udp_ipv6_recvrthdr &&
@@ -3903,29 +4535,37 @@ tossit:
(ipp.ipp_fields & IPPF_RTDSTOPTS)) {
udi_size += sizeof (struct T_opthdr) +
ipp.ipp_rtdstoptslen;
+ UDP_STAT(udp_in_recvrtdstopts);
}
if (udp->udp_ipv6_recvrthdr &&
(ipp.ipp_fields & IPPF_RTHDR)) {
udi_size += sizeof (struct T_opthdr) +
ipp.ipp_rthdrlen;
+ UDP_STAT(udp_in_recvrthdr);
}
if (udp->udp_ipv6_recvpktinfo &&
(ipp.ipp_fields & IPPF_IFINDEX)) {
udi_size += sizeof (struct T_opthdr) +
sizeof (struct in6_pktinfo);
+ UDP_STAT(udp_in_recvpktinfo);
}
}
if (udp->udp_recvucred && (cr = DB_CRED(mp)) != NULL) {
udi_size += sizeof (struct T_opthdr) + ucredsize;
cpid = DB_CPID(mp);
+ UDP_STAT(udp_in_recvucred);
}
- if (udp->udp_ipv6_recvhoplimit)
+ if (udp->udp_ipv6_recvhoplimit) {
udi_size += sizeof (struct T_opthdr) + sizeof (int);
+ UDP_STAT(udp_in_recvhoplimit);
+ }
- if (udp->udp_ipv6_recvtclass)
+ if (udp->udp_ipv6_recvtclass) {
udi_size += sizeof (struct T_opthdr) + sizeof (int);
+ UDP_STAT(udp_in_recvtclass);
+ }
mp1 = allocb(udi_size, BPRI_MED);
if (mp1 == NULL) {
@@ -3960,7 +4600,7 @@ tossit:
sin6->sin6_flowinfo = 0;
sin6->sin6_scope_id = 0;
sin6->__sin6_src_id = ip_srcid_find_addr(&v6dst,
- udp->udp_zoneid);
+ connp->conn_zoneid);
} else {
sin6->sin6_addr = ip6h->ip6_src;
/* No sin6_flowinfo per API */
@@ -3971,8 +4611,8 @@ tossit:
sin6->sin6_scope_id = ipp.ipp_ifindex;
else
sin6->sin6_scope_id = 0;
- sin6->__sin6_src_id =
- ip_srcid_find_addr(&ip6h->ip6_dst, udp->udp_zoneid);
+ sin6->__sin6_src_id = ip_srcid_find_addr(
+ &ip6h->ip6_dst, connp->conn_zoneid);
}
sin6->sin6_port = udpha->uha_src_port;
sin6->sin6_family = udp->udp_family;
@@ -4133,7 +4773,45 @@ tossit:
"udp_rput_end: q %p (%S)", q, "end");
if (options_mp != NULL)
freeb(options_mp);
- putnext(q, mp);
+
+ if (udp->udp_direct_sockfs) {
+ /*
+ * There is nothing above us except for the stream head;
+ * use the read-side synchronous stream interface in
+ * order to reduce the time spent in interrupt thread.
+ */
+ ASSERT(udp->udp_issocket);
+ udp_rcv_enqueue(UDP_RD(q), udp, mp, mp_len);
+ } else {
+ /*
+ * Use regular STREAMS interface to pass data upstream
+ * if this is not a socket endpoint, or if we have
+ * switched over to the slow mode due to sockmod being
+ * popped or a module being pushed on top of us.
+ */
+ putnext(UDP_RD(q), mp);
+ }
+ return;
+
+tossit:
+ freemsg(mp);
+ if (options_mp != NULL)
+ freeb(options_mp);
+ BUMP_MIB(&udp_mib, udpInErrors);
+}
+
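+/*
+ * Entry point from IP's fanout code: enter the conn_t perimeter
+ * before handing the inbound datagram to udp_input().
+ */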
+void
+udp_conn_recv(conn_t *connp, mblk_t *mp)
+{
+ _UDP_ENTER(connp, mp, udp_input_wrapper, SQTAG_UDP_FANOUT);
+}
+
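+/*
+ * Squeue callback that invokes udp_input() and then exits the
+ * conn_t perimeter.
+ */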
+/* ARGSUSED */
+static void
+udp_input_wrapper(void *arg, mblk_t *mp, void *arg2)
+{
+ udp_input((conn_t *)arg, mp);
+ _UDP_EXIT((conn_t *)arg);
}
/*
@@ -4152,18 +4830,17 @@ udp_rput_other(queue_t *q, mblk_t *mp)
int opt_len; /* Length of IP options */
sin_t *sin;
struct T_error_ack *tea;
- udp_t *udp;
mblk_t *options_mp = NULL;
in_pktinfo_t *pinfo;
boolean_t recv_on = B_FALSE;
cred_t *cr = NULL;
+ udp_t *udp = Q_TO_UDP(q);
pid_t cpid;
TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_START,
"udp_rput_other: q %p mp %p", q, mp);
ASSERT(OK_32PTR(mp->b_rptr));
- udp = (udp_t *)q->q_ptr;
rptr = mp->b_rptr;
switch (mp->b_datap->db_type) {
@@ -4258,7 +4935,7 @@ udp_rput_other(queue_t *q, mblk_t *mp)
freemsg(mp);
return;
}
- putnext(q, mp);
+ putnext(UDP_RD(q), mp);
return;
}
@@ -4323,9 +5000,12 @@ udp_rput_other(queue_t *q, mblk_t *mp)
udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t);
if (udp->udp_recvdstaddr) {
udi_size += sizeof (struct T_opthdr) + sizeof (struct in_addr);
+ UDP_STAT(udp_in_recvdstaddr);
}
- if (udp->udp_recvopts && opt_len > 0)
+ if (udp->udp_recvopts && opt_len > 0) {
udi_size += sizeof (struct T_opthdr) + opt_len;
+ UDP_STAT(udp_in_recvopts);
+ }
/*
* If the IP_RECVSLLA or the IP_RECVIF is set then allocate
@@ -4333,25 +5013,28 @@ udp_rput_other(queue_t *q, mblk_t *mp)
*/
if (udp->udp_recvif && recv_on &&
(pinfo->in_pkt_flags & IPF_RECVIF)) {
- udi_size += sizeof (struct T_opthdr) +
- sizeof (uint_t);
+ udi_size += sizeof (struct T_opthdr) + sizeof (uint_t);
+ UDP_STAT(udp_in_recvif);
}
if (udp->udp_recvslla && recv_on &&
(pinfo->in_pkt_flags & IPF_RECVSLLA)) {
udi_size += sizeof (struct T_opthdr) +
sizeof (struct sockaddr_dl);
+ UDP_STAT(udp_in_recvslla);
}
if (udp->udp_recvucred && (cr = DB_CRED(mp)) != NULL) {
udi_size += sizeof (struct T_opthdr) + ucredsize;
cpid = DB_CPID(mp);
+ UDP_STAT(udp_in_recvucred);
}
/*
* If IP_RECVTTL is set allocate the appropriate sized buffer
*/
if (udp->udp_recvttl) {
udi_size += sizeof (struct T_opthdr) + sizeof (uint8_t);
+ UDP_STAT(udp_in_recvttl);
}
/* Allocate a message block for the T_UNITDATA_IND structure. */
@@ -4502,7 +5185,34 @@ udp_rput_other(queue_t *q, mblk_t *mp)
"udp_rput_other_end: q %p (%S)", q, "end");
if (options_mp != NULL)
freeb(options_mp);
- putnext(q, mp);
+
+ if (udp->udp_direct_sockfs) {
+ /*
+ * There is nothing above us except for the stream head;
+ * use the read-side synchronous stream interface in
+		 * order to reduce the time spent in the interrupt thread.
+ */
+ ASSERT(udp->udp_issocket);
+ udp_rcv_enqueue(UDP_RD(q), udp, mp, msgdsize(mp));
+ } else {
+ /*
+ * Use regular STREAMS interface to pass data upstream
+ * if this is not a socket endpoint, or if we have
+ * switched over to the slow mode due to sockmod being
+ * popped or a module being pushed on top of us.
+ */
+ putnext(UDP_RD(q), mp);
+ }
+}
+
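+/*
+ * Squeue wrapper for udp_rput_other(); runs once this thread has
+ * become exclusive (writer) on the conn_t.
+ */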
+/* ARGSUSED */
+static void
+udp_rput_other_wrapper(void *arg, mblk_t *mp, void *arg2)
+{
+ conn_t *connp = arg;
+
+ udp_rput_other(connp->conn_rq, mp);
+ udp_exit(connp);
}
/*
@@ -4511,7 +5221,7 @@ udp_rput_other(queue_t *q, mblk_t *mp)
static void
udp_rput_bind_ack(queue_t *q, mblk_t *mp)
{
- udp_t *udp = (udp_t *)q->q_ptr;
+ udp_t *udp = Q_TO_UDP(q);
mblk_t *mp1;
ire_t *ire;
struct T_bind_ack *tba;
@@ -4602,20 +5312,20 @@ udp_rput_bind_ack(queue_t *q, mblk_t *mp)
while (mp != NULL) {
mp1 = mp->b_cont;
mp->b_cont = NULL;
- putnext(q, mp);
+ putnext(UDP_RD(q), mp);
mp = mp1;
}
return;
}
freemsg(mp->b_cont);
mp->b_cont = NULL;
- putnext(q, mp);
+ putnext(UDP_RD(q), mp);
}
/*
* return SNMP stuff in buffer in mpdata
*/
-static int
+int
udp_snmp_get(queue_t *q, mblk_t *mpctl)
{
mblk_t *mpdata;
@@ -4626,12 +5336,14 @@ udp_snmp_get(queue_t *q, mblk_t *mpctl)
mblk_t *mp_conn_tail = NULL;
mblk_t *mp6_conn_tail = NULL;
struct opthdr *optp;
- IDP idp;
- udp_t *udp;
mib2_udpEntry_t ude;
mib2_udp6Entry_t ude6;
int state;
zoneid_t zoneid;
+ int i;
+ connf_t *connfp;
+ conn_t *connp = Q_TO_CONN(q);
+ udp_t *udp = connp->conn_udp;
if (mpctl == NULL ||
(mpdata = mpctl->b_cont) == NULL ||
@@ -4644,8 +5356,7 @@ udp_snmp_get(queue_t *q, mblk_t *mpctl)
mp_conn_data = mp_conn_ctl->b_cont;
mp6_conn_data = mp6_conn_ctl->b_cont;
- udp = (udp_t *)q->q_ptr;
- zoneid = udp->udp_zoneid;
+ zoneid = connp->conn_zoneid;
/* fixed length structure for IPv4 and IPv6 counters */
SET_MIB(udp_mib.udpEntrySize, sizeof (mib2_udpEntry_t));
@@ -4657,76 +5368,88 @@ udp_snmp_get(queue_t *q, mblk_t *mpctl)
optp->len = msgdsize(mpdata);
qreply(q, mpctl);
- mutex_enter(&udp_g_lock);
- for (idp = mi_first_ptr(&udp_g_head);
- (udp = (udp_t *)idp) != 0;
- idp = mi_next_ptr(&udp_g_head, idp)) {
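+	/* Walk the global conn hash and report each UDP endpoint */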
+ for (i = 0; i < CONN_G_HASH_SIZE; i++) {
+ connfp = &ipcl_globalhash_fanout[i];
+ connp = NULL;
- if (zoneid != udp->udp_zoneid)
- continue;
+ while ((connp = ipcl_get_next_conn(connfp, connp,
+ IPCL_UDP))) {
+ udp = connp->conn_udp;
+ if (zoneid != connp->conn_zoneid)
+ continue;
- /* Note that the port numbers are sent in host byte order */
+ /*
+ * Note that the port numbers are sent in
+ * host byte order
+ */
- if (udp->udp_state == TS_UNBND)
- state = MIB2_UDP_unbound;
- else if (udp->udp_state == TS_IDLE)
- state = MIB2_UDP_idle;
- else if (udp->udp_state == TS_DATA_XFER)
- state = MIB2_UDP_connected;
- else
- state = MIB2_UDP_unknown;
+ if (udp->udp_state == TS_UNBND)
+ state = MIB2_UDP_unbound;
+ else if (udp->udp_state == TS_IDLE)
+ state = MIB2_UDP_idle;
+ else if (udp->udp_state == TS_DATA_XFER)
+ state = MIB2_UDP_connected;
+ else
+ state = MIB2_UDP_unknown;
- /*
- * Create an IPv4 table entry for IPv4 entries and also
- * any IPv6 entries which are bound to in6addr_any
- * (i.e. anything a IPv4 peer could connect/send to).
- */
- if (udp->udp_ipversion == IPV4_VERSION ||
- (udp->udp_state <= TS_IDLE &&
- IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src))) {
- ude.udpEntryInfo.ue_state = state;
- /* If in6addr_any this will set it to INADDR_ANY */
- ude.udpLocalAddress = V4_PART_OF_V6(udp->udp_v6src);
- ude.udpLocalPort = ntohs(udp->udp_port);
- if (udp->udp_state == TS_DATA_XFER) {
+ /*
+ * Create an IPv4 table entry for IPv4 entries and also
+ * any IPv6 entries which are bound to in6addr_any
+			 * (i.e. anything an IPv4 peer could connect/send to).
+ */
+ if (udp->udp_ipversion == IPV4_VERSION ||
+ (udp->udp_state <= TS_IDLE &&
+ IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src))) {
+ ude.udpEntryInfo.ue_state = state;
/*
- * Can potentially get here for v6 socket
- * if another process (say, ping) has just
- * done a sendto(), changing the state
- * from the TS_IDLE above to TS_DATA_XFER
- * by the time we hit this part of the code.
+ * If in6addr_any this will set it to
+ * INADDR_ANY
*/
- ude.udpEntryInfo.ue_RemoteAddress =
- V4_PART_OF_V6(udp->udp_v6dst);
- ude.udpEntryInfo.ue_RemotePort =
- ntohs(udp->udp_dstport);
- } else {
- ude.udpEntryInfo.ue_RemoteAddress = 0;
- ude.udpEntryInfo.ue_RemotePort = 0;
+ ude.udpLocalAddress =
+ V4_PART_OF_V6(udp->udp_v6src);
+ ude.udpLocalPort = ntohs(udp->udp_port);
+ if (udp->udp_state == TS_DATA_XFER) {
+ /*
+ * Can potentially get here for
+ * v6 socket if another process
+ * (say, ping) has just done a
+ * sendto(), changing the state
+ * from the TS_IDLE above to
+ * TS_DATA_XFER by the time we hit
+ * this part of the code.
+ */
+ ude.udpEntryInfo.ue_RemoteAddress =
+ V4_PART_OF_V6(udp->udp_v6dst);
+ ude.udpEntryInfo.ue_RemotePort =
+ ntohs(udp->udp_dstport);
+ } else {
+ ude.udpEntryInfo.ue_RemoteAddress = 0;
+ ude.udpEntryInfo.ue_RemotePort = 0;
+ }
+ (void) snmp_append_data2(mp_conn_data,
+ &mp_conn_tail, (char *)&ude, sizeof (ude));
}
- (void) snmp_append_data2(mp_conn_data, &mp_conn_tail,
- (char *)&ude, sizeof (ude));
- }
- if (udp->udp_ipversion == IPV6_VERSION) {
- ude6.udp6EntryInfo.ue_state = state;
- ude6.udp6LocalAddress = udp->udp_v6src;
- ude6.udp6LocalPort = ntohs(udp->udp_port);
- ude6.udp6IfIndex = udp->udp_bound_if;
- if (udp->udp_state == TS_DATA_XFER) {
- ude6.udp6EntryInfo.ue_RemoteAddress =
- udp->udp_v6dst;
- ude6.udp6EntryInfo.ue_RemotePort =
- ntohs(udp->udp_dstport);
- } else {
- ude6.udp6EntryInfo.ue_RemoteAddress =
- sin6_null.sin6_addr;
- ude6.udp6EntryInfo.ue_RemotePort = 0;
+ if (udp->udp_ipversion == IPV6_VERSION) {
+ ude6.udp6EntryInfo.ue_state = state;
+ ude6.udp6LocalAddress = udp->udp_v6src;
+ ude6.udp6LocalPort = ntohs(udp->udp_port);
+ ude6.udp6IfIndex = udp->udp_bound_if;
+ if (udp->udp_state == TS_DATA_XFER) {
+ ude6.udp6EntryInfo.ue_RemoteAddress =
+ udp->udp_v6dst;
+ ude6.udp6EntryInfo.ue_RemotePort =
+ ntohs(udp->udp_dstport);
+ } else {
+ ude6.udp6EntryInfo.ue_RemoteAddress =
+ sin6_null.sin6_addr;
+ ude6.udp6EntryInfo.ue_RemotePort = 0;
+ }
+ (void) snmp_append_data2(mp6_conn_data,
+ &mp6_conn_tail, (char *)&ude6,
+ sizeof (ude6));
}
- (void) snmp_append_data2(mp6_conn_data, &mp6_conn_tail,
- (char *)&ude6, sizeof (ude6));
}
}
- mutex_exit(&udp_g_lock);
/* IPv4 UDP endpoints */
optp = (struct opthdr *)&mp_conn_ctl->b_rptr[
@@ -4754,7 +5477,7 @@ udp_snmp_get(queue_t *q, mblk_t *mpctl)
* to do the appropriate locking.
*/
/* ARGSUSED */
-static int
+int
udp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name,
uchar_t *ptr, int len)
{
@@ -4789,7 +5512,7 @@ udp_report_item(mblk_t *mp, udp_t *udp)
state = "UnkState";
print_len = snprintf((char *)mp->b_wptr, buf_len,
MI_COL_PTRFMT_STR "%4d %5u %s %s %5u %s\n",
- (void *)udp, udp->udp_zoneid, ntohs(udp->udp_port),
+ (void *)udp, udp->udp_connp->conn_zoneid, ntohs(udp->udp_port),
inet_ntop(AF_INET6, &udp->udp_v6src,
addrbuf1, sizeof (addrbuf1)),
inet_ntop(AF_INET6, &udp->udp_v6dst,
@@ -4807,9 +5530,11 @@ udp_report_item(mblk_t *mp, udp_t *udp)
static int
udp_status_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
{
- IDP idp;
- udp_t *udp;
zoneid_t zoneid;
+ connf_t *connfp;
+ conn_t *connp = Q_TO_CONN(q);
+ udp_t *udp = connp->conn_udp;
+ int i;
/*
* Because of the ndd constraint, at most we can have 64K buffer
@@ -4837,21 +5562,22 @@ udp_status_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
" zone lport src addr dest addr port state");
/* 1234 12345 xxx.xxx.xxx.xxx xxx.xxx.xxx.xxx 12345 UNBOUND */
- udp = (udp_t *)q->q_ptr;
- zoneid = udp->udp_zoneid;
+ zoneid = connp->conn_zoneid;
- mutex_enter(&udp_g_lock);
- for (idp = mi_first_ptr(&udp_g_head);
- (udp = (udp_t *)idp) != 0;
- idp = mi_next_ptr(&udp_g_head, idp)) {
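+	/* Walk the global conn hash, reporting endpoints visible in this zone */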
+ for (i = 0; i < CONN_G_HASH_SIZE; i++) {
+ connfp = &ipcl_globalhash_fanout[i];
+ connp = NULL;
- if (zoneid != GLOBAL_ZONEID &&
- zoneid != udp->udp_zoneid)
- continue;
+ while ((connp = ipcl_get_next_conn(connfp, connp,
+ IPCL_UDP))) {
+ udp = connp->conn_udp;
+ if (zoneid != GLOBAL_ZONEID &&
+ zoneid != connp->conn_zoneid)
+ continue;
- udp_report_item(mp->b_cont, udp);
+ udp_report_item(mp->b_cont, udp);
+ }
}
- mutex_exit(&udp_g_lock);
udp_last_ndd_get_info_time = ddi_get_lbolt();
return (0);
}
@@ -4862,32 +5588,44 @@ udp_status_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
* passed in mp. This message is freed.
*/
static void
-udp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err)
+udp_ud_err(queue_t *q, mblk_t *mp, uchar_t *destaddr, t_scalar_t destlen,
+ t_scalar_t err)
{
+ struct T_unitdata_req *tudr;
mblk_t *mp1;
- struct T_unitdata_req *tudr = (struct T_unitdata_req *)mp->b_rptr;
- uchar_t *destaddr, *optaddr;
+ uchar_t *optaddr;
+ t_scalar_t optlen;
- if ((mp->b_wptr < mp->b_rptr) ||
- (mp->b_wptr - mp->b_rptr) < sizeof (struct T_unitdata_req)) {
- goto done;
- }
- destaddr = mp->b_rptr + tudr->DEST_offset;
- if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr ||
- destaddr + tudr->DEST_length < mp->b_rptr ||
- destaddr + tudr->DEST_length > mp->b_wptr) {
- goto done;
- }
- optaddr = mp->b_rptr + tudr->OPT_offset;
- if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr ||
- optaddr + tudr->OPT_length < mp->b_rptr ||
- optaddr + tudr->OPT_length > mp->b_wptr) {
- goto done;
+ if (DB_TYPE(mp) == M_DATA) {
+ ASSERT(destaddr != NULL && destlen != 0);
+ optaddr = NULL;
+ optlen = 0;
+ } else {
+ if ((mp->b_wptr < mp->b_rptr) ||
+ (MBLKL(mp)) < sizeof (struct T_unitdata_req)) {
+ goto done;
+ }
+ tudr = (struct T_unitdata_req *)mp->b_rptr;
+ destaddr = mp->b_rptr + tudr->DEST_offset;
+ if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr ||
+ destaddr + tudr->DEST_length < mp->b_rptr ||
+ destaddr + tudr->DEST_length > mp->b_wptr) {
+ goto done;
+ }
+ optaddr = mp->b_rptr + tudr->OPT_offset;
+ if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr ||
+ optaddr + tudr->OPT_length < mp->b_rptr ||
+ optaddr + tudr->OPT_length > mp->b_wptr) {
+ goto done;
+ }
+ destlen = tudr->DEST_length;
+ optlen = tudr->OPT_length;
}
- mp1 = mi_tpi_uderror_ind((char *)destaddr, tudr->DEST_length,
- (char *)optaddr, tudr->OPT_length, err);
- if (mp1)
- qreply(q, mp1);
+
+ mp1 = mi_tpi_uderror_ind((char *)destaddr, destlen,
+ (char *)optaddr, optlen, err);
+ if (mp1 != NULL)
+ putnext(UDP_RD(q), mp1);
done:
freemsg(mp);
@@ -4900,9 +5638,8 @@ done:
static void
udp_unbind(queue_t *q, mblk_t *mp)
{
- udp_t *udp;
+ udp_t *udp = Q_TO_UDP(q);
- udp = (udp_t *)q->q_ptr;
/* If a bind has not been done, we can't unbind. */
if (udp->udp_state == TS_UNBND) {
udp_err_ack(q, mp, TOUTSTATE, 0);
@@ -4939,8 +5676,13 @@ udp_unbind(queue_t *q, mblk_t *mp)
return;
}
}
- /* Pass the unbind to IP */
- putnext(q, mp);
+ /*
+	 * Pass the unbind to IP; since T_UNBIND_REQ is larger than
+	 * T_OK_ACK, ip_unbind can reuse this mblk for the ack and
+	 * must never return NULL.
+ */
+ mp = ip_unbind(q, mp);
+ ASSERT(mp != NULL);
+ putnext(UDP_RD(q), mp);
}
/*
@@ -4994,193 +5736,47 @@ retry:
return (port);
}
-/*
- * This routine handles all messages passed downstream. It either
- * consumes the message or passes it downstream; it never queues a
- * a message.
- */
-static void
-udp_wput(queue_t *q, mblk_t *mp)
+static mblk_t *
+udp_output_v4(conn_t *connp, mblk_t *mp, ipaddr_t v4dst, uint16_t port,
+ uint_t srcid, int *error)
{
- uchar_t *rptr = mp->b_rptr;
- struct datab *db;
- ipha_t *ipha;
- udpha_t *udpha;
- mblk_t *mp1;
- int ip_hdr_length;
-#define tudr ((struct T_unitdata_req *)rptr)
- uint32_t ip_len;
- udp_t *udp;
- sin6_t *sin6;
- sin_t *sin;
- ipaddr_t v4dst;
- uint16_t port;
- uint_t srcid;
-
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_START,
- "udp_wput_start: q %p mp %p", q, mp);
-
- db = mp->b_datap;
- switch (db->db_type) {
- case M_PROTO:
- case M_PCPROTO:
- ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX);
- if (mp->b_wptr - rptr >= sizeof (struct T_unitdata_req)) {
- /* Detect valid T_UNITDATA_REQ here */
- if (((union T_primitives *)rptr)->type
- == T_UNITDATA_REQ)
- break;
- }
- /* FALLTHRU */
- default:
- qwriter(q, mp, udp_wput_other, PERIM_INNER);
- return;
- }
-
- udp = (udp_t *)q->q_ptr;
-
- /* Handle UNITDATA_REQ messages here */
- if (udp->udp_state == TS_UNBND) {
- /* If a port has not been bound to the stream, fail. */
- BUMP_MIB(&udp_mib, udpOutErrors);
- udp_ud_err(q, mp, EPROTO);
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
- "udp_wput_end: q %p (%S)", q, "outstate");
- return;
- }
- mp1 = mp->b_cont;
- if (mp1 == NULL) {
- BUMP_MIB(&udp_mib, udpOutErrors);
- udp_ud_err(q, mp, EPROTO);
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
- "udp_wput_end: q %p (%S)", q, "badaddr");
- return;
- }
-
- if ((rptr + tudr->DEST_offset + tudr->DEST_length) > mp->b_wptr) {
- BUMP_MIB(&udp_mib, udpOutErrors);
- udp_ud_err(q, mp, EADDRNOTAVAIL);
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
- "udp_wput_end: q %p (%S)", q, "badaddr");
- return;
- }
-
- switch (udp->udp_family) {
- case AF_INET6:
- sin6 = (sin6_t *)&rptr[tudr->DEST_offset];
- if (!OK_32PTR((char *)sin6) ||
- tudr->DEST_length != sizeof (sin6_t) ||
- sin6->sin6_family != AF_INET6) {
- BUMP_MIB(&udp_mib, udpOutErrors);
- udp_ud_err(q, mp, EADDRNOTAVAIL);
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
- "udp_wput_end: q %p (%S)", q, "badaddr");
- return;
- }
-
- if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
- /*
- * Destination is a non-IPv4-compatible IPv6 address.
- * Send out an IPv6 format packet.
- */
- udp_wput_ipv6(q, mp, sin6, tudr->OPT_length);
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
- "udp_wput_end: q %p (%S)", q, "udp_wput_ipv6");
- return;
- }
- /*
- * If the local address is not zero or a mapped address return
- * an error.
- * I would be possible to send an IPv4 packet but the
- * response would never make it back to the application
- * since it is bound to a non-mapped address.
- */
- if (!IN6_IS_ADDR_V4MAPPED(&udp->udp_v6src) &&
- !IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)) {
- BUMP_MIB(&udp_mib, udpOutErrors);
- udp_ud_err(q, mp, EADDRNOTAVAIL);
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
- "udp_wput_end: q %p (%S)", q, "badaddr");
- return;
- }
- /* Send IPv4 packet without modifying udp_ipversion */
- /* Extract port and ipaddr */
- port = sin6->sin6_port;
- IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, v4dst);
- srcid = sin6->__sin6_src_id;
- break;
-
- case AF_INET:
- sin = (sin_t *)&rptr[tudr->DEST_offset];
- if (!OK_32PTR((char *)sin) ||
- tudr->DEST_length != sizeof (sin_t) ||
- sin->sin_family != AF_INET) {
- BUMP_MIB(&udp_mib, udpOutErrors);
- udp_ud_err(q, mp, EADDRNOTAVAIL);
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
- "udp_wput_end: q %p (%S)", q, "badaddr");
- return;
- }
- /* Extract port and ipaddr */
- port = sin->sin_port;
- v4dst = sin->sin_addr.s_addr;
- srcid = 0;
- break;
- }
+ udp_t *udp = connp->conn_udp;
+ queue_t *q = connp->conn_wq;
+ mblk_t *mp1 = (DB_TYPE(mp) == M_DATA ? mp : mp->b_cont);
+ mblk_t *mp2;
+ ipha_t *ipha;
+ int ip_hdr_length;
+ uint32_t ip_len;
+ udpha_t *udpha;
+ *error = 0;
- /*
- * If options passed in, feed it for verification and handling
- */
- if (tudr->OPT_length != 0) {
- int error;
-
- if (udp_unitdata_opt_process(q, mp, &error, NULL) < 0) {
- /* failure */
- BUMP_MIB(&udp_mib, udpOutErrors);
- udp_ud_err(q, mp, error);
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
- "udp_wput_end: q %p (%S)", q,
- "udp_unitdata_opt_process");
- return;
- }
- ASSERT(error == 0);
- /*
- * Note: success in processing options.
- * mp option buffer represented by
- * OPT_length/offset now potentially modified
- * and contain option setting results
- */
- }
+ /* mp1 points to the M_DATA mblk carrying the packet */
+ ASSERT(mp1 != NULL && DB_TYPE(mp1) == M_DATA);
/* Add an IP header */
ip_hdr_length = IP_SIMPLE_HDR_LENGTH + UDPH_SIZE +
udp->udp_ip_snd_options_len;
ipha = (ipha_t *)&mp1->b_rptr[-ip_hdr_length];
- if ((mp1->b_datap->db_ref != 1) ||
- ((uchar_t *)ipha < mp1->b_datap->db_base) ||
+ if (DB_REF(mp1) != 1 || (uchar_t *)ipha < DB_BASE(mp1) ||
!OK_32PTR(ipha)) {
- uchar_t *wptr;
-
- mp1 = allocb(ip_hdr_length + udp_wroff_extra, BPRI_LO);
- if (!mp1) {
- BUMP_MIB(&udp_mib, udpOutErrors);
- udp_ud_err(q, mp, ENOMEM);
+ mp2 = allocb(ip_hdr_length + udp_wroff_extra, BPRI_LO);
+ if (mp2 == NULL) {
TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
- "udp_wput_end: q %p (%S)", q, "allocbfail2");
- return;
- }
- mp1->b_cont = mp->b_cont;
- mp->b_cont = mp1;
- wptr = mp1->b_datap->db_lim;
- mp1->b_wptr = wptr;
- ipha = (ipha_t *)(wptr - ip_hdr_length);
- }
- mp1->b_rptr = (uchar_t *)ipha;
-
- ASSERT((uintptr_t)(mp1->b_wptr - (uchar_t *)ipha) <=
- (uintptr_t)UINT_MAX);
+ "udp_wput_end: q %p (%S)", q, "allocbfail2");
+ *error = ENOMEM;
+ goto done;
+ }
+ mp2->b_wptr = DB_LIM(mp2);
+ mp2->b_cont = mp1;
+ mp1 = mp2;
+ if (DB_TYPE(mp) != M_DATA)
+ mp->b_cont = mp1;
+ else
+ mp = mp1;
+ ipha = (ipha_t *)(mp1->b_wptr - ip_hdr_length);
+ }
ip_hdr_length -= UDPH_SIZE;
#ifdef _BIG_ENDIAN
/* Set version, header length, and tos */
@@ -5206,24 +5802,25 @@ udp_wput(queue_t *q, mblk_t *mp)
if (srcid != 0 && ipha->ipha_src == INADDR_ANY) {
in6_addr_t v6src;
- ip_srcid_find_id(srcid, &v6src, udp->udp_zoneid);
+ ip_srcid_find_id(srcid, &v6src, connp->conn_zoneid);
IN6_V4MAPPED_TO_IPADDR(&v6src, ipha->ipha_src);
}
ipha->ipha_fragment_offset_and_flags = 0;
ipha->ipha_ident = 0;
+ mp1->b_rptr = (uchar_t *)ipha;
+
+ ASSERT((uintptr_t)(mp1->b_wptr - (uchar_t *)ipha) <=
+ (uintptr_t)UINT_MAX);
+
/* Determine length of packet */
ip_len = (uint32_t)(mp1->b_wptr - (uchar_t *)ipha);
- {
- mblk_t *mp2;
- if ((mp2 = mp1->b_cont) != NULL) {
- do {
- ASSERT((uintptr_t)(mp2->b_wptr - mp2->b_rptr)
- <= (uintptr_t)UINT_MAX);
- ip_len += (uint32_t)(mp2->b_wptr - mp2->b_rptr);
- } while ((mp2 = mp2->b_cont) != NULL);
- }
+ if ((mp2 = mp1->b_cont) != NULL) {
+ do {
+ ASSERT((uintptr_t)MBLKL(mp2) <= (uintptr_t)UINT_MAX);
+ ip_len += (uint32_t)MBLKL(mp2);
+ } while ((mp2 = mp2->b_cont) != NULL);
}
/*
* If the size of the packet is greater than the maximum allowed by
@@ -5231,19 +5828,18 @@ udp_wput(queue_t *q, mblk_t *mp)
* the size will have wrapped and be inconsistent with the msg size.
*/
if (ip_len > IP_MAXPACKET) {
- BUMP_MIB(&udp_mib, udpOutErrors);
- udp_ud_err(q, mp, EMSGSIZE);
TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
"udp_wput_end: q %p (%S)", q, "IP length exceeded");
- return;
+ *error = EMSGSIZE;
+ goto done;
}
ipha->ipha_length = htons((uint16_t)ip_len);
ip_len -= ip_hdr_length;
ip_len = htons((uint16_t)ip_len);
udpha = (udpha_t *)(((uchar_t *)ipha) + ip_hdr_length);
+
/*
- * Copy in the destination address and port from the T_UNITDATA
- * request
+ * Copy in the destination address
*/
if (v4dst == INADDR_ANY)
ipha->ipha_dst = htonl(INADDR_LOOPBACK);
@@ -5310,41 +5906,648 @@ udp_wput(queue_t *q, mblk_t *mp)
/* Set UDP length and checksum */
*((uint32_t *)&udpha->uha_length) = ip_len;
- freeb(mp);
+ if (DB_TYPE(mp) != M_DATA) {
+ ASSERT(mp != mp1);
+ freeb(mp);
+ }
+
+ /* mp has been consumed and we'll return success */
+ ASSERT(*error == 0);
+ mp = NULL;
/* We're done. Pass the packet to ip. */
BUMP_MIB(&udp_mib, udpOutDatagrams);
TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
"udp_wput_end: q %p (%S)", q, "end");
- putnext(q, mp1);
-#undef tudr
+
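+	/*
+	 * Take the slow path through ip_output() whenever IPsec policy,
+	 * special routing or interface constraints, IP options, local-out
+	 * IPQoS processing or multicast routing apply; otherwise try the
+	 * direct transmit path in udp_send_data().
+	 */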
+ if ((connp->conn_flags & IPCL_CHECK_POLICY) != 0 ||
+ CONN_OUTBOUND_POLICY_PRESENT(connp) ||
+ connp->conn_dontroute || connp->conn_xmit_if_ill != NULL ||
+ connp->conn_nofailover_ill != NULL ||
+ connp->conn_outgoing_ill != NULL ||
+ ipha->ipha_version_and_hdr_length != IP_SIMPLE_HDR_VERSION ||
+ IPP_ENABLED(IPP_LOCAL_OUT) || ip_g_mrouter != NULL) {
+ UDP_STAT(udp_ip_send);
+ ip_output(connp, mp1, connp->conn_wq, IP_WPUT);
+ } else {
+ udp_send_data(udp, connp->conn_wq, mp1, ipha);
+ }
+
+done:
+ if (*error != 0) {
+ ASSERT(mp != NULL);
+ BUMP_MIB(&udp_mib, udpOutErrors);
+ }
+ return (mp);
+}
+
+static void
+udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp, ipha_t *ipha)
+{
+ conn_t *connp = udp->udp_connp;
+ ipaddr_t src, dst;
+ ill_t *ill;
+ ire_t *ire;
+ ipif_t *ipif = NULL;
+ mblk_t *ire_fp_mp;
+ uint_t ire_fp_mp_len;
+ uint16_t *up;
+ uint32_t cksum, hcksum_txflags;
+ queue_t *dev_q;
+ boolean_t retry_caching;
+
+ dst = ipha->ipha_dst;
+ src = ipha->ipha_src;
+ ASSERT(ipha->ipha_ident == 0);
+
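+	/*
+	 * For a multicast destination, find the outgoing ipif chosen
+	 * by the application (e.g. via IP_MULTICAST_IF); fall back to
+	 * the regular IP output path if none is usable.
+	 */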
+ if (CLASSD(dst)) {
+ int err;
+
+ ipif = conn_get_held_ipif(connp,
+ &connp->conn_multicast_ipif, &err);
+
+ if (ipif == NULL || ipif->ipif_isv6 ||
+ (ipif->ipif_ill->ill_phyint->phyint_flags &
+ PHYI_LOOPBACK)) {
+ if (ipif != NULL)
+ ipif_refrele(ipif);
+ UDP_STAT(udp_ip_send);
+ ip_output(connp, mp, q, IP_WPUT);
+ return;
+ }
+ }
+
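+	/*
+	 * Validate the cached IRE: recache if it is missing, points at
+	 * a different destination, is condemned, or (for multicast) is
+	 * bound outside the chosen ipif's interface group.
+	 */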
+ retry_caching = B_FALSE;
+ mutex_enter(&connp->conn_lock);
+ ire = connp->conn_ire_cache;
+ ASSERT(!(connp->conn_state_flags & CONN_INCIPIENT));
+
+ if (ire == NULL || ire->ire_addr != dst ||
+ (ire->ire_marks & IRE_MARK_CONDEMNED)) {
+ retry_caching = B_TRUE;
+ } else if (CLASSD(dst) && (ire->ire_type & IRE_CACHE)) {
+ ill_t *stq_ill = (ill_t *)ire->ire_stq->q_ptr;
+
+ ASSERT(ipif != NULL);
+ if (stq_ill != ipif->ipif_ill && (stq_ill->ill_group == NULL ||
+ stq_ill->ill_group != ipif->ipif_ill->ill_group))
+ retry_caching = B_TRUE;
+ }
+
+ if (!retry_caching) {
+ ASSERT(ire != NULL);
+ IRE_REFHOLD(ire);
+ mutex_exit(&connp->conn_lock);
+ } else {
+ boolean_t cached = B_FALSE;
+
+ connp->conn_ire_cache = NULL;
+ mutex_exit(&connp->conn_lock);
+
+ /* Release the old ire */
+ if (ire != NULL) {
+ IRE_REFRELE_NOTR(ire);
+ ire = NULL;
+ }
+
+ if (CLASSD(dst)) {
+ ASSERT(ipif != NULL);
+ ire = ire_ctable_lookup(dst, 0, 0, ipif,
+ connp->conn_zoneid, MATCH_IRE_ILL_GROUP);
+ } else {
+ ASSERT(ipif == NULL);
+ ire = ire_cache_lookup(dst, connp->conn_zoneid);
+ }
+
+ if (ire == NULL) {
+ if (ipif != NULL)
+ ipif_refrele(ipif);
+ UDP_STAT(udp_ire_null);
+ ip_output(connp, mp, q, IP_WPUT);
+ return;
+ }
+ IRE_REFHOLD_NOTR(ire);
+
+ mutex_enter(&connp->conn_lock);
+ if (!(connp->conn_state_flags & CONN_CLOSING) &&
+ connp->conn_ire_cache == NULL) {
+ rw_enter(&ire->ire_bucket->irb_lock, RW_READER);
+ if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) {
+ connp->conn_ire_cache = ire;
+ cached = B_TRUE;
+ }
+ rw_exit(&ire->ire_bucket->irb_lock);
+ }
+ mutex_exit(&connp->conn_lock);
+
+ /*
+ * We can continue to use the ire but since it was not
+ * cached, we should drop the extra reference.
+ */
+ if (!cached)
+ IRE_REFRELE_NOTR(ire);
+ }
+ ASSERT(ire != NULL && ire->ire_ipversion == IPV4_VERSION);
+ ASSERT(!CLASSD(dst) || ipif != NULL);
+
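+	/*
+	 * Fall back to IP's normal output path whenever fast-path
+	 * transmission is impossible: broadcast/local/loopback or
+	 * multirt routes, no cached link-layer header (or too little
+	 * headroom for it), or a datagram larger than the path MTU.
+	 */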
+ if ((ire->ire_type & (IRE_BROADCAST|IRE_LOCAL|IRE_LOOPBACK)) ||
+ (ire->ire_flags & RTF_MULTIRT) || ire->ire_stq == NULL ||
+ ire->ire_max_frag < ntohs(ipha->ipha_length) ||
+ (ire_fp_mp = ire->ire_fp_mp) == NULL ||
+ (ire_fp_mp_len = MBLKL(ire_fp_mp)) > MBLKHEAD(mp)) {
+ if (ipif != NULL)
+ ipif_refrele(ipif);
+ UDP_STAT(udp_ip_ire_send);
+ IRE_REFRELE(ire);
+ ip_output(connp, mp, q, IP_WPUT);
+ return;
+ }
+
+ BUMP_MIB(&ip_mib, ipOutRequests);
+
+ ill = ire_to_ill(ire);
+ ASSERT(ill != NULL);
+
+ dev_q = ire->ire_stq->q_next;
+ ASSERT(dev_q != NULL);
+ /*
+ * If the service thread is already running, or if the driver
+ * queue is currently flow-controlled, queue this packet.
+ */
+ if ((q->q_first != NULL || connp->conn_draining) ||
+ ((dev_q->q_next || dev_q->q_first) && !canput(dev_q))) {
+ if (ip_output_queue) {
+ (void) putq(q, mp);
+ } else {
+ BUMP_MIB(&ip_mib, ipOutDiscards);
+ freemsg(mp);
+ }
+ if (ipif != NULL)
+ ipif_refrele(ipif);
+ IRE_REFRELE(ire);
+ return;
+ }
+
+ ipha->ipha_ident = (uint16_t)atomic_add_32_nv(&ire->ire_ident, 1);
+#ifndef _BIG_ENDIAN
+ ipha->ipha_ident = (ipha->ipha_ident << 8) | (ipha->ipha_ident >> 8);
+#endif
+
+ if (src == INADDR_ANY && !connp->conn_unspec_src) {
+ if (CLASSD(dst) && !(ire->ire_flags & RTF_SETSRC))
+ src = ipha->ipha_src = ipif->ipif_src_addr;
+ else
+ src = ipha->ipha_src = ire->ire_src_addr;
+ }
+
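+	/* Collect hardware checksum capabilities, if enabled on the ill */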
+ if (ILL_HCKSUM_CAPABLE(ill) && dohwcksum) {
+ ASSERT(ill->ill_hcksum_capab != NULL);
+ hcksum_txflags = ill->ill_hcksum_capab->ill_hcksum_txflags;
+ } else {
+ hcksum_txflags = 0;
+ }
+
+ /* pseudo-header checksum (do it in parts for IP header checksum) */
+ cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF);
+
+ ASSERT(ipha->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION);
+ up = IPH_UDPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
+ if (*up != 0) {
+ IP_CKSUM_XMIT_FAST(ire->ire_ipversion, hcksum_txflags,
+ mp, ipha, up, IPPROTO_UDP, IP_SIMPLE_HDR_LENGTH,
+ ntohs(ipha->ipha_length), cksum);
+
+ /* Software checksum? */
+ if (DB_CKSUMFLAGS(mp) == 0) {
+ UDP_STAT(udp_out_sw_cksum);
+ UDP_STAT_UPDATE(udp_out_sw_cksum_bytes,
+ ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH);
+ }
+ }
+
+ ipha->ipha_fragment_offset_and_flags |=
+ (uint32_t)htons(ire->ire_frag_flag);
+
+ /* Calculate IP header checksum if hardware isn't capable */
+ if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) {
+ IP_HDR_CKSUM(ipha, cksum, ((uint32_t *)ipha)[0],
+ ((uint16_t *)ipha)[4]);
+ }
+
+ if (CLASSD(dst)) {
+ ilm_t *ilm;
+
+ ILM_WALKER_HOLD(ill);
+ ilm = ilm_lookup_ill(ill, dst, ALL_ZONES);
+ ILM_WALKER_RELE(ill);
+ if (ilm != NULL) {
+ ip_multicast_loopback(q, ill, mp,
+ connp->conn_multicast_loop ? 0 :
+ IP_FF_NO_MCAST_LOOP, connp->conn_zoneid);
+ }
+
+ /* If multicast TTL is 0 then we are done */
+ if (ipha->ipha_ttl == 0) {
+ if (ipif != NULL)
+ ipif_refrele(ipif);
+ freemsg(mp);
+ IRE_REFRELE(ire);
+ return;
+ }
+ }
+
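+	/* Prepend the cached link-layer (fast-path) header and transmit */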
+ ASSERT(DB_TYPE(ire_fp_mp) == M_DATA);
+ mp->b_rptr = (uchar_t *)ipha - ire_fp_mp_len;
+ bcopy(ire_fp_mp->b_rptr, mp->b_rptr, ire_fp_mp_len);
+
+ UPDATE_OB_PKT_COUNT(ire);
+ ire->ire_last_used_time = lbolt;
+
+ if (ILL_POLL_CAPABLE(ill)) {
+ /*
+ * Send the packet directly to DLD, where it may be queued
+ * depending on the availability of transmit resources at
+ * the media layer.
+ */
+ IP_POLL_ILL_TX(ill, mp);
+ } else {
+ putnext(ire->ire_stq, mp);
+ }
+
+ if (ipif != NULL)
+ ipif_refrele(ipif);
+ IRE_REFRELE(ire);
}
/*
- * udp_wput_ipv6():
- * Assumes that udp_wput did some sanity checking on the destination
- * address.
+ * This routine handles all messages passed downstream. It either
+ * consumes the message or passes it downstream; it never queues
+ * a message.
*/
static void
-udp_wput_ipv6(queue_t *q, mblk_t *mp, sin6_t *sin6, t_scalar_t tudr_optlen)
+udp_output(conn_t *connp, mblk_t *mp, struct sockaddr *addr, socklen_t addrlen)
{
- ip6_t *ip6h;
- ip6i_t *ip6i; /* mp1->b_rptr even if no ip6i_t */
- mblk_t *mp1;
- int udp_ip_hdr_len = IPV6_HDR_LEN + UDPH_SIZE;
- size_t ip_len;
- udpha_t *udph;
- udp_t *udp;
- ip6_pkt_t ipp_s; /* For ancillary data options */
- ip6_pkt_t *ipp = &ipp_s;
- ip6_pkt_t *tipp; /* temporary ipp */
- uint32_t csum = 0;
- uint_t ignore = 0;
- uint_t option_exists = 0, is_sticky = 0;
- uint8_t *cp;
- uint8_t *nxthdr_ptr;
+ sin6_t *sin6;
+ sin_t *sin;
+ ipaddr_t v4dst;
+ uint16_t port;
+ uint_t srcid;
+ queue_t *q = connp->conn_wq;
+ udp_t *udp = connp->conn_udp;
+ t_scalar_t optlen;
+ int error = 0;
+ struct sockaddr_storage ss;
- udp = (udp_t *)q->q_ptr;
+ TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_START,
+ "udp_wput_start: connp %p mp %p", connp, mp);
+
+ /*
+	 * We directly handle several cases here: T_UNITDATA_REQ messages
+	 * coming down as M_PROTO/M_PCPROTO, and M_DATA messages for both
+	 * connected and non-connected sockets. In the non-connected
+	 * case the sockaddr is passed in when this routine is called.
+ */
+ switch (DB_TYPE(mp)) {
+ case M_DATA:
+ if (!udp->udp_direct_sockfs || udp->udp_state != TS_DATA_XFER) {
+ if (!udp->udp_direct_sockfs ||
+ addr == NULL || addrlen == 0) {
+ /* Not connected; address is required */
+ BUMP_MIB(&udp_mib, udpOutErrors);
+ UDP_STAT(udp_out_err_notconn);
+ freemsg(mp);
+ TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
+ "udp_wput_end: connp %p (%S)", connp,
+ "not-connected; address required");
+ return;
+ }
+ ASSERT(udp->udp_issocket);
+ UDP_DBGSTAT(udp_data_notconn);
+ /* Not connected; do some more checks below */
+ optlen = 0;
+ break;
+ }
+ /* M_DATA for connected socket */
+ UDP_DBGSTAT(udp_data_conn);
+ IN6_V4MAPPED_TO_IPADDR(&udp->udp_v6dst, v4dst);
+
+ /* Initialize addr and addrlen as if they're passed in */
+ if (udp->udp_family == AF_INET) {
+ sin = (sin_t *)&ss;
+ sin->sin_family = AF_INET;
+ sin->sin_port = udp->udp_dstport;
+ sin->sin_addr.s_addr = v4dst;
+ addr = (struct sockaddr *)sin;
+ addrlen = sizeof (*sin);
+ } else {
+ sin6 = (sin6_t *)&ss;
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_port = udp->udp_dstport;
+ sin6->sin6_flowinfo = udp->udp_flowinfo;
+ sin6->sin6_addr = udp->udp_v6dst;
+ sin6->sin6_scope_id = 0;
+ sin6->__sin6_src_id = 0;
+ addr = (struct sockaddr *)sin6;
+ addrlen = sizeof (*sin6);
+ }
+
+ if (udp->udp_family == AF_INET ||
+ IN6_IS_ADDR_V4MAPPED(&udp->udp_v6dst)) {
+ /*
+ * Handle both AF_INET and AF_INET6; the latter
+			 * for IPv4-mapped destination addresses. Note
+ * here that both addr and addrlen point to the
+ * corresponding struct depending on the address
+ * family of the socket.
+ */
+ mp = udp_output_v4(connp, mp, v4dst,
+ udp->udp_dstport, 0, &error);
+ } else {
+ mp = udp_output_v6(connp, mp, sin6, 0, &error);
+ }
+ if (error != 0) {
+ ASSERT(addr != NULL && addrlen != 0);
+ goto ud_error;
+ }
+ return;
+ case M_PROTO:
+ case M_PCPROTO: {
+ struct T_unitdata_req *tudr;
+
+ ASSERT((uintptr_t)MBLKL(mp) <= (uintptr_t)INT_MAX);
+ tudr = (struct T_unitdata_req *)mp->b_rptr;
+
+ /* Handle valid T_UNITDATA_REQ here */
+ if (MBLKL(mp) >= sizeof (*tudr) &&
+ ((t_primp_t)mp->b_rptr)->type == T_UNITDATA_REQ) {
+ if (mp->b_cont == NULL) {
+ TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
+ "udp_wput_end: q %p (%S)", q, "badaddr");
+ error = EPROTO;
+ goto ud_error;
+ }
+
+ if (!MBLKIN(mp, 0, tudr->DEST_offset +
+ tudr->DEST_length)) {
+ TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
+ "udp_wput_end: q %p (%S)", q, "badaddr");
+ error = EADDRNOTAVAIL;
+ goto ud_error;
+ }
+ /*
+ * If a port has not been bound to the stream, fail.
+ * This is not a problem when sockfs is directly
+ * above us, because it will ensure that the socket
+ * is first bound before allowing data to be sent.
+ */
+ if (udp->udp_state == TS_UNBND) {
+ TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
+ "udp_wput_end: q %p (%S)", q, "outstate");
+ error = EPROTO;
+ goto ud_error;
+ }
+ addr = (struct sockaddr *)
+ &mp->b_rptr[tudr->DEST_offset];
+ addrlen = tudr->DEST_length;
+ optlen = tudr->OPT_length;
+ if (optlen != 0)
+ UDP_STAT(udp_out_opt);
+ break;
+ }
+ /* FALLTHRU */
+ }
+ default:
+ udp_become_writer(connp, mp, udp_wput_other_wrapper,
+ SQTAG_UDP_OUTPUT);
+ return;
+ }
+ ASSERT(addr != NULL);
+
+ switch (udp->udp_family) {
+ case AF_INET6:
+ sin6 = (sin6_t *)addr;
+ if (!OK_32PTR((char *)sin6) || addrlen != sizeof (sin6_t) ||
+ sin6->sin6_family != AF_INET6) {
+ TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
+ "udp_wput_end: q %p (%S)", q, "badaddr");
+ error = EADDRNOTAVAIL;
+ goto ud_error;
+ }
+
+ if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
+ /*
+ * Destination is a non-IPv4-compatible IPv6 address.
+ * Send out an IPv6 format packet.
+ */
+ mp = udp_output_v6(connp, mp, sin6, optlen, &error);
+ if (error != 0)
+ goto ud_error;
+
+ TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
+ "udp_wput_end: q %p (%S)", q, "udp_output_v6");
+ return;
+ }
+ /*
+ * If the local address is not zero or a mapped address
+ * return an error. It would be possible to send an IPv4
+ * packet but the response would never make it back to the
+ * application since it is bound to a non-mapped address.
+ */
+ if (!IN6_IS_ADDR_V4MAPPED(&udp->udp_v6src) &&
+ !IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)) {
+ TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
+ "udp_wput_end: q %p (%S)", q, "badaddr");
+ error = EADDRNOTAVAIL;
+ goto ud_error;
+ }
+ /* Send IPv4 packet without modifying udp_ipversion */
+ /* Extract port and ipaddr */
+ port = sin6->sin6_port;
+ IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, v4dst);
+ srcid = sin6->__sin6_src_id;
+ break;
+
+ case AF_INET:
+ sin = (sin_t *)addr;
+ if (!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t) ||
+ sin->sin_family != AF_INET) {
+ TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
+ "udp_wput_end: q %p (%S)", q, "badaddr");
+ error = EADDRNOTAVAIL;
+ goto ud_error;
+ }
+ /* Extract port and ipaddr */
+ port = sin->sin_port;
+ v4dst = sin->sin_addr.s_addr;
+ srcid = 0;
+ break;
+ }
+
+ /*
+	 * If options were passed in, process them for verification and handling
+ */
+ if (optlen != 0) {
+ ASSERT(DB_TYPE(mp) != M_DATA);
+ if (udp_unitdata_opt_process(q, mp, &error, NULL) < 0) {
+ /* failure */
+ TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END,
+ "udp_wput_end: q %p (%S)", q,
+ "udp_unitdata_opt_process");
+ goto ud_error;
+ }
+ /*
+		 * Options were processed successfully; the option
+		 * buffer in mp (described by OPT_length/OPT_offset)
+		 * may have been modified to hold the setting results.
+ */
+ }
+ ASSERT(error == 0);
+ mp = udp_output_v4(connp, mp, v4dst, port, srcid, &error);
+ if (error != 0) {
+ud_error:
+ UDP_STAT(udp_out_err_output);
+ ASSERT(mp != NULL);
+ /* mp is freed by the following routine */
+ udp_ud_err(q, mp, (uchar_t *)addr, (t_scalar_t)addrlen,
+ (t_scalar_t)error);
+ }
+}
+
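+/*
+ * Squeue callback used when a udp_output() call had to be
+ * deferred by _UDP_ENTER().
+ */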
+/* ARGSUSED */
+static void
+udp_output_wrapper(void *arg, mblk_t *mp, void *arg2)
+{
+ udp_output((conn_t *)arg, mp, NULL, 0);
+ _UDP_EXIT((conn_t *)arg);
+}
+
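+/*
+ * Write-side put procedure: enter the conn_t perimeter and
+ * process the message via udp_output().
+ */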
+static void
+udp_wput(queue_t *q, mblk_t *mp)
+{
+ _UDP_ENTER(Q_TO_CONN(UDP_WR(q)), mp, udp_output_wrapper,
+ SQTAG_UDP_WPUT);
+}
+
+/*
+ * Allocate and prepare a T_UNITDATA_REQ message.
+ */
+static mblk_t *
+udp_tudr_alloc(struct sockaddr *addr, socklen_t addrlen)
+{
+ struct T_unitdata_req *tudr;
+ mblk_t *mp;
+
+ mp = allocb(sizeof (*tudr) + addrlen, BPRI_MED);
+ if (mp != NULL) {
+ mp->b_wptr += sizeof (*tudr) + addrlen;
+ DB_TYPE(mp) = M_PROTO;
+
+ tudr = (struct T_unitdata_req *)mp->b_rptr;
+ tudr->PRIM_type = T_UNITDATA_REQ;
+ tudr->DEST_length = addrlen;
+ tudr->DEST_offset = (t_scalar_t)sizeof (*tudr);
+ tudr->OPT_length = 0;
+ tudr->OPT_offset = 0;
+ bcopy(addr, tudr+1, addrlen);
+ }
+ return (mp);
+}
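+
+/*
+ * For illustration (this mirrors the slow path of udp_wput_data()
+ * below), a caller links the original M_DATA payload onto the header
+ * built here:
+ *
+ *	mblk_t *tudr_mp = udp_tudr_alloc(addr, addrlen);
+ *	if (tudr_mp != NULL) {
+ *		tudr_mp->b_cont = mp;
+ *		mp = tudr_mp;
+ *	}
+ *
+ * yielding the conventional TPI chain of an M_PROTO T_UNITDATA_REQ
+ * followed by the M_DATA payload.
+ */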
+
+/*
+ * Entry point for sockfs when udp is in "direct sockfs" mode. This mode
+ * is valid when we are directly beneath the stream head, and thus sockfs
+ * is able to bypass STREAMS and directly call us, passing along the sockaddr
+ * structure without the cumbersome T_UNITDATA_REQ interface. Note that
+ * this is done for both connected and non-connected endpoints.
+ */
+void
+udp_wput_data(queue_t *q, mblk_t *mp, struct sockaddr *addr, socklen_t addrlen)
+{
+ conn_t *connp;
+ udp_t *udp;
+
+ q = UDP_WR(q);
+ connp = Q_TO_CONN(q);
+ udp = connp->conn_udp;
+
+ /* udpsockfs should only send down M_DATA for this entry point */
+ ASSERT(DB_TYPE(mp) == M_DATA);
+
+ mutex_enter(&connp->conn_lock);
+ UDP_MODE_ASSERTIONS(udp, UDP_ENTER);
+
+ if (udp->udp_mode != UDP_MT_HOT) {
+ /*
+ * We can't enter this conn right away because another
+ * thread is currently executing as writer; therefore we
+ * need to deposit the message into the squeue to be
+ * drained later. If a socket address is present, we
+		 * need to create a T_UNITDATA_REQ message as a placeholder.
+ */
+ if (addr != NULL && addrlen != 0) {
+ mblk_t *tudr_mp = udp_tudr_alloc(addr, addrlen);
+
+ if (tudr_mp == NULL) {
+ mutex_exit(&connp->conn_lock);
+ BUMP_MIB(&udp_mib, udpOutErrors);
+ UDP_STAT(udp_out_err_tudr);
+ freemsg(mp);
+ return;
+ }
+ /* Tag the packet with T_UNITDATA_REQ */
+ tudr_mp->b_cont = mp;
+ mp = tudr_mp;
+ }
+ mutex_exit(&connp->conn_lock);
+ udp_enter(connp, mp, udp_output_wrapper, SQTAG_UDP_WPUT);
+ return;
+ }
+
+ /* We can execute as reader right away. */
+ UDP_READERS_INCREF(udp);
+ mutex_exit(&connp->conn_lock);
+
+ udp_output(connp, mp, addr, addrlen);
+
+ mutex_enter(&connp->conn_lock);
+ UDP_MODE_ASSERTIONS(udp, UDP_EXIT);
+ UDP_READERS_DECREF(udp);
+ mutex_exit(&connp->conn_lock);
+}
+
+/*
+ * udp_output_v6():
+ * Assumes that udp_wput did some sanity checking on the destination
+ * address.
+ */
+static mblk_t *
+udp_output_v6(conn_t *connp, mblk_t *mp, sin6_t *sin6, t_scalar_t tudr_optlen,
+ int *error)
+{
+ ip6_t *ip6h;
+ ip6i_t *ip6i; /* mp1->b_rptr even if no ip6i_t */
+ mblk_t *mp1 = (DB_TYPE(mp) == M_DATA ? mp : mp->b_cont);
+ mblk_t *mp2;
+ int udp_ip_hdr_len = IPV6_HDR_LEN + UDPH_SIZE;
+ size_t ip_len;
+ udpha_t *udph;
+ udp_t *udp = connp->conn_udp;
+ queue_t *q = connp->conn_wq;
+ ip6_pkt_t ipp_s; /* For ancillary data options */
+ ip6_pkt_t *ipp = &ipp_s;
+ ip6_pkt_t *tipp; /* temporary ipp */
+ uint32_t csum = 0;
+ uint_t ignore = 0;
+ uint_t option_exists = 0, is_sticky = 0;
+ uint8_t *cp;
+ uint8_t *nxthdr_ptr;
+
+ *error = 0;
+
+ /* mp1 points to the M_DATA mblk carrying the packet */
+ ASSERT(mp1 != NULL && DB_TYPE(mp1) == M_DATA);
+ ASSERT(tudr_optlen == 0 || DB_TYPE(mp) != M_DATA);
/*
* If the local address is a mapped address return
@@ -5354,9 +6557,8 @@ udp_wput_ipv6(queue_t *q, mblk_t *mp, sin6_t *sin6, t_scalar_t tudr_optlen)
* since it is bound to a mapped address.
*/
if (IN6_IS_ADDR_V4MAPPED(&udp->udp_v6src)) {
- BUMP_MIB(&udp_mib, udpOutErrors);
- udp_ud_err(q, mp, EADDRNOTAVAIL);
- return;
+ *error = EADDRNOTAVAIL;
+ goto done;
}
ipp->ipp_fields = 0;
@@ -5366,17 +6568,12 @@ udp_wput_ipv6(queue_t *q, mblk_t *mp, sin6_t *sin6, t_scalar_t tudr_optlen)
* If TPI options passed in, feed it for verification and handling
*/
if (tudr_optlen != 0) {
- int error;
-
- if (udp_unitdata_opt_process(q, mp, &error,
- (void *)ipp) < 0) {
+ if (udp_unitdata_opt_process(q, mp, error, (void *)ipp) < 0) {
/* failure */
- BUMP_MIB(&udp_mib, udpOutErrors);
- udp_ud_err(q, mp, error);
- return;
+ goto done;
}
ignore = ipp->ipp_sticky_ignored;
- ASSERT(error == 0);
+ ASSERT(*error == 0);
}
if (sin6->sin6_scope_id != 0 &&
@@ -5389,8 +6586,7 @@ udp_wput_ipv6(queue_t *q, mblk_t *mp, sin6_t *sin6, t_scalar_t tudr_optlen)
option_exists |= IPPF_SCOPE_ID;
}
- if ((udp->udp_sticky_ipp.ipp_fields == 0) &&
- (ipp->ipp_fields == 0)) {
+ if ((udp->udp_sticky_ipp.ipp_fields == 0) && (ipp->ipp_fields == 0)) {
/* No sticky options nor ancillary data. */
goto no_options;
}
@@ -5475,7 +6671,8 @@ udp_wput_ipv6(queue_t *q, mblk_t *mp, sin6_t *sin6, t_scalar_t tudr_optlen)
if (!(ignore & IPPF_USE_MIN_MTU)) {
if (ipp->ipp_fields & IPPF_USE_MIN_MTU) {
option_exists |= IPPF_USE_MIN_MTU;
- } else if (udp->udp_sticky_ipp.ipp_fields & IPPF_USE_MIN_MTU) {
+ } else if (udp->udp_sticky_ipp.ipp_fields &
+ IPPF_USE_MIN_MTU) {
option_exists |= IPPF_USE_MIN_MTU;
is_sticky |= IPPF_USE_MIN_MTU;
}
@@ -5518,26 +6715,28 @@ no_options:
udp_ip_hdr_len += sizeof (ip6i_t);
/* check/fix buffer config, setup pointers into it */
- mp1 = mp->b_cont;
ip6h = (ip6_t *)&mp1->b_rptr[-udp_ip_hdr_len];
- if ((mp1->b_datap->db_ref != 1) ||
- ((unsigned char *)ip6h < mp1->b_datap->db_base) ||
+ if (DB_REF(mp1) != 1 || ((unsigned char *)ip6h < DB_BASE(mp1)) ||
!OK_32PTR(ip6h)) {
/* Try to get everything in a single mblk next time */
if (udp_ip_hdr_len > udp->udp_max_hdr_len) {
udp->udp_max_hdr_len = udp_ip_hdr_len;
- (void) mi_set_sth_wroff(RD(q),
+ (void) mi_set_sth_wroff(UDP_RD(q),
udp->udp_max_hdr_len + udp_wroff_extra);
}
- mp1 = allocb(udp_ip_hdr_len + udp_wroff_extra, BPRI_LO);
- if (!mp1) {
- BUMP_MIB(&udp_mib, udpOutErrors);
- udp_ud_err(q, mp, ENOMEM);
- return;
+ mp2 = allocb(udp_ip_hdr_len + udp_wroff_extra, BPRI_LO);
+ if (mp2 == NULL) {
+ *error = ENOMEM;
+ goto done;
}
- mp1->b_cont = mp->b_cont;
- mp->b_cont = mp1;
- mp1->b_wptr = mp1->b_datap->db_lim;
+ mp2->b_wptr = DB_LIM(mp2);
+ mp2->b_cont = mp1;
+ mp1 = mp2;
+ if (DB_TYPE(mp) != M_DATA)
+ mp->b_cont = mp1;
+ else
+ mp = mp1;
+
ip6h = (ip6_t *)(mp1->b_wptr - udp_ip_hdr_len);
}
mp1->b_rptr = (unsigned char *)ip6h;
@@ -5624,7 +6823,7 @@ no_options:
if (sin6->__sin6_src_id != 0 &&
IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) {
ip_srcid_find_id(sin6->__sin6_src_id,
- &ip6h->ip6_src, udp->udp_zoneid);
+ &ip6h->ip6_src, connp->conn_zoneid);
}
}
@@ -5731,9 +6930,8 @@ no_options:
* Drop packet - only support Type 0 routing.
* Notify the application as well.
*/
- udp_ud_err(q, mp, EPROTO);
- BUMP_MIB(&udp_mib, udpOutErrors);
- return;
+ *error = EPROTO;
+ goto done;
}
/*
@@ -5741,9 +6939,8 @@ no_options:
* addresses in the header. Thus it must be even.
*/
if (rth->ip6r_len & 0x1) {
- udp_ud_err(q, mp, EPROTO);
- BUMP_MIB(&udp_mib, udpOutErrors);
- return;
+ *error = EPROTO;
+ goto done;
}
/*
* Shuffle the routing header and ip6_dst
@@ -5758,9 +6955,8 @@ no_options:
* for subsequent hops.
*/
if (IN6_IS_ADDR_V4MAPPED(&ip6h->ip6_dst)) {
- udp_ud_err(q, mp, EADDRNOTAVAIL);
- BUMP_MIB(&udp_mib, udpOutErrors);
- return;
+ *error = EADDRNOTAVAIL;
+ goto done;
}
cp += (rth->ip6r_len + 1)*8;
@@ -5769,14 +6965,11 @@ no_options:
/* count up length of UDP packet */
ip_len = (mp1->b_wptr - (unsigned char *)ip6h) - IPV6_HDR_LEN;
- {
- mblk_t *mp2;
-
- if ((mp2 = mp1->b_cont) != NULL) {
- do {
- ip_len += mp2->b_wptr - mp2->b_rptr;
- } while ((mp2 = mp2->b_cont) != NULL);
- }
+ if ((mp2 = mp1->b_cont) != NULL) {
+ do {
+ ASSERT((uintptr_t)MBLKL(mp2) <= (uintptr_t)UINT_MAX);
+ ip_len += (uint32_t)MBLKL(mp2);
+ } while ((mp2 = mp2->b_cont) != NULL);
}
/*
@@ -5785,9 +6978,8 @@ no_options:
* the size will have wrapped and be inconsistent with the msg size.
*/
if (ip_len > IP_MAXPACKET) {
- BUMP_MIB(&udp_mib, udpOutErrors);
- udp_ud_err(q, mp, EMSGSIZE);
- return;
+ *error = EMSGSIZE;
+ goto done;
}
/* Store the UDP length. Subtract length of extension hdrs */
@@ -5810,11 +7002,25 @@ no_options:
#endif
ip6h->ip6_plen = ip_len;
- freeb(mp);
+ if (DB_TYPE(mp) != M_DATA) {
+ ASSERT(mp != mp1);
+ freeb(mp);
+ }
+
+ /* mp has been consumed and we'll return success */
+ ASSERT(*error == 0);
+ mp = NULL;
/* We're done. Pass the packet to IP */
BUMP_MIB(&udp_mib, udpOutDatagrams);
- putnext(q, mp1);
+ ip_output_v6(connp, mp1, q, IP_WPUT);
+
+done:
+ if (*error != 0) {
+ ASSERT(mp != NULL);
+ BUMP_MIB(&udp_mib, udpOutErrors);
+ }
+ return (mp);
}
static void
@@ -5823,26 +7029,18 @@ udp_wput_other(queue_t *q, mblk_t *mp)
uchar_t *rptr = mp->b_rptr;
struct datab *db;
struct iocblk *iocp;
- udp_t *udp;
cred_t *cr;
+ conn_t *connp = Q_TO_CONN(q);
+ udp_t *udp = connp->conn_udp;
TRACE_1(TR_FAC_UDP, TR_UDP_WPUT_OTHER_START,
"udp_wput_other_start: q %p", q);
- udp = (udp_t *)q->q_ptr;
db = mp->b_datap;
- cr = DB_CREDDEF(mp, udp->udp_credp);
+ cr = DB_CREDDEF(mp, connp->conn_cred);
switch (db->db_type) {
- case M_DATA:
- /* Not connected */
- BUMP_MIB(&udp_mib, udpOutErrors);
- freemsg(mp);
- TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
- "udp_wput_other_end: q %p (%S)",
- q, "not-connected");
- return;
case M_PROTO:
case M_PCPROTO:
if (mp->b_wptr - rptr < sizeof (t_scalar_t)) {
@@ -5852,7 +7050,7 @@ udp_wput_other(queue_t *q, mblk_t *mp)
q, "protoshort");
return;
}
- switch (((union T_primitives *)rptr)->type) {
+ switch (((t_primp_t)rptr)->type) {
case T_ADDR_REQ:
udp_addr_req(q, mp);
TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
@@ -5885,7 +7083,7 @@ udp_wput_other(queue_t *q, mblk_t *mp)
* be bad. Valid T_UNITDATA_REQs are handled
* in udp_wput.
*/
- udp_ud_err(q, mp, EADDRNOTAVAIL);
+ udp_ud_err(q, mp, NULL, 0, EADDRNOTAVAIL);
TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
"udp_wput_other_end: q %p (%S)",
q, "unitdatareq");
@@ -5897,14 +7095,26 @@ udp_wput_other(queue_t *q, mblk_t *mp)
return;
case T_SVR4_OPTMGMT_REQ:
if (!snmpcom_req(q, mp, udp_snmp_set, udp_snmp_get, cr))
- (void) svr4_optcom_req(q, mp, cr, &udp_opt_obj);
+ /*
+ * Use upper queue for option processing in
+ * case the request is not handled at this
+ * level and needs to be passed down to IP.
+ */
+ (void) svr4_optcom_req(_WR(UDP_RD(q)),
+ mp, cr, &udp_opt_obj);
TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
"udp_wput_other_end: q %p (%S)",
q, "optmgmtreq");
return;
case T_OPTMGMT_REQ:
- (void) tpi_optcom_req(q, mp, cr, &udp_opt_obj);
+ /*
+ * Use upper queue for option processing in
+ * case the request is not handled at this
+ * level and needs to be passed down to IP.
+ */
+ (void) tpi_optcom_req(_WR(UDP_RD(q)),
+ mp, cr, &udp_opt_obj);
TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
"udp_wput_other_end: q %p (%S)",
q, "optmgmtreq");
@@ -5954,10 +7164,9 @@ udp_wput_other(queue_t *q, mblk_t *mp)
* don't know the peer's name.
*/
iocp->ioc_error = ENOTCONN;
-err_ret:;
iocp->ioc_count = 0;
mp->b_datap->db_type = M_IOCACK;
- qreply(q, mp);
+ putnext(UDP_RD(q), mp);
TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
"udp_wput_other_end: q %p (%S)",
q, "getpeername");
@@ -5982,13 +7191,45 @@ err_ret:;
/* nd_getset performs the necessary checking */
case ND_GET:
if (nd_getset(q, udp_g_nd, mp)) {
- qreply(q, mp);
+ putnext(UDP_RD(q), mp);
TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
"udp_wput_other_end: q %p (%S)",
q, "get");
return;
}
break;
+ case _SIOCSOCKFALLBACK:
+ /*
+ * Either sockmod is about to be popped and the
+ * socket would now be treated as a plain stream,
+			 * or a module is about to be pushed, so we can
+			 * no longer use the read-side synchronous stream.
+			 * Drain any queued data and disable the direct
+			 * sockfs interface from now on.
+ */
+ if (!udp->udp_issocket) {
+ DB_TYPE(mp) = M_IOCNAK;
+ iocp->ioc_error = EINVAL;
+ } else {
+ udp->udp_issocket = B_FALSE;
+ if (udp->udp_direct_sockfs) {
+ /*
+ * Disable read-side synchronous
+ * stream interface and drain any
+ * queued data.
+ */
+ udp_rcv_drain(UDP_RD(q), udp,
+ B_FALSE);
+ ASSERT(!udp->udp_direct_sockfs);
+ UDP_STAT(udp_sock_fallback);
+ }
+ DB_TYPE(mp) = M_IOCACK;
+ iocp->ioc_error = 0;
+ }
+ iocp->ioc_count = 0;
+ iocp->ioc_rval = 0;
+ putnext(UDP_RD(q), mp);
+ return;
default:
break;
}
@@ -6004,7 +7245,15 @@ err_ret:;
}
TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END,
"udp_wput_other_end: q %p (%S)", q, "end");
- putnext(q, mp);
+ ip_output(connp, mp, q, IP_WPUT);
+}
+
+/* ARGSUSED */
+static void
+udp_wput_other_wrapper(void *arg, mblk_t *mp, void *arg2)
+{
+ udp_wput_other(((conn_t *)arg)->conn_wq, mp);
+ udp_exit((conn_t *)arg);
}
/*
@@ -6017,11 +7266,11 @@ udp_wput_iocdata(queue_t *q, mblk_t *mp)
mblk_t *mp1;
STRUCT_HANDLE(strbuf, sb);
uint16_t port;
- udp_t *udp;
in6_addr_t v6addr;
ipaddr_t v4addr;
uint32_t flowinfo = 0;
int addrlen;
+ udp_t *udp = Q_TO_UDP(q);
/* Make sure it is one of ours. */
switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) {
@@ -6029,9 +7278,11 @@ udp_wput_iocdata(queue_t *q, mblk_t *mp)
case TI_GETPEERNAME:
break;
default:
- putnext(q, mp);
+ ip_output(Q_TO_CONN(q), mp, q, IP_WPUT);
return;
}
+
+ q = WR(UDP_RD(q));
switch (mi_copy_state(q, mp, &mp1)) {
case -1:
return;
@@ -6068,7 +7319,6 @@ udp_wput_iocdata(queue_t *q, mblk_t *mp)
*/
STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag,
(void *)mp1->b_rptr);
- udp = (udp_t *)q->q_ptr;
if (udp->udp_family == AF_INET)
addrlen = sizeof (sin_t);
else
@@ -6113,6 +7363,10 @@ udp_wput_iocdata(queue_t *q, mblk_t *mp)
port = udp->udp_port;
break;
case TI_GETPEERNAME:
+ if (udp->udp_state != TS_DATA_XFER) {
+ mi_copy_done(q, mp, ENOTCONN);
+ return;
+ }
if (udp->udp_family == AF_INET) {
ASSERT(udp->udp_ipversion == IPV4_VERSION);
v4addr = V4_PART_OF_V6(udp->udp_v6dst);
@@ -6163,21 +7417,23 @@ static int
udp_unitdata_opt_process(queue_t *q, mblk_t *mp, int *errorp,
void *thisdg_attrs)
{
- udp_t *udp;
struct T_unitdata_req *udreqp;
int is_absreq_failure;
cred_t *cr;
+ conn_t *connp = Q_TO_CONN(q);
- ASSERT(((union T_primitives *)mp->b_rptr)->type);
+ ASSERT(((t_primp_t)mp->b_rptr)->type);
- udp = (udp_t *)q->q_ptr;
-
- cr = DB_CREDDEF(mp, udp->udp_credp);
+ cr = DB_CREDDEF(mp, connp->conn_cred);
udreqp = (struct T_unitdata_req *)mp->b_rptr;
*errorp = 0;
- *errorp = tpi_optcom_buf(q, mp, &udreqp->OPT_length,
+ /*
+	 * Use the upper queue for option processing, since the callback
+	 * routines expect to be called in the UDP instance rather than in IP.
+ */
+ *errorp = tpi_optcom_buf(_WR(UDP_RD(q)), mp, &udreqp->OPT_length,
udreqp->OPT_offset, cr, &udp_opt_obj,
thisdg_attrs, &is_absreq_failure);
@@ -6198,7 +7454,6 @@ udp_ddi_init(void)
int i;
UDP6_MAJ = ddi_name_to_major(UDP6);
- mutex_init(&udp_g_lock, NULL, MUTEX_DEFAULT, NULL);
udp_max_optsize = optcom_max_optsize(udp_opt_obj.odb_opt_des_arr,
udp_opt_obj.odb_opt_arr_cnt);
@@ -6218,7 +7473,11 @@ udp_ddi_init(void)
NULL);
}
(void) udp_param_register(udp_param_arr, A_CNT(udp_param_arr));
+
udp_kstat_init();
+
+ udp_cache = kmem_cache_create("udp_cache", sizeof (udp_t),
+ CACHE_ALIGN_SIZE, NULL, NULL, NULL, NULL, NULL, 0);
}
void
@@ -6228,14 +7487,16 @@ udp_ddi_destroy(void)
nd_free(&udp_g_nd);
- mutex_destroy(&udp_g_lock);
for (i = 0; i < udp_bind_fanout_size; i++) {
mutex_destroy(&udp_bind_fanout[i].uf_lock);
}
+
kmem_free(udp_bind_fanout, udp_bind_fanout_size *
sizeof (udp_fanout_t));
+
udp_kstat_fini();
+ kmem_cache_destroy(udp_cache);
}
static void
@@ -6250,9 +7511,9 @@ udp_kstat_init(void)
{ "outErrors", KSTAT_DATA_UINT32, 0 },
};
- udp_mibkp = kstat_create("udp", 0, "udp", "mib2", KSTAT_TYPE_NAMED,
- NUM_OF_FIELDS(udp_named_kstat_t),
- 0);
+ udp_mibkp = kstat_create(UDP_MOD_NAME, 0, UDP_MOD_NAME,
+ "mib2", KSTAT_TYPE_NAMED, NUM_OF_FIELDS(udp_named_kstat_t), 0);
+
if (udp_mibkp == NULL)
return;
@@ -6264,12 +7525,24 @@ udp_kstat_init(void)
udp_mibkp->ks_update = udp_kstat_update;
kstat_install(udp_mibkp);
+
+ if ((udp_ksp = kstat_create(UDP_MOD_NAME, 0, "udpstat",
+ "net", KSTAT_TYPE_NAMED,
+ sizeof (udp_statistics) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL)) != NULL) {
+ udp_ksp->ks_data = &udp_statistics;
+ kstat_install(udp_ksp);
+ }
}
static void
udp_kstat_fini(void)
{
- if (udp_mibkp) {
+ if (udp_ksp != NULL) {
+ kstat_delete(udp_ksp);
+ udp_ksp = NULL;
+ }
+ if (udp_mibkp != NULL) {
kstat_delete(udp_mibkp);
udp_mibkp = NULL;
}
@@ -6296,6 +7569,269 @@ udp_kstat_update(kstat_t *kp, int rw)
return (0);
}
+/* ARGSUSED */
+static void
+udp_rput(queue_t *q, mblk_t *mp)
+{
+ /*
+ * We get here whenever we do qreply() from IP,
+	 * i.e. as part of handling ioctls, etc.
+ */
+ putnext(q, mp);
+}
+
+/*
+ * Read-side synchronous stream info entry point, called as a
+ * result of handling certain STREAMS ioctl operations.
+ */
+static int
+udp_rinfop(queue_t *q, infod_t *dp)
+{
+ mblk_t *mp;
+ uint_t cmd = dp->d_cmd;
+ int res = 0;
+ int error = 0;
+ udp_t *udp = Q_TO_UDP(RD(UDP_WR(q)));
+ struct stdata *stp = STREAM(q);
+
+ mutex_enter(&udp->udp_drain_lock);
+ /* If shutdown on read has happened, return nothing */
+ mutex_enter(&stp->sd_lock);
+ if (stp->sd_flag & STREOF) {
+ mutex_exit(&stp->sd_lock);
+ goto done;
+ }
+ mutex_exit(&stp->sd_lock);
+
+ if ((mp = udp->udp_rcv_list_head) == NULL)
+ goto done;
+
+ ASSERT(DB_TYPE(mp) != M_DATA && mp->b_cont != NULL);
+
+ if (cmd & INFOD_COUNT) {
+ /*
+ * Return the number of messages.
+ */
+ dp->d_count += udp->udp_rcv_msgcnt;
+ res |= INFOD_COUNT;
+ }
+ if (cmd & INFOD_BYTES) {
+ /*
+ * Return size of all data messages.
+ */
+ dp->d_bytes += udp->udp_rcv_cnt;
+ res |= INFOD_BYTES;
+ }
+ if (cmd & INFOD_FIRSTBYTES) {
+ /*
+ * Return size of first data message.
+ */
+ dp->d_bytes = msgdsize(mp);
+ res |= INFOD_FIRSTBYTES;
+ dp->d_cmd &= ~INFOD_FIRSTBYTES;
+ }
+ if (cmd & INFOD_COPYOUT) {
+ mblk_t *mp1 = mp->b_cont;
+ int n;
+ /*
+ * Return data contents of first message.
+ */
+ ASSERT(DB_TYPE(mp1) == M_DATA);
+ while (mp1 != NULL && dp->d_uiop->uio_resid > 0) {
+ n = MIN(dp->d_uiop->uio_resid, MBLKL(mp1));
+ if (n != 0 && (error = uiomove((char *)mp1->b_rptr, n,
+ UIO_READ, dp->d_uiop)) != 0) {
+ goto done;
+ }
+ mp1 = mp1->b_cont;
+ }
+ res |= INFOD_COPYOUT;
+ dp->d_cmd &= ~INFOD_COPYOUT;
+ }
+done:
+ mutex_exit(&udp->udp_drain_lock);
+
+ dp->d_res |= res;
+
+ return (error);
+}
+
+/*
+ * Read-side synchronous stream entry point. This is called as a result
+ * of a recv/read operation done at sockfs, and is guaranteed to execute
+ * outside of the interrupt thread context. It returns a single datagram
+ * (b_cont chain of T_UNITDATA_IND plus data) to the upper layer.
+ */
+static int
+udp_rrw(queue_t *q, struiod_t *dp)
+{
+ mblk_t *mp;
+ udp_t *udp = Q_TO_UDP(_RD(UDP_WR(q)));
+
+ /* We should never get here when we're in SNMP mode */
+ ASSERT(!(udp->udp_connp->conn_flags & IPCL_UDPMOD));
+
+ /*
+ * Dequeue datagram from the head of the list and return
+ * it to caller; also ensure that RSLEEP sd_wakeq flag is
+ * set/cleared depending on whether or not there's data
+ * remaining in the list.
+ */
+ mutex_enter(&udp->udp_drain_lock);
+ if (!udp->udp_direct_sockfs) {
+ mutex_exit(&udp->udp_drain_lock);
+ UDP_STAT(udp_rrw_busy);
+ return (EBUSY);
+ }
+ if ((mp = udp->udp_rcv_list_head) != NULL) {
+ uint_t size = msgdsize(mp);
+
+ /* Last datagram in the list? */
+ if ((udp->udp_rcv_list_head = mp->b_next) == NULL)
+ udp->udp_rcv_list_tail = NULL;
+ mp->b_next = NULL;
+
+ udp->udp_rcv_cnt -= size;
+ udp->udp_rcv_msgcnt--;
+ UDP_STAT(udp_rrw_msgcnt);
+
+ /* No longer flow-controlling? */
+ if (udp->udp_rcv_cnt < udp->udp_rcv_hiwat &&
+ udp->udp_rcv_msgcnt < udp->udp_rcv_hiwat)
+ udp->udp_drain_qfull = B_FALSE;
+ }
+ if (udp->udp_rcv_list_head == NULL) {
+ /*
+ * Either we just dequeued the last datagram or
+ * we get here from sockfs and have nothing to
+ * return; in this case clear RSLEEP.
+ */
+ ASSERT(udp->udp_rcv_cnt == 0);
+ ASSERT(udp->udp_rcv_msgcnt == 0);
+ ASSERT(udp->udp_rcv_list_tail == NULL);
+ STR_WAKEUP_CLEAR(STREAM(q));
+ } else {
+ /*
+ * More data follows; we need udp_rrw() to be
+ * called in future to pick up the rest.
+ */
+ STR_WAKEUP_SET(STREAM(q));
+ }
+ mutex_exit(&udp->udp_drain_lock);
+ dp->d_mp = mp;
+ return (0);
+}
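+
+/*
+ * A usage note (a sketch of the expected flow rather than a separate
+ * interface): with D_SYNCSTR and _D_DIRECT in effect, the stream head
+ * read path calls this entry point directly, so a typical recv() is a
+ * single dequeue under udp_drain_lock. The EBUSY return above simply
+ * signals that direct mode has been switched off (e.g. by a drain in
+ * udp_rcv_drain()) and that the caller should use the regular STREAMS
+ * path instead.
+ */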
+
+/*
+ * Enqueue a completely-built T_UNITDATA_IND message into the receive
+ * list; this is typically executed within the interrupt thread context
+ * and so we do things as quickly as possible.
+ */
+static void
+udp_rcv_enqueue(queue_t *q, udp_t *udp, mblk_t *mp, uint_t pkt_len)
+{
+ ASSERT(q == RD(q));
+ ASSERT(pkt_len == msgdsize(mp));
+ ASSERT(mp->b_next == NULL && mp->b_cont != NULL);
+ ASSERT(DB_TYPE(mp) == M_PROTO && DB_TYPE(mp->b_cont) == M_DATA);
+ ASSERT(MBLKL(mp) >= sizeof (struct T_unitdata_ind));
+
+ mutex_enter(&udp->udp_drain_lock);
+ /*
+ * Wake up and signal the receiving app; it is okay to do this
+ * before enqueueing the mp because we are holding the drain lock.
+ * One of the advantages of synchronous stream is the ability for
+ * us to find out when the application performs a read on the
+	 * socket by way of the udp_rrw() entry point being called. We
+	 * need to generate SIGPOLL/SIGIO for each received datagram in
+	 * the case of an asynchronous socket, just as in the strrput()
+	 * case. However, we only wake the application up when necessary,
+	 * i.e. during the first enqueue. When udp_rrw() is called, we
+	 * send a single datagram upstream and call STR_WAKEUP_SET()
+	 * again when there is still data remaining in our receive queue.
+ */
+ if (udp->udp_rcv_list_head == NULL) {
+ STR_WAKEUP_SET(STREAM(q));
+ udp->udp_rcv_list_head = mp;
+ } else {
+ udp->udp_rcv_list_tail->b_next = mp;
+ }
+ udp->udp_rcv_list_tail = mp;
+ udp->udp_rcv_cnt += pkt_len;
+ udp->udp_rcv_msgcnt++;
+
+ /* Need to flow-control? */
+ if (udp->udp_rcv_cnt >= udp->udp_rcv_hiwat ||
+ udp->udp_rcv_msgcnt >= udp->udp_rcv_hiwat)
+ udp->udp_drain_qfull = B_TRUE;
+
+ /* Update poll events and send SIGPOLL/SIGIO if necessary */
+ STR_SENDSIG(STREAM(q));
+ mutex_exit(&udp->udp_drain_lock);
+}
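+
+/*
+ * A concrete example of the watermark logic above (values are
+ * illustrative only): with udp_rcv_hiwat at 8192, udp_drain_qfull is
+ * raised once 8192 bytes are pending, or once 8192 datagrams are
+ * queued, since the same watermark doubles as a message-count cap;
+ * udp_rrw() clears the flag again once both counts drop below the
+ * watermark.
+ */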
+
+/*
+ * Drain the contents of the receive list to the module upstream; we do
+ * this during close or when we fall back to the slow mode due to
+ * sockmod being popped or a module being pushed on top of us.
+ */
+static void
+udp_rcv_drain(queue_t *q, udp_t *udp, boolean_t closing)
+{
+ mblk_t *mp;
+
+ ASSERT(q == RD(q));
+
+ mutex_enter(&udp->udp_drain_lock);
+ /*
+ * There is no race with a concurrent udp_input() sending
+ * up packets using putnext() after we have cleared the
+ * udp_direct_sockfs flag but before we have completed
+ * sending up the packets in udp_rcv_list, since we are
+ * either a writer or we have quiesced the conn.
+ */
+ udp->udp_direct_sockfs = B_FALSE;
+ mutex_exit(&udp->udp_drain_lock);
+
+ if (udp->udp_rcv_list_head != NULL)
+ UDP_STAT(udp_drain);
+
+ /*
+ * Send up everything via putnext(); note here that we
+ * don't need the udp_drain_lock to protect us since
+	 * nothing can enter udp_rrw() and we currently
+ * have exclusive access to this udp.
+ */
+ while ((mp = udp->udp_rcv_list_head) != NULL) {
+ udp->udp_rcv_list_head = mp->b_next;
+ mp->b_next = NULL;
+ udp->udp_rcv_cnt -= msgdsize(mp);
+ udp->udp_rcv_msgcnt--;
+ if (closing) {
+ freemsg(mp);
+ } else {
+ putnext(q, mp);
+ }
+ }
+ ASSERT(udp->udp_rcv_cnt == 0);
+ ASSERT(udp->udp_rcv_msgcnt == 0);
+ ASSERT(udp->udp_rcv_list_head == NULL);
+ udp->udp_rcv_list_tail = NULL;
+ udp->udp_drain_qfull = B_FALSE;
+}
+
+static size_t
+udp_set_rcv_hiwat(udp_t *udp, size_t size)
+{
+ /* We add a bit of extra buffering */
+ size += size >> 1;
+ if (size > udp_max_buf)
+ size = udp_max_buf;
+
+ udp->udp_rcv_hiwat = size;
+ return (size);
+}
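+
+/*
+ * For example, a request for a 64K receive buffer becomes
+ * 65536 + (65536 >> 1) = 98304 bytes, subject to the udp_max_buf cap;
+ * the padded value is both recorded in udp_rcv_hiwat and handed back
+ * to the caller.
+ */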
+
/*
* Little helper for IPsec's NAT-T processing.
*/
diff --git a/usr/src/uts/common/inet/udp/udp6ddi.c b/usr/src/uts/common/inet/udp/udp6ddi.c
index 277aa3b970..c5b203c654 100644
--- a/usr/src/uts/common/inet/udp/udp6ddi.c
+++ b/usr/src/uts/common/inet/udp/udp6ddi.c
@@ -20,7 +20,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 1992,1997-2002 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -36,7 +36,13 @@
#define INET_DEVMINOR IPV6_MINOR
#define INET_DEVDESC "UDP6 STREAMS driver %I%"
#define INET_STRTAB udpinfo
-#define INET_DEVMTFLAGS IP_DEVMTFLAGS /* since we're really ip */
+#define INET_DEVMTFLAGS IP_DEVMTFLAGS
+/*
+ * We define both synchronous STREAMS and sockfs direct-access
+ * mode for the UDP module instance, because it is autopushed on
+ * top of /dev/ip for the sockets case.
+ */
+#define INET_MODMTFLAGS (D_MP|D_SYNCSTR|_D_DIRECT)
#include "../inetddi.c"
diff --git a/usr/src/uts/common/inet/udp/udpddi.c b/usr/src/uts/common/inet/udp/udpddi.c
index dcff39b3c9..ad5542295e 100644
--- a/usr/src/uts/common/inet/udp/udpddi.c
+++ b/usr/src/uts/common/inet/udp/udpddi.c
@@ -20,7 +20,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -32,20 +32,23 @@
#include <sys/modctl.h>
#include <inet/common.h>
#include <inet/ip.h>
+#include <inet/udp_impl.h>
#define INET_NAME "udp"
#define INET_MODDESC "UDP STREAMS module %I%"
#define INET_DEVDESC "UDP STREAMS driver %I%"
#define INET_DEVMINOR IPV4_MINOR
#define INET_STRTAB udpinfo
-#define INET_DEVMTFLAGS IP_DEVMTFLAGS /* since as a driver we're ip */
-#define INET_MODMTFLAGS (D_MP | D_MTQPAIR | D_MTPUTSHARED | _D_MTOCSHARED)
+#define INET_DEVMTFLAGS IP_DEVMTFLAGS
+/*
+ * We define both synchronous STREAMS and sockfs direct-access
+ * mode for the UDP module instance, because it is autopushed on
+ * top of /dev/ip for the sockets case.
+ */
+#define INET_MODMTFLAGS (D_MP|D_SYNCSTR|_D_DIRECT)
#include "../inetddi.c"
-extern void udp_ddi_init(void);
-extern void udp_ddi_destroy(void);
-
int
_init(void)
{
diff --git a/usr/src/uts/common/inet/udp_impl.h b/usr/src/uts/common/inet/udp_impl.h
index 8b5c52ba32..66faf934a8 100644
--- a/usr/src/uts/common/inet/udp_impl.h
+++ b/usr/src/uts/common/inet/udp_impl.h
@@ -20,7 +20,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -29,6 +29,13 @@
#pragma ident "%Z%%M% %I% %E% SMI"
+/*
+ * UDP implementation private declarations. These interfaces are
+ * used to build the IP module and are not meant to be accessed
+ * by any modules except IP itself. They are undocumented and are
+ * subject to change without notice.
+ */
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -43,32 +50,42 @@ extern "C" {
#include <inet/common.h>
#include <inet/ip.h>
+#define UDP_MOD_ID 5607
+
+/* udp_mode. UDP_MT_HOT and UDP_SQUEUE are stable modes; the rest are transient. */
+typedef enum {
+ UDP_MT_HOT = 0, /* UDP endpoint is MT HOT */
+ UDP_MT_QUEUED = 1, /* Messages enqueued in udp_mphead */
+ UDP_QUEUED_SQUEUE = 2, /* Messages enqueued in conn_sqp */
+ UDP_SQUEUE = 3 /* Single threaded using squeues */
+} udp_mode_t;
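+
+/*
+ * A rough sketch of the intended progression (udp.c holds the
+ * authoritative state machine): an endpoint normally runs UDP_MT_HOT;
+ * when a thread needs exclusive access, incoming messages are first
+ * staged on udp_mphead (UDP_MT_QUEUED), then moved onto the conn's
+ * squeue (UDP_QUEUED_SQUEUE), after which the endpoint runs
+ * single-threaded off the squeue (UDP_SQUEUE) until it can safely
+ * become MT hot again.
+ */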
+
/* Internal udp control structure, one per open stream */
typedef struct udp_s {
- uint32_t udp_state; /* TPI state */
- in_port_t udp_port; /* Port bound to this stream */
- in_port_t udp_dstport; /* Connected port */
- in6_addr_t udp_v6src; /* Source address of this stream */
- in6_addr_t udp_bound_v6src; /* Explicitly bound address */
- in6_addr_t udp_v6dst; /* Connected destination */
+ uint32_t udp_state; /* TPI state */
+ in_port_t udp_port; /* Port bound to this stream */
+ in_port_t udp_dstport; /* Connected port */
+ in6_addr_t udp_v6src; /* Source address of this stream */
+ in6_addr_t udp_bound_v6src; /* Explicitly bound address */
+ in6_addr_t udp_v6dst; /* Connected destination */
uint32_t udp_flowinfo; /* Connected flow id and tclass */
- uint32_t udp_max_hdr_len; /* For write offset in stream head */
+ uint32_t udp_max_hdr_len; /* For write offset in stream head */
sa_family_t udp_family; /* Family from socket() call */
/*
* IP format that packets transmitted from this struct should use.
* Value can be IP4_VERSION or IPV6_VERSION.
*/
ushort_t udp_ipversion;
- uint32_t udp_ip_snd_options_len; /* Len of IPv4 options */
+ uint32_t udp_ip_snd_options_len; /* Len of IPv4 options */
uchar_t *udp_ip_snd_options; /* Ptr to IPv4 options */
- uint32_t udp_ip_rcv_options_len; /* Len of IPv4 options recvd */
+ uint32_t udp_ip_rcv_options_len; /* Len of IPv4 options recvd */
uchar_t *udp_ip_rcv_options; /* Ptr to IPv4 options recvd */
- cred_t *udp_credp; /* Credentials at open */
uchar_t udp_multicast_ttl; /* IP*_MULTICAST_TTL/HOPS */
- ipaddr_t udp_multicast_if_addr; /* IP_MULTICAST_IF option */
+ ipaddr_t udp_multicast_if_addr; /* IP_MULTICAST_IF option */
uint_t udp_multicast_if_index; /* IPV6_MULTICAST_IF option */
int udp_bound_if; /* IP*_BOUND_IF option */
int udp_xmit_if; /* IP_XMIT_IF option */
+ conn_t *udp_connp;
uint32_t
udp_debug : 1, /* SO_DEBUG "socket" option. */
udp_dontroute : 1, /* SO_DONTROUTE "socket" option. */
@@ -76,35 +93,36 @@ typedef struct udp_s {
udp_useloopback : 1, /* SO_USELOOPBACK "socket" option */
udp_reuseaddr : 1, /* SO_REUSEADDR "socket" option. */
- udp_multicast_loop : 1, /* IP_MULTICAST_LOOP option */
udp_dgram_errind : 1, /* SO_DGRAM_ERRIND option */
udp_recvdstaddr : 1, /* IP_RECVDSTADDR option */
-
udp_recvopts : 1, /* IP_RECVOPTS option */
+
udp_discon_pending : 1, /* T_DISCON_REQ in progress */
udp_unspec_source : 1, /* IP*_UNSPEC_SRC option */
udp_ipv6_recvpktinfo : 1, /* IPV6_RECVPKTINFO option */
-
udp_ipv6_recvhoplimit : 1, /* IPV6_RECVHOPLIMIT option */
+
udp_ipv6_recvhopopts : 1, /* IPV6_RECVHOPOPTS option */
udp_ipv6_recvdstopts : 1, /* IPV6_RECVDSTOPTS option */
udp_ipv6_recvrthdr : 1, /* IPV6_RECVRTHDR option */
-
udp_ipv6_recvtclass : 1, /* IPV6_RECVTCLASS */
+
udp_ipv6_recvpathmtu : 1, /* IPV6_RECVPATHMTU */
udp_anon_priv_bind : 1,
udp_exclbind : 1, /* ``exclusive'' binding */
-
udp_recvif : 1, /* IP_RECVIF option */
+
udp_recvslla : 1, /* IP_RECVSLLA option */
udp_recvttl : 1, /* IP_RECVTTL option */
udp_recvucred : 1, /* IP_RECVUCRED option */
-
udp_old_ipv6_recvdstopts : 1, /* old form of IPV6_DSTOPTS */
- udp_ipv6_recvrthdrdstopts : 1, /* IPV6_RECVRTHDRDSTOPTS */
+ udp_ipv6_recvrthdrdstopts : 1, /* IPV6_RECVRTHDRDSTOPTS */
udp_rcvhdr : 1, /* UDP_RCVHDR option */
- udp_pad_to_bit_31 : 7;
+ udp_issocket : 1, /* socket mode */
+ udp_direct_sockfs : 1, /* direct calls to/from sockfs */
+
+ udp_pad_to_bit_31 : 4;
uint8_t udp_type_of_service; /* IP_TOS option */
uint8_t udp_ttl; /* TTL or hoplimit */
@@ -114,7 +132,20 @@ typedef struct udp_s {
uint_t udp_sticky_hdrs_len; /* Incl. ip6h and any ip6i */
struct udp_s *udp_bind_hash; /* Bind hash chain */
struct udp_s **udp_ptpbhn; /* Pointer to previous bind hash next. */
- zoneid_t udp_zoneid; /* ID of owning zone */
+ udp_mode_t udp_mode; /* Current mode of operation */
+ mblk_t *udp_mphead; /* Head of the queued operations */
+ mblk_t *udp_mptail; /* Tail of the queued operations */
+ uint_t udp_mpcount; /* Number of messages in the queue */
+ uint_t udp_reader_count; /* Number of reader threads */
+ uint_t udp_squeue_count; /* Number of messages in conn_sqp */
+
+ kmutex_t udp_drain_lock; /* lock for udp_rcv_list */
+ boolean_t udp_drain_qfull; /* drain queue is full */
+ mblk_t *udp_rcv_list_head; /* b_next chain of mblks */
+ mblk_t *udp_rcv_list_tail; /* last mblk in chain */
+ uint_t udp_rcv_cnt; /* total data in rcv_list */
+ uint_t udp_rcv_msgcnt; /* total messages in rcv_list */
+ size_t udp_rcv_hiwat; /* receive high watermark */
} udp_t;
/* UDP Protocol header */
@@ -127,6 +158,92 @@ typedef struct udpahdr_s {
} udpha_t;
#define UDPH_SIZE 8
+/* Named Dispatch Parameter Management Structure */
+typedef struct udpparam_s {
+ uint32_t udp_param_min;
+ uint32_t udp_param_max;
+ uint32_t udp_param_value;
+ char *udp_param_name;
+} udpparam_t;
+
+extern udpparam_t udp_param_arr[];
+
+#define udp_wroff_extra udp_param_arr[0].udp_param_value
+#define udp_ipv4_ttl udp_param_arr[1].udp_param_value
+#define udp_ipv6_hoplimit udp_param_arr[2].udp_param_value
+#define udp_smallest_nonpriv_port udp_param_arr[3].udp_param_value
+#define udp_do_checksum udp_param_arr[4].udp_param_value
+#define udp_smallest_anon_port udp_param_arr[5].udp_param_value
+#define udp_largest_anon_port udp_param_arr[6].udp_param_value
+#define udp_xmit_hiwat udp_param_arr[7].udp_param_value
+#define udp_xmit_lowat udp_param_arr[8].udp_param_value
+#define udp_recv_hiwat udp_param_arr[9].udp_param_value
+#define udp_max_buf udp_param_arr[10].udp_param_value
+#define udp_ndd_get_info_interval udp_param_arr[11].udp_param_value
+
+/* Kstats */
+typedef struct { /* Class "net" kstats */
+ kstat_named_t udp_ip_send;
+ kstat_named_t udp_ip_ire_send;
+ kstat_named_t udp_ire_null;
+ kstat_named_t udp_drain;
+ kstat_named_t udp_sock_fallback;
+ kstat_named_t udp_rrw_busy;
+ kstat_named_t udp_rrw_msgcnt;
+ kstat_named_t udp_out_sw_cksum;
+ kstat_named_t udp_out_sw_cksum_bytes;
+ kstat_named_t udp_out_opt;
+ kstat_named_t udp_out_err_notconn;
+ kstat_named_t udp_out_err_output;
+ kstat_named_t udp_out_err_tudr;
+ kstat_named_t udp_in_pktinfo;
+ kstat_named_t udp_in_recvdstaddr;
+ kstat_named_t udp_in_recvopts;
+ kstat_named_t udp_in_recvif;
+ kstat_named_t udp_in_recvslla;
+ kstat_named_t udp_in_recvucred;
+ kstat_named_t udp_in_recvttl;
+ kstat_named_t udp_in_recvhopopts;
+ kstat_named_t udp_in_recvhoplimit;
+ kstat_named_t udp_in_recvdstopts;
+ kstat_named_t udp_in_recvrtdstopts;
+ kstat_named_t udp_in_recvrthdr;
+ kstat_named_t udp_in_recvpktinfo;
+ kstat_named_t udp_in_recvtclass;
+#ifdef DEBUG
+ kstat_named_t udp_data_conn;
+ kstat_named_t udp_data_notconn;
+#endif
+} udp_stat_t;
+
+extern udp_stat_t udp_statistics;
+
+#define UDP_STAT(x) (udp_statistics.x.value.ui64++)
+#define UDP_STAT_UPDATE(x, n) (udp_statistics.x.value.ui64 += (n))
+#ifdef DEBUG
+#define UDP_DBGSTAT(x) UDP_STAT(x)
+#else
+#define UDP_DBGSTAT(x)
+#endif /* DEBUG */
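+
+/*
+ * Usage is a one-liner at the point of interest, for example the
+ * counter bumped on the send path when TPI options accompany a
+ * datagram:
+ *
+ *	UDP_STAT(udp_out_opt);
+ *
+ * UDP_DBGSTAT() compiles away entirely on non-DEBUG builds.
+ */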
+
+extern major_t UDP6_MAJ;
+
+extern int udp_opt_default(queue_t *, t_scalar_t, t_scalar_t, uchar_t *);
+extern int udp_opt_get(queue_t *, t_scalar_t, t_scalar_t, uchar_t *);
+extern int udp_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *,
+ uint_t *, uchar_t *, void *, cred_t *, mblk_t *);
+extern int udp_snmp_get(queue_t *, mblk_t *);
+extern int udp_snmp_set(queue_t *, t_scalar_t, t_scalar_t, uchar_t *, int);
+extern void udp_close_free(conn_t *);
+extern void udp_quiesce_conn(conn_t *);
+extern void udp_ddi_init(void);
+extern void udp_ddi_destroy(void);
+extern void udp_resume_bind(conn_t *, mblk_t *);
+extern void udp_conn_recv(conn_t *, mblk_t *);
+extern boolean_t udp_compute_checksum(void);
+extern void udp_wput_data(queue_t *, mblk_t *, struct sockaddr *,
+ socklen_t);
+
#endif /* _KERNEL */
#ifdef __cplusplus
diff --git a/usr/src/uts/common/io/gld.c b/usr/src/uts/common/io/gld.c
index 90ccf7952a..faa90fb792 100644
--- a/usr/src/uts/common/io/gld.c
+++ b/usr/src/uts/common/io/gld.c
@@ -3415,6 +3415,8 @@ gld_cap_ack(queue_t *q, mblk_t *mp)
dlhp->hcksum_txflags |= HCKSUM_INET_PARTIAL;
if (macinfo->gldm_capabilities & GLD_CAP_CKSUM_FULL_V4)
dlhp->hcksum_txflags |= HCKSUM_INET_FULL_V4;
+ if (macinfo->gldm_capabilities & GLD_CAP_CKSUM_FULL_V6)
+ dlhp->hcksum_txflags |= HCKSUM_INET_FULL_V6;
if (macinfo->gldm_capabilities & GLD_CAP_CKSUM_IPHDR)
dlhp->hcksum_txflags |= HCKSUM_IPHDRCKSUM;
diff --git a/usr/src/uts/common/io/stream.c b/usr/src/uts/common/io/stream.c
index 93564f29f4..9baaebd365 100644
--- a/usr/src/uts/common/io/stream.c
+++ b/usr/src/uts/common/io/stream.c
@@ -1690,6 +1690,21 @@ getq(queue_t *q)
}
/*
+ * Calculate the number of data bytes in a single data message block, taking
+ * multidata messages into account.
+ */
+
+#define ADD_MBLK_SIZE(mp, size) \
+ if (DB_TYPE(mp) != M_MULTIDATA) { \
+ (size) += MBLKL(mp); \
+ } else { \
+ uint_t pinuse; \
+ \
+ mmd_getsize(mmd_getmultidata(mp), NULL, &pinuse); \
+ (size) += pinuse; \
+ }
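+
+/*
+ * The accounting loops below use this to tally a whole message, e.g.:
+ *
+ *	bytecnt = 0;
+ *	for (tmp = bp; tmp != NULL; tmp = tmp->b_cont)
+ *		ADD_MBLK_SIZE(tmp, bytecnt);
+ *
+ * so that an M_MULTIDATA block contributes its pinned payload size as
+ * reported by mmd_getsize() instead of a meaningless MBLKL() value.
+ */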
+
+/*
* Like getq() but does not backenable. This is used by the stream
* head when a putback() is likely. The caller must call qbackenable()
* after it is done with accessing the queue.
@@ -1721,7 +1736,7 @@ getq_noenab(queue_t *q)
/* Get message byte count for q_count accounting */
for (tmp = bp; tmp; tmp = tmp->b_cont) {
- bytecnt += (tmp->b_wptr - tmp->b_rptr);
+ ADD_MBLK_SIZE(tmp, bytecnt);
mblkcnt++;
}
@@ -1941,7 +1956,7 @@ rmvq_noenab(queue_t *q, mblk_t *mp)
/* Get the size of the message for q_count accounting */
for (tmp = mp; tmp; tmp = tmp->b_cont) {
- bytecnt += (tmp->b_wptr - tmp->b_rptr);
+ ADD_MBLK_SIZE(tmp, bytecnt);
mblkcnt++;
}
@@ -2433,9 +2448,10 @@ putq(queue_t *q, mblk_t *bp)
/* Get message byte count for q_count accounting */
for (tmp = bp; tmp; tmp = tmp->b_cont) {
- bytecnt += (tmp->b_wptr - tmp->b_rptr);
+ ADD_MBLK_SIZE(tmp, bytecnt);
mblkcnt++;
}
+
if (qbp) {
qbp->qb_count += bytecnt;
qbp->qb_mblkcnt += mblkcnt;
@@ -2617,7 +2633,7 @@ putbq(queue_t *q, mblk_t *bp)
/* Get message byte count for q_count accounting */
for (tmp = bp; tmp; tmp = tmp->b_cont) {
- bytecnt += (tmp->b_wptr - tmp->b_rptr);
+ ADD_MBLK_SIZE(tmp, bytecnt);
mblkcnt++;
}
if (qbp) {
@@ -2748,7 +2764,7 @@ badord:
/* Get mblk and byte count for q_count accounting */
for (tmp = mp; tmp; tmp = tmp->b_cont) {
- bytecnt += (tmp->b_wptr - tmp->b_rptr);
+ ADD_MBLK_SIZE(tmp, bytecnt);
mblkcnt++;
}
diff --git a/usr/src/uts/common/io/strsun.c b/usr/src/uts/common/io/strsun.c
index 00b22e348f..87f0eeaa60 100644
--- a/usr/src/uts/common/io/strsun.c
+++ b/usr/src/uts/common/io/strsun.c
@@ -37,7 +37,9 @@
#include <sys/errno.h>
#include <sys/stream.h>
#include <sys/stropts.h>
+#include <sys/strsubr.h>
#include <sys/strsun.h>
+#include <sys/sysmacros.h>
#include <sys/cmn_err.h>
void
@@ -243,3 +245,63 @@ miocpullup(mblk_t *iocmp, size_t size)
freemsg(datamp);
return (0);
}
+
+/* Copy user data into a new mblk_t chain */
+mblk_t *
+mcopyinuio(struct stdata *stp, uio_t *uiop, ssize_t iosize,
+ ssize_t maxblk, int *errorp)
+{
+ mblk_t *head = NULL, **tail = &head;
+ size_t offset = stp->sd_wroff;
+
+ if (iosize == INFPSZ || iosize > uiop->uio_resid)
+ iosize = uiop->uio_resid;
+
+ if (maxblk == INFPSZ)
+ maxblk = iosize;
+
+ /* Nothing to do in these cases, so we're done */
+ if (iosize < 0 || maxblk < 0 || (maxblk == 0 && iosize > 0))
+ goto done;
+
+ if (stp->sd_flag & STRCOPYCACHED)
+ uiop->uio_extflg |= UIO_COPY_CACHED;
+
+ /*
+ * We will enter the loop below if iosize is 0; it will allocate an
+ * empty message block and call uiomove(9F) which will just return.
+	 * We could avoid that with an extra check, but that would only slow
+ * down the much more likely case where iosize is larger than 0.
+ */
+ do {
+ ssize_t blocksize;
+ mblk_t *mp;
+
+ blocksize = MIN(iosize, maxblk);
+ ASSERT(blocksize >= 0);
+ if ((mp = allocb_cred(offset + blocksize, CRED())) == NULL) {
+ *errorp = ENOMEM;
+ return (head);
+ }
+ mp->b_rptr += offset;
+ mp->b_wptr = mp->b_rptr + blocksize;
+ DB_CPID(mp) = curproc->p_pid;
+
+ *tail = mp;
+ tail = &mp->b_cont;
+
+ /* uiomove(9F) either returns 0 or EFAULT */
+ if ((*errorp = uiomove(mp->b_rptr, (size_t)blocksize,
+ UIO_WRITE, uiop)) != 0) {
+ ASSERT(*errorp != ENOMEM);
+ freemsg(head);
+ return (NULL);
+ }
+
+ iosize -= blocksize;
+ } while (iosize > 0);
+
+done:
+ *errorp = 0;
+ return (head);
+}
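+
+/*
+ * A worked example of the loop above (numbers are illustrative): with
+ * sd_wroff of 0, an iosize of 3000 and a maxblk of 1460, the caller
+ * gets back a three-block chain of 1460, 1460 and 80 bytes, each block
+ * filled by uiomove(9F) from the user's iovecs in order.
+ */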
diff --git a/usr/src/uts/common/os/streamio.c b/usr/src/uts/common/os/streamio.c
index e28d9e2fe0..0b0ac98ca4 100644
--- a/usr/src/uts/common/os/streamio.c
+++ b/usr/src/uts/common/os/streamio.c
@@ -2642,11 +2642,18 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize,
int
strwrite(struct vnode *vp, struct uio *uiop, cred_t *crp)
{
+ return (strwrite_common(vp, uiop, crp, 0));
+}
+
+/* ARGSUSED2 */
+int
+strwrite_common(struct vnode *vp, struct uio *uiop, cred_t *crp, int wflag)
+{
struct stdata *stp;
struct queue *wqp;
ssize_t rmin, rmax;
ssize_t iosize;
- char waitflag;
+ int waitflag;
int tempmode;
int error = 0;
int b_flag;
@@ -2701,7 +2708,7 @@ strwrite(struct vnode *vp, struct uio *uiop, cred_t *crp)
/*
* Do until count satisfied or error.
*/
- waitflag = WRITEWAIT;
+ waitflag = WRITEWAIT | wflag;
if (stp->sd_flag & OLDNDELAY)
tempmode = uiop->uio_fmode & ~FNDELAY;
else
@@ -2803,79 +2810,6 @@ out:
}
/*
- * kstrwritemp() has very similar semantics as that of strwrite().
- * The main difference is it obtains mblks from the caller and also
- * does not do any copy as done in strwrite() from user buffers to
- * kernel buffers.
- *
- *
- * Currently, this routine is used by sendfile to send data allocated
- * within the kernel without any copying. This interface does not use the
- * synchronous stream interface as synch. stream interface implies
- * copying.
- */
-int
-kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
-{
- struct stdata *stp;
- struct queue *wqp;
- char waitflag;
- int tempmode;
- int error;
- int done = 0;
-
- ASSERT(vp->v_stream);
- stp = vp->v_stream;
-
- if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
- mutex_enter(&stp->sd_lock);
- error = strwriteable(stp, B_FALSE, B_TRUE);
- mutex_exit(&stp->sd_lock);
- if (error != 0)
- return (error);
- }
-
- /*
- * First, check for flow control without grabbing the sd_lock.
- * If we would block, re-check with the lock. This is similar
- * to the logic used by strwrite().
- */
- wqp = stp->sd_wrq;
- if (canputnext(wqp)) {
- putnext(wqp, mp);
- return (0);
- }
-
- waitflag = WRITEWAIT;
- if (stp->sd_flag & OLDNDELAY)
- tempmode = fmode & ~FNDELAY;
- else
- tempmode = fmode;
-
- mutex_enter(&stp->sd_lock);
- do {
- if (canputnext(wqp)) {
- mutex_exit(&stp->sd_lock);
- putnext(wqp, mp);
- return (0);
- }
- error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
- &done);
- } while (error == 0 && !done);
-
- mutex_exit(&stp->sd_lock);
- /*
- * EAGAIN tells the application to try again. ENOMEM
- * is returned only if the memory allocation size
- * exceeds the physical limits of the system. ENOMEM
- * can't be true here.
- */
- if (error == ENOMEM)
- error = EAGAIN;
- return (error);
-}
-
-/*
* Stream head write service routine.
* Its job is to wake up any sleeping writers when a queue
* downstream needs data (part of the flow control in putq and getq).
diff --git a/usr/src/uts/common/os/strsubr.c b/usr/src/uts/common/os/strsubr.c
index 66184c5206..16dad7e4bb 100644
--- a/usr/src/uts/common/os/strsubr.c
+++ b/usr/src/uts/common/os/strsubr.c
@@ -2437,6 +2437,18 @@ devflg_to_qflag(struct streamtab *stp, uint32_t devflag, uint32_t *qflagp,
if (devflag & D_SYNCSTR)
qflag |= QSYNCSTR;
+ /*
+ * Private flag used by a transport module to indicate
+ * to sockfs that it supports direct-access mode without
+ * having to go through STREAMS.
+ */
+ if (devflag & _D_DIRECT) {
+ /* Reject unless the module is fully-MT (no perimeter) */
+ if ((qflag & QMT_TYPEMASK) != QMTSAFE)
+ goto bad;
+ qflag |= _QDIRECT;
+ }
+
*qflagp = qflag;
*sqtypep = sqtype;
return (0);
@@ -8236,11 +8248,11 @@ hcksum_assoc(mblk_t *mp, multidata_t *mmd, pdesc_t *pd,
ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_MULTIDATA);
if (mp->b_datap->db_type == M_DATA) {
/* Associate values for M_DATA type */
- mp->b_datap->db_cksumstart = (intptr_t)start;
- mp->b_datap->db_cksumstuff = (intptr_t)stuff;
- mp->b_datap->db_cksumend = (intptr_t)end;
- mp->b_datap->db_struioun.cksum.flags = flags;
- mp->b_datap->db_cksum16 = (uint16_t)value;
+ DB_CKSUMSTART(mp) = (intptr_t)start;
+ DB_CKSUMSTUFF(mp) = (intptr_t)stuff;
+ DB_CKSUMEND(mp) = (intptr_t)end;
+ DB_CKSUMFLAGS(mp) = flags;
+ DB_CKSUM16(mp) = (uint16_t)value;
} else {
pattrinfo_t pa_info;
@@ -8258,6 +8270,8 @@ hcksum_assoc(mblk_t *mp, multidata_t *mmd, pdesc_t *pd,
hck->hcksum_end_offset = end;
hck->hcksum_cksum_val.inet_cksum = (uint16_t)value;
hck->hcksum_flags = flags;
+ } else {
+ rc = -1;
}
}
return (rc);
@@ -8271,20 +8285,16 @@ hcksum_retrieve(mblk_t *mp, multidata_t *mmd, pdesc_t *pd,
ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_MULTIDATA);
if (mp->b_datap->db_type == M_DATA) {
if (flags != NULL) {
- *flags = mp->b_datap->db_struioun.cksum.flags;
+ *flags = DB_CKSUMFLAGS(mp);
if (*flags & HCK_PARTIALCKSUM) {
if (start != NULL)
- *start = (uint32_t)
- mp->b_datap->db_cksumstart;
+ *start = (uint32_t)DB_CKSUMSTART(mp);
if (stuff != NULL)
- *stuff = (uint32_t)
- mp->b_datap->db_cksumstuff;
+ *stuff = (uint32_t)DB_CKSUMSTUFF(mp);
if (end != NULL)
- *end =
- (uint32_t)mp->b_datap->db_cksumend;
+ *end = (uint32_t)DB_CKSUMEND(mp);
if (value != NULL)
- *value =
- (uint32_t)mp->b_datap->db_cksum16;
+ *value = (uint32_t)DB_CKSUM16(mp);
}
}
} else {
diff --git a/usr/src/uts/common/sys/conf.h b/usr/src/uts/common/sys/conf.h
index 2f19697c81..305c40e236 100644
--- a/usr/src/uts/common/sys/conf.h
+++ b/usr/src/uts/common/sys/conf.h
@@ -24,7 +24,7 @@
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -219,6 +219,8 @@ extern int cdev_prop_op(dev_t, dev_info_t *, ddi_prop_op_t,
#define D_U64BIT 0x40000 /* Driver supports unsigned 64-bit uio offset */
+#define _D_DIRECT 0x80000 /* Private flag for transport modules */
+
#endif /* !defined(_XPG4_2) || defined(__EXTENSIONS__) */
#ifdef __cplusplus
diff --git a/usr/src/uts/common/sys/dlpi.h b/usr/src/uts/common/sys/dlpi.h
index c35f9dc27d..1169d68d68 100644
--- a/usr/src/uts/common/sys/dlpi.h
+++ b/usr/src/uts/common/sys/dlpi.h
@@ -689,6 +689,8 @@ typedef struct {
/* ability */
#define HCKSUM_INET_FULL_V4 0x04 /* Full 1's complement checksum */
/* ability for IPv4 packets. */
+#define HCKSUM_INET_FULL_V6 0x08 /* Full 1's complement checksum */
+ /* ability for IPv6 packets. */
#define HCKSUM_IPHDRCKSUM 0x10 /* IPv4 Header checksum offload */
/* capability */
#ifdef _KERNEL
diff --git a/usr/src/uts/common/sys/gld.h b/usr/src/uts/common/sys/gld.h
index ed24a8deae..e42bb62f28 100644
--- a/usr/src/uts/common/sys/gld.h
+++ b/usr/src/uts/common/sys/gld.h
@@ -20,7 +20,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -240,9 +240,12 @@ typedef struct gld_mac_info {
#define GLD_CAP_LINKSTATE 0x00000001 /* will call gld_linkstate() */
#define GLD_CAP_CKSUM_IPHDR 0x00000008 /* IP checksum offload */
#define GLD_CAP_CKSUM_PARTIAL 0x00000010 /* TCP/UDP partial */
-#define GLD_CAP_CKSUM_FULL_V4 0x00000020 /* TCP/UDP full */
-#define GLD_CAP_CKSUM_ANY 0x00000038 /* any or all of the above */
+#define GLD_CAP_CKSUM_FULL_V4 0x00000020 /* TCP/UDP full for IPv4 */
#define GLD_CAP_ZEROCOPY 0x00000040 /* zerocopy */
+#define GLD_CAP_CKSUM_FULL_V6 0x00000080 /* TCP/UDP full for IPv6 */
+#define GLD_CAP_CKSUM_ANY \
+ (GLD_CAP_CKSUM_IPHDR|GLD_CAP_CKSUM_PARTIAL| \
+ GLD_CAP_CKSUM_FULL_V4|GLD_CAP_CKSUM_FULL_V6)
/* values of gldm_linkstate, as passed to gld_linkstate() */
#define GLD_LINKSTATE_DOWN -1
diff --git a/usr/src/uts/common/sys/multidata.h b/usr/src/uts/common/sys/multidata.h
index 60ce570fbf..f649b187bc 100644
--- a/usr/src/uts/common/sys/multidata.h
+++ b/usr/src/uts/common/sys/multidata.h
@@ -20,7 +20,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -68,19 +68,24 @@ typedef struct mbufinfo_s {
/*
* Multidata packet descriptor information.
*/
-typedef struct pdescinfo_s {
- uint_t flags; /* misc. flags */
- uchar_t *hdr_base; /* start address of header area */
- uchar_t *hdr_rptr; /* start address of header data */
- uchar_t *hdr_wptr; /* end address of header data */
- uchar_t *hdr_lim; /* end address of header area */
- uint_t pld_cnt; /* number of payload area */
- struct pld_ary_s {
- int pld_pbuf_idx; /* payload buffer index */
- uchar_t *pld_rptr; /* start address of payload data */
- uchar_t *pld_wptr; /* pointer to end of payload data */
- } pld_ary[MULTIDATA_MAX_PBUFS];
-} pdescinfo_t;
+struct pld_ary_s {
+ int pld_pbuf_idx; /* payload buffer index */
+ uchar_t *pld_rptr; /* start address of payload data */
+ uchar_t *pld_wptr; /* pointer to end of payload data */
+};
+
+#define PDESCINFO_STRUCT(elems) \
+{ \
+ uint_t flags; /* misc. flags */ \
+ uchar_t *hdr_base; /* start address of header area */ \
+ uchar_t *hdr_rptr; /* start address of header data */ \
+ uchar_t *hdr_wptr; /* end address of header data */ \
+ uchar_t *hdr_lim; /* end address of header area */ \
+	uint_t pld_cnt;		/* number of payload areas */	\
+ struct pld_ary_s pld_ary[(elems)]; \
+}
+
+typedef struct pdescinfo_s PDESCINFO_STRUCT(MULTIDATA_MAX_PBUFS) pdescinfo_t;
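+
+/*
+ * Consumers that need fewer payload spans can instantiate a smaller,
+ * layout-compatible variant; for instance, the private TCP copy
+ * removed from multidata_impl.h in this change can presumably be
+ * re-expressed as:
+ *
+ *	typedef struct tcp_pdescinfo_s PDESCINFO_STRUCT(2) tcp_pdescinfo_t;
+ */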
/*
* Possible values for flags
diff --git a/usr/src/uts/common/sys/multidata_impl.h b/usr/src/uts/common/sys/multidata_impl.h
index 92df853beb..05589c6f03 100644
--- a/usr/src/uts/common/sys/multidata_impl.h
+++ b/usr/src/uts/common/sys/multidata_impl.h
@@ -20,7 +20,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -191,21 +191,6 @@ struct multidata_s {
uint_t mmd_pbuf_ref; /* descriptors referring to payload buffer(s) */
};
-/*
- * Smaller and private version of pdescinfo_t used specifically for tcp,
- * which allows for only two payload spans per packet. Any changes made
- * to the pdescinfo_t structure must be reflected here as well.
- */
-typedef struct tcp_pdescinfo_s {
- uint_t flags; /* misc. flags */
- uchar_t *hdr_base; /* start address of header area */
- uchar_t *hdr_rptr; /* start address of header data */
- uchar_t *hdr_wptr; /* end address of header data */
- uchar_t *hdr_lim; /* end address of header area */
- uint_t pld_cnt; /* number of payload area */
- struct pld_ary_s pld_ary[2];
-} tcp_pdescinfo_t;
-
#ifdef _KERNEL
extern void mmd_init(void);
diff --git a/usr/src/uts/common/sys/socketvar.h b/usr/src/uts/common/sys/socketvar.h
index fc39185768..7bcb924d7d 100644
--- a/usr/src/uts/common/sys/socketvar.h
+++ b/usr/src/uts/common/sys/socketvar.h
@@ -100,6 +100,7 @@ struct sockaddr_ux {
};
typedef struct sonodeops sonodeops_t;
+typedef struct sonode sonode_t;
/*
* The sonode represents a socket. A sonode never exist in the file system
@@ -364,7 +365,7 @@ struct sonode {
#define SS_DONEREAD 0x00080000 /* NCAfs: all data read */
#define SS_MOREDATA 0x00100000 /* NCAfs: NCA has more data */
-#define SS_TCP_FAST_ACCEPT 0x00200000 /* Use TCP's accept fast-path */
+#define SS_DIRECT 0x00200000 /* transport is directly below */
#define SS_LADDR_VALID 0x01000000 /* so_laddr valid for user */
#define SS_FADDR_VALID 0x02000000 /* so_faddr valid for user */
@@ -769,8 +770,10 @@ extern void so_drain_discon_ind(struct sonode *);
extern void so_flush_discon_ind(struct sonode *);
extern int sowaitconnected(struct sonode *, int, int);
+extern int sostream_direct(struct sonode *, struct uio *,
+ mblk_t *, cred_t *);
extern int sosend_dgram(struct sonode *, struct sockaddr *,
- socklen_t, struct uio *, int);
+ socklen_t, struct uio *, int);
extern int sosend_svc(struct sonode *, struct uio *, t_scalar_t, int, int);
extern void so_installhooks(struct sonode *);
extern int so_strinit(struct sonode *, struct sonode *);
diff --git a/usr/src/uts/common/sys/sockio.h b/usr/src/uts/common/sys/sockio.h
index 0c15e6deee..c9e57359ef 100644
--- a/usr/src/uts/common/sys/sockio.h
+++ b/usr/src/uts/common/sys/sockio.h
@@ -265,9 +265,9 @@ extern "C" {
#define SIOCDXARP _IOW('i', 168, struct xarpreq) /* delete ARP entry */
/*
- * IOCTL to indicate to the transport that the sockmod is being popped
+ * IOCTL private to sockfs.
*/
-#define SIOCPOPSOCKFS _IOW('i', 169, 0)
+#define _SIOCSOCKFALLBACK _IOW('i', 169, 0)
/*
* IOCTLs for getting and setting zone associated with an interface, and
diff --git a/usr/src/uts/common/sys/stream.h b/usr/src/uts/common/sys/stream.h
index e17ce9388f..3c7b9e685c 100644
--- a/usr/src/uts/common/sys/stream.h
+++ b/usr/src/uts/common/sys/stream.h
@@ -171,6 +171,8 @@ typedef struct queue {
#define _QINSERTING 0x04000000 /* Private, module is being inserted */
#define _QREMOVING 0x08000000 /* Private, module is being removed */
#define _QASSOCIATED 0x10000000 /* queue is associated with a device */
+#define _QDIRECT 0x20000000 /* Private; transport module uses */
+ /* direct interface to/from sockfs */
/* queue sqflags (protected by SQLOCK). */
#define Q_SQQUEUED 0x01 /* Queue is in the syncq list */
diff --git a/usr/src/uts/common/sys/strsubr.h b/usr/src/uts/common/sys/strsubr.h
index 4151204cd3..f907db2c06 100644
--- a/usr/src/uts/common/sys/strsubr.h
+++ b/usr/src/uts/common/sys/strsubr.h
@@ -1096,6 +1096,8 @@ extern int strpoll(register struct stdata *, short, int, short *,
extern void strclean(struct vnode *);
extern void str_cn_clean(); /* XXX hook for consoles signal cleanup */
extern int strwrite(struct vnode *, struct uio *, cred_t *);
+extern int strwrite_common(struct vnode *, struct uio *, cred_t *, int);
+extern int kstrwritemp(struct vnode *, mblk_t *, ushort_t);
extern int strread(struct vnode *, struct uio *, cred_t *);
extern int strioctl(struct vnode *, int, intptr_t, int, int, cred_t *, int *);
extern int strrput(queue_t *, mblk_t *);
@@ -1180,6 +1182,7 @@ extern mblk_t *allocb_wait(size_t, uint_t, uint_t, int *);
extern mblk_t *allocb_cred(size_t, cred_t *);
extern mblk_t *allocb_cred_wait(size_t, uint_t, int *, cred_t *);
extern mblk_t *allocb_tmpl(size_t, const mblk_t *);
+extern mblk_t *allocb_tryhard(size_t);
extern void mblk_setcred(mblk_t *, cred_t *);
extern void strpollwakeup(vnode_t *, short);
extern int putnextctl_wait(queue_t *, int);
@@ -1188,7 +1191,6 @@ extern int kstrputmsg(struct vnode *, mblk_t *, struct uio *, ssize_t,
unsigned char, int, int);
extern int kstrgetmsg(struct vnode *, mblk_t **, struct uio *,
unsigned char *, int *, clock_t, rval_t *);
-extern int kstrwritemp(struct vnode *, mblk_t *, ushort_t);
extern void strsetrerror(vnode_t *, int, int, errfunc_t);
extern void strsetwerror(vnode_t *, int, int, errfunc_t);
@@ -1217,6 +1219,8 @@ extern void fmodsw_rele(fmodsw_impl_t *);
extern void freemsgchain(mblk_t *);
extern mblk_t *copymsgchain(mblk_t *);
+extern mblk_t *mcopyinuio(struct stdata *, uio_t *, ssize_t, ssize_t, int *);
+
/*
* shared or externally configured data structures
*/
@@ -1263,6 +1267,19 @@ extern struct queue *RD(queue_t *);
extern struct queue *WR(queue_t *);
extern int SAMESTR(queue_t *);
+/*
+ * The following hardware checksum related macros are private
+ * interfaces that are subject to change without notice.
+ */
+#ifdef _KERNEL
+#define DB_CKSUMSTART(mp) ((mp)->b_datap->db_cksumstart)
+#define DB_CKSUMEND(mp) ((mp)->b_datap->db_cksumend)
+#define DB_CKSUMSTUFF(mp) ((mp)->b_datap->db_cksumstuff)
+#define DB_CKSUMFLAGS(mp) ((mp)->b_datap->db_struioun.cksum.flags)
+#define DB_CKSUM16(mp) ((mp)->b_datap->db_cksum16)
+#define DB_CKSUM32(mp) ((mp)->b_datap->db_cksum32)
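+
+/*
+ * For example (a sketch), hcksum_assoc() in strsubr.c now tags an
+ * M_DATA block for checksum offload via:
+ *
+ *	DB_CKSUMSTART(mp) = (intptr_t)start;
+ *	DB_CKSUMSTUFF(mp) = (intptr_t)stuff;
+ *	DB_CKSUMEND(mp) = (intptr_t)end;
+ *	DB_CKSUMFLAGS(mp) = flags;
+ *	DB_CKSUM16(mp) = (uint16_t)value;
+ */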
+#endif /* _KERNEL */
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/common/syscall/sendfile.c b/usr/src/uts/common/syscall/sendfile.c
index 2a3e29a859..04bbd99f65 100644
--- a/usr/src/uts/common/syscall/sendfile.c
+++ b/usr/src/uts/common/syscall/sendfile.c
@@ -73,6 +73,89 @@ extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *,
extern void nl7c_sendfilev(struct sonode *, u_offset_t, struct sendfilevec *,
int);
+/*
+ * kstrwritemp() has semantics very similar to those of strwrite().
+ * The main difference is that it obtains its mblks from the caller
+ * and does not copy data from user buffers to kernel buffers as
+ * strwrite() does.
+ *
+ * Currently, this routine is used by sendfile to send data allocated
+ * within the kernel without any copying. This interface does not use
+ * the synchronous stream interface, as that would imply copying.
+ */
+int
+kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
+{
+ struct stdata *stp;
+ struct queue *wqp;
+ char waitflag;
+ int tempmode;
+ int error = 0;
+ int done = 0;
+ struct sonode *so;
+ boolean_t direct;
+
+ ASSERT(vp->v_stream);
+ stp = vp->v_stream;
+
+ so = VTOSO(vp);
+ direct = (so->so_state & SS_DIRECT);
+
+ /*
+ * This is the sockfs direct fast path. canputnext() need
+ * not be accurate so we don't grab the sd_lock here. If
+ * we get flow-controlled, we grab sd_lock just before the
+ * do..while loop below to emulate what strwrite() does.
+ */
+ wqp = stp->sd_wrq;
+ if (canputnext(wqp) && direct &&
+ !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
+ return (sostream_direct(so, NULL, mp, CRED()));
+ } else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
+ /* Fast check of flags before acquiring the lock */
+ mutex_enter(&stp->sd_lock);
+ error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
+ mutex_exit(&stp->sd_lock);
+ if (error != 0) {
+ if (!(stp->sd_flag & STPLEX) &&
+ (stp->sd_wput_opt & SW_SIGPIPE)) {
+ tsignal(curthread, SIGPIPE);
+ error = EPIPE;
+ }
+ return (error);
+ }
+ }
+
+ waitflag = WRITEWAIT;
+ if (stp->sd_flag & OLDNDELAY)
+ tempmode = fmode & ~FNDELAY;
+ else
+ tempmode = fmode;
+
+ mutex_enter(&stp->sd_lock);
+ do {
+ if (canputnext(wqp)) {
+ mutex_exit(&stp->sd_lock);
+ putnext(wqp, mp);
+ return (0);
+ }
+ error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
+ &done);
+ } while (error == 0 && !done);
+
+ mutex_exit(&stp->sd_lock);
+ /*
+ * EAGAIN tells the application to try again. ENOMEM
+ * is returned only if the memory allocation size
+ * exceeds the physical limits of the system. ENOMEM
+ * can't be true here.
+ */
+ if (error == ENOMEM)
+ error = EAGAIN;
+ return (error);
+}
+
#define SEND_MAX_CHUNK 16
#if defined(_SYSCALL32_IMPL) || defined(_ILP32)
@@ -1045,7 +1128,7 @@ sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt,
goto err;
}
- if ((so->so_state & SS_TCP_FAST_ACCEPT) &&
+ if ((so->so_state & SS_DIRECT) &&
(so->so_priv != NULL)) {
maxblk = ((tcp_t *)so->so_priv)->tcp_mss;
} else {
diff --git a/usr/src/uts/intel/ia32/ml/modstubs.s b/usr/src/uts/intel/ia32/ml/modstubs.s
index e0b0a92ee9..658dc6dd89 100644
--- a/usr/src/uts/intel/ia32/ml/modstubs.s
+++ b/usr/src/uts/intel/ia32/ml/modstubs.s
@@ -482,6 +482,7 @@ fcnname/**/_info: \
NO_UNLOAD_STUB(sockfs, sosendfile64, nomod_zero);
NO_UNLOAD_STUB(sockfs, sock_getfasync, nomod_zero);
NO_UNLOAD_STUB(sockfs, nl7c_sendfilev, nomod_zero);
+ NO_UNLOAD_STUB(sockfs, sostream_direct, nomod_zero);
END_MODULE(sockfs);
#endif
@@ -529,12 +530,6 @@ fcnname/**/_info: \
END_MODULE(spdsock);
#endif
-#ifndef UDP_MODULE
- MODULE(udp,drv);
- WSTUB(udp, udp_compute_checksum, nomod_zero);
- END_MODULE(udp);
-#endif
-
#ifndef NATTYMOD_MODULE
MODULE(nattymod, strmod);
WSTUB(nattymod, nattymod_clean_ipif, nomod_zero);
diff --git a/usr/src/uts/sparc/ml/modstubs.s b/usr/src/uts/sparc/ml/modstubs.s
index 9594335f33..599658a635 100644
--- a/usr/src/uts/sparc/ml/modstubs.s
+++ b/usr/src/uts/sparc/ml/modstubs.s
@@ -368,6 +368,7 @@ stubs_base:
NO_UNLOAD_STUB(sockfs, sosendfile64, nomod_zero);
NO_UNLOAD_STUB(sockfs, sock_getfasync, nomod_zero);
NO_UNLOAD_STUB(sockfs, nl7c_sendfilev, nomod_zero);
+ NO_UNLOAD_STUB(sockfs, sostream_direct, nomod_zero);
END_MODULE(sockfs);
#endif
@@ -415,12 +416,6 @@ stubs_base:
END_MODULE(spdsock);
#endif
-#ifndef UDP_MODULE
- MODULE(udp,drv);
- WSTUB(udp, udp_compute_checksum, nomod_zero);
- END_MODULE(udp);
-#endif
-
#ifndef NATTYMOD_MODULE
MODULE(nattymod, strmod);
WSTUB(nattymod, nattymod_clean_ipif, nomod_zero);