Diffstat (limited to 'usr/src')
63 files changed, 7606 insertions, 3547 deletions
diff --git a/usr/src/cmd/cmd-inet/usr.bin/pppd/sys-solaris.c b/usr/src/cmd/cmd-inet/usr.bin/pppd/sys-solaris.c
index 9d11ac72d8..0b1ce1eafd 100644
--- a/usr/src/cmd/cmd-inet/usr.bin/pppd/sys-solaris.c
+++ b/usr/src/cmd/cmd-inet/usr.bin/pppd/sys-solaris.c
@@ -76,6 +76,7 @@
 #include <netinet/in.h>
 #include <sys/tihdr.h>
 #include <inet/mib2.h>
+#include <inet/ip.h>
 #include <sys/ethernet.h>
 #include <sys/ser_sync.h>
@@ -92,27 +93,6 @@
 static const char rcsid[] = RCSID;
 #endif
 
-/* Need to use UDP for ifconfig compatibility */
-#if !defined(UDP_DEV_NAME)
-#define UDP_DEV_NAME "/dev/udp"
-#endif /* UDP_DEV_NAME */
-
-#if !defined(IP_DEV_NAME)
-#define IP_DEV_NAME "/dev/ip"
-#endif /* IP_DEV_NAME */
-
-#if !defined(UDP6_DEV_NAME)
-#define UDP6_DEV_NAME "/dev/udp6"
-#endif /* UDP6_DEV_NAME */
-
-#if !defined(IP6_DEV_NAME)
-#define IP6_DEV_NAME "/dev/ip6"
-#endif /* IP6_DEV_NAME */
-
-#if !defined(IP_MOD_NAME)
-#define IP_MOD_NAME "ip"
-#endif /* IP_MOD_NAME */
-
 #define PPPSTRTIMOUT 1 /* Timeout in seconds for ioctl */
 #define MAX_POLLFDS 32
 #define NMODULES 32
diff --git a/usr/src/cmd/cmd-inet/usr.lib/ncaconfd/ncaconfd.c b/usr/src/cmd/cmd-inet/usr.lib/ncaconfd/ncaconfd.c
index be2461b276..06972f53dc 100644
--- a/usr/src/cmd/cmd-inet/usr.lib/ncaconfd/ncaconfd.c
+++ b/usr/src/cmd/cmd-inet/usr.lib/ncaconfd/ncaconfd.c
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
@@ -55,18 +55,6 @@
 #include "ncaconf.h"
 
 /* NCA does not support IPv6... */
-#ifndef IP_DEV_NAME
-#define IP_DEV_NAME "/dev/ip"
-#endif
-
-#ifndef IP_MOD_NAME
-#define IP_MOD_NAME "ip"
-#endif
-
-#ifndef UDP_DEV_NAME
-#define UDP_DEV_NAME "/dev/udp"
-#endif
-
 #ifndef NCA_MOD_NAME
 #define NCA_MOD_NAME "nca"
 #endif
diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c
index aa2107f3f4..ee577669b0 100644
--- a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c
+++ b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c
@@ -18,6 +18,8 @@
 #include <sys/dlpi.h>
 #include <libdlpi.h>
+#include <inet/ip.h>
+
 #define LOOPBACK_IF "lo0"
 #define NONE_STR "none"
@@ -26,26 +28,6 @@
 #define ARP_MOD_NAME "arp"
 #endif
-#ifndef IP_DEV_NAME
-#define IP_DEV_NAME "/dev/ip"
-#endif
-
-#ifndef IP_MOD_NAME
-#define IP_MOD_NAME "ip"
-#endif
-
-#ifndef IP6_DEV_NAME
-#define IP6_DEV_NAME "/dev/ip6"
-#endif
-
-#ifndef UDP_DEV_NAME
-#define UDP_DEV_NAME "/dev/udp"
-#endif
-
-#ifndef UDP6_DEV_NAME
-#define UDP6_DEV_NAME "/dev/udp6"
-#endif
-
 #define ADDRBITS_V4 32 /* number of bits in IPv4 address */
 #define ADDRBITS_V6 128 /* number of bits in IPv6 address */
diff --git a/usr/src/cmd/mdb/common/modules/genunix/genunix.c b/usr/src/cmd/mdb/common/modules/genunix/genunix.c
index a36dfc8334..2a3b26ea8c 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/genunix.c
+++ b/usr/src/cmd/mdb/common/modules/genunix/genunix.c
@@ -3883,9 +3883,6 @@ static const mdb_walker_t walkers[] = {
 	mi_walk_init, mi_walk_step, mi_walk_fini, NULL },
 	{ "sonode", "given a sonode, walk its children",
 	sonode_walk_init, sonode_walk_step, sonode_walk_fini, NULL },
-	{ "udp", "walk UDP connections using MI",
-	mi_payload_walk_init, mi_payload_walk_step,
-	mi_payload_walk_fini, &mi_udp_arg },
 
 	/* from nvpair.c */
 	{ NVPAIR_WALKER_NAME, NVPAIR_WALKER_DESCR,
diff --git a/usr/src/cmd/mdb/common/modules/genunix/net.c b/usr/src/cmd/mdb/common/modules/genunix/net.c
index 0b6d826491..209b207bd3 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/net.c +++ b/usr/src/cmd/mdb/common/modules/genunix/net.c @@ -107,7 +107,8 @@ net_tcp_ipv6(const tcp_t *tcp) static int net_udp_active(const udp_t *udp) { - return ((udp->udp_state != TS_UNBND) && (udp->udp_state != TS_IDLE)); + return ((udp->udp_state == TS_IDLE) || + (udp->udp_state == TS_DATA_XFER)); } static int @@ -355,11 +356,6 @@ mi_payload_walk_fini(mdb_walk_state_t *wsp) delete_mi_payload_walk_data(wsp->walk_data, arg->mi_pwa_size); } -const mi_payload_walk_arg_t mi_udp_arg = { - "udp", "udp_g_head", sizeof (udp_t), - MI_PAYLOAD_DEVICE | MI_PAYLOAD_MODULE -}; - const mi_payload_walk_arg_t mi_ar_arg = { "arp", "ar_g_head", sizeof (ar_t), MI_PAYLOAD_DEVICE | MI_PAYLOAD_MODULE @@ -595,7 +591,7 @@ netstat_tcp_cb(uintptr_t kaddr, const void *walk_data, void *cb_data, int af) tcp = (tcp_t *)((uintptr_t)connp + (tcp_kaddr - kaddr)); if ((uintptr_t)tcp < (uintptr_t)connp || - (uintptr_t)&tcp->tcp_connp > (uintptr_t)connp + itc_size || + (uintptr_t)(tcp + 1) > (uintptr_t)connp + itc_size || (uintptr_t)tcp->tcp_connp != kaddr) { mdb_warn("conn_tcp %p is invalid", tcp_kaddr); return (WALK_NEXT); @@ -603,7 +599,7 @@ netstat_tcp_cb(uintptr_t kaddr, const void *walk_data, void *cb_data, int af) connp->conn_tcp = tcp; tcp->tcp_connp = connp; - if (!(opts & NETSTAT_ALL || net_tcp_active(tcp)) || + if (!((opts & NETSTAT_ALL) || net_tcp_active(tcp)) || (af == AF_INET && !net_tcp_ipv4(tcp)) || (af == AF_INET6 && !net_tcp_ipv6(tcp))) { return (WALK_NEXT); @@ -639,45 +635,57 @@ netstat_tcpv6_cb(uintptr_t kaddr, const void *walk_data, void *cb_data) return (netstat_tcp_cb(kaddr, walk_data, cb_data, AF_INET6)); } +/*ARGSUSED*/ static int -netstat_udpv4_cb(uintptr_t kaddr, const void *walk_data, void *cb_data) +netstat_udp_cb(uintptr_t kaddr, const void *walk_data, void *cb_data, int af) { - const udp_t *udp = walk_data; const uintptr_t opts = (uintptr_t)cb_data; + udp_t udp; + conn_t connp; + + if (mdb_vread(&udp, sizeof (udp_t), kaddr) == -1) { + mdb_warn("failed to read udp at %p", kaddr); + return (WALK_ERR); + } - if (!((opts & NETSTAT_ALL || net_udp_active(udp)) && net_udp_ipv4(udp))) + if (mdb_vread(&connp, sizeof (conn_t), + (uintptr_t)udp.udp_connp) == -1) { + mdb_warn("failed to read udp_connp at %p", + (uintptr_t)udp.udp_connp); + return (WALK_ERR); + } + + if (!((opts & NETSTAT_ALL) || net_udp_active(&udp)) || + (af == AF_INET && !net_udp_ipv4(&udp)) || + (af == AF_INET6 && !net_udp_ipv6(&udp))) { return (WALK_NEXT); + } - mdb_printf("%0?p %2i ", kaddr, udp->udp_state); - net_ipv4addrport_pr(&udp->udp_v6src, udp->udp_port); - mdb_printf(" "); - net_ipv4addrport_pr(&udp->udp_v6dst, udp->udp_dstport); - mdb_printf(" %4i\n", udp->udp_zoneid); + mdb_printf("%0?p %2i ", kaddr, udp.udp_state); + if (af == AF_INET) { + net_ipv4addrport_pr(&udp.udp_v6src, udp.udp_port); + mdb_printf(" "); + net_ipv4addrport_pr(&udp.udp_v6dst, udp.udp_dstport); + } else if (af == AF_INET6) { + net_ipv6addrport_pr(&udp.udp_v6src, udp.udp_port); + mdb_printf(" "); + net_ipv6addrport_pr(&udp.udp_v6dst, udp.udp_dstport); + } + mdb_printf(" %4i\n", connp.conn_zoneid); return (WALK_NEXT); } static int -netstat_udpv6_cb(uintptr_t kaddr, const void *walk_data, void *cb_data) +netstat_udpv4_cb(uintptr_t kaddr, const void *walk_data, void *cb_data) { - const udp_t *udp = walk_data; - const uintptr_t opts = (uintptr_t)cb_data; - - if (!((opts & NETSTAT_ALL || net_udp_active(udp)) && net_udp_ipv6(udp))) - return (WALK_NEXT); - - mdb_printf("%0?p %2i ", kaddr, udp->udp_state); - 
net_ipv6addrport_pr(&udp->udp_v6src, udp->udp_port); - mdb_printf(" "); - - /* Remote */ - if (udp->udp_state == TS_DATA_XFER) - net_ipv6addrport_pr(&udp->udp_v6dst, udp->udp_dstport); - else - mdb_printf("%*s.0 ", ADDR_V6_WIDTH, "0:0:0:0:0:0:0:0"); - mdb_printf(" %4i\n", udp->udp_zoneid); + return (netstat_udp_cb(kaddr, walk_data, cb_data, AF_INET)); +} - return (WALK_NEXT); +static int +netstat_udpv6_cb(uintptr_t kaddr, const void *walk_data, void *cb_data) +{ + return (netstat_udp_cb(kaddr, walk_data, cb_data, AF_INET6)); } /* @@ -855,7 +863,7 @@ netstat(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) "UDPv4", ADDR_V4_WIDTH, "Local Address", ADDR_V4_WIDTH, "Remote Address", "Zone"); - if (mdb_walk("genunix`udp", netstat_udpv4_cb, + if (mdb_walk("udp_cache", netstat_udpv4_cb, (void *)(uintptr_t)opts) == -1) { mdb_warn("failed to walk genunix`udp"); return (DCMD_ERR); @@ -870,12 +878,11 @@ netstat(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) "UDPv6", ADDR_V6_WIDTH, "Local Address", ADDR_V6_WIDTH, "Remote Address", "Zone"); - if (mdb_walk("genunix`udp", netstat_udpv6_cb, + if (mdb_walk("udp_cache", netstat_udpv6_cb, (void *)(uintptr_t)opts) == -1) { mdb_warn("failed to walk genunix`udp"); return (DCMD_ERR); } - } } diff --git a/usr/src/cmd/mdb/common/modules/genunix/net.h b/usr/src/cmd/mdb/common/modules/genunix/net.h index 04bf278638..45e03a5352 100644 --- a/usr/src/cmd/mdb/common/modules/genunix/net.h +++ b/usr/src/cmd/mdb/common/modules/genunix/net.h @@ -20,7 +20,7 @@ * CDDL HEADER END */ /* - * Copyright 2000, 2003 Sun Microsystems, Inc. All rights reserved. + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -33,7 +33,6 @@ extern "C" { #endif -extern struct mi_payload_walk_arg_s mi_udp_arg; extern struct mi_payload_walk_arg_s mi_ar_arg; extern struct mi_payload_walk_arg_s mi_icmp_arg; extern struct mi_payload_walk_arg_s mi_ill_arg; diff --git a/usr/src/cmd/rcm_daemon/common/ip_rcm.c b/usr/src/cmd/rcm_daemon/common/ip_rcm.c index 95ed823a74..38de5ef96c 100644 --- a/usr/src/cmd/rcm_daemon/common/ip_rcm.c +++ b/usr/src/cmd/rcm_daemon/common/ip_rcm.c @@ -54,6 +54,7 @@ #include <libdevinfo.h> #include <sys/systeminfo.h> #include <netdb.h> +#include <inet/ip.h> #include <ipmp_mpathd.h> #include "rcm_module.h" @@ -70,12 +71,7 @@ /* Some generic well-knowns and defaults used in this module */ #define SLASH_DEV "/dev" /* /dev directory */ -#define IP_DEV_NAME "/dev/ip" /* IPV4 ip device */ -#define IP6_DEV_NAME "/dev/ip6" /* IPV6 ip device */ -#define IP_MOD_NAME "ip" /* ip module */ #define ARP_MOD_NAME "arp" /* arp module */ -#define UDP_DEV_NAME "/dev/udp" /* IPv4 udp device */ -#define UDP6_DEV_NAME "/dev/udp6" /* IPv6 udp device */ #define IP_MAX_MODS 9 /* max modules pushed on intr */ #define MAX_RECONFIG_SIZE 1024 /* Max. 
reconfig string size */ diff --git a/usr/src/pkgdefs/etc/exception_list_i386 b/usr/src/pkgdefs/etc/exception_list_i386 index 5d6fa5d32e..b8c682c9ed 100644 --- a/usr/src/pkgdefs/etc/exception_list_i386 +++ b/usr/src/pkgdefs/etc/exception_list_i386 @@ -347,6 +347,8 @@ usr/lib/llib-lipsecutil.ln i386 usr/include/inet/arp_impl.h i386 usr/include/inet/rawip_impl.h i386 usr/include/inet/udp_impl.h i386 +usr/include/inet/tcp_impl.h i386 +usr/include/inet/ip_impl.h i386 usr/include/inet/ip_ndp.h i386 usr/include/inet/ipdrop.h i386 usr/include/inet/tun.h i386 diff --git a/usr/src/pkgdefs/etc/exception_list_sparc b/usr/src/pkgdefs/etc/exception_list_sparc index 74e0ee0273..8f1f4a40e3 100644 --- a/usr/src/pkgdefs/etc/exception_list_sparc +++ b/usr/src/pkgdefs/etc/exception_list_sparc @@ -336,6 +336,8 @@ usr/share/lib/locale/com/sun/dhcpmgr/cli/pntadm/ResourceBundle.properties sparc usr/include/inet/arp_impl.h sparc usr/include/inet/rawip_impl.h sparc usr/include/inet/udp_impl.h sparc +usr/include/inet/tcp_impl.h sparc +usr/include/inet/ip_impl.h sparc usr/include/inet/ip_ndp.h sparc usr/include/inet/ipdrop.h sparc usr/include/inet/tun.h sparc diff --git a/usr/src/tools/scripts/bfu.sh b/usr/src/tools/scripts/bfu.sh index c4aa388b40..5a102f0bea 100644 --- a/usr/src/tools/scripts/bfu.sh +++ b/usr/src/tools/scripts/bfu.sh @@ -2002,11 +2002,10 @@ if [[ ! -f $usr/lib/dns/libdns.so ]] && ! $ZCAT $cpiodir/generic.usr$ZFIX | \ fi update_script="/ws/onnv-gate/public/bin/update_ce" -if [ "$plat" = "SUNW,Sun-Fire-15000" ] && ifconfig -a | egrep '^ce' \ - >/dev/null 2>/dev/null; then - # Sun Fire 12K/15K/20K/25K requires CE version 1.146 or later. +if ifconfig -a | egrep '^ce' >/dev/null 2>/dev/null; then + # CE version 1.148 or later is required cever=`modinfo | grep 'CE Ethernet' | sed 's/.*v1\.//' | tr -d ')' | \ - nawk '{ if ($1 < 146) print "BAD"; else print $1 }'` + nawk '{ if ($1 < 148) print "BAD"; else print $1 }'` if [ "$cever" = "BAD" ]; then fail "You must run $update_script to upgrade your ce driver." 
fi diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index 69a29625d4..8daf858a7c 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -416,13 +416,9 @@ ICMP6_OBJS += icmp6ddi.o RTS_OBJS += rtsddi.o rts.o rts_opt_data.o -IP_OBJS += igmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o ip6_rts.o \ - ip_cksum.o ip_if.o ip_ire.o ip_listutils.o ip_mroute.o \ - ip_multi.o ip_ndp.o ip_opt_data.o ip_rts.o ip_srcid.o \ - ipddi.o ipdrop.o mi.o nd.o optcom.o snmpcom.o ipsec_loader.o \ - spd.o ipclassifier.o inet_common.o ip_squeue.o tcp.o \ - tcp_trace.o tcp_opt_data.o tcp_sack.o squeue.o ip_sadb.o \ - sctp_crc32.o sctp.o sctp_opt_data.o sctp_output.o \ +IP_TCP_OBJS = tcp.o tcp_trace.o tcp_opt_data.o tcp_sack.o tcp_fusion.o +IP_UDP_OBJS = udp.o udp_opt_data.o +IP_SCTP_OBJS = sctp_crc32.o sctp.o sctp_opt_data.o sctp_output.o \ sctp_init.o sctp_input.o sctp_cookie.o \ sctp_conn.o sctp_error.o sctp_snmp.o \ sctp_param.o sctp_shutdown.o sctp_common.o \ @@ -430,6 +426,16 @@ IP_OBJS += igmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o ip6_rts.o \ sctp_ioc.o sctp_bind.o sctp_notify.o sctp_asconf.o \ sctp_addr.o +IP_OBJS += igmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o ip6_rts.o \ + ip_cksum.o ip_if.o ip_ire.o ip_listutils.o ip_mroute.o \ + ip_multi.o ip_ndp.o ip_opt_data.o ip_rts.o ip_srcid.o \ + ipddi.o ipdrop.o mi.o nd.o optcom.o snmpcom.o ipsec_loader.o \ + spd.o ipclassifier.o inet_common.o ip_squeue.o squeue.o \ + ip_sadb.o \ + $(IP_TCP_OBJS) \ + $(IP_UDP_OBJS) \ + $(IP_SCTP_OBJS) + IP6_OBJS += ip6ddi.o KEYSOCK_OBJS += keysockddi.o keysock.o keysock_opt_data.o @@ -467,7 +473,7 @@ ATUN_OBJS += atun.o 6TO4TUN_OBJS += 6to4tun.o -UDP_OBJS += udpddi.o udp.o udp_opt_data.o +UDP_OBJS += udpddi.o UDP6_OBJS += udp6ddi.o diff --git a/usr/src/uts/common/fs/sockfs/sockstr.c b/usr/src/uts/common/fs/sockfs/sockstr.c index 6b934d9f0a..6c148d71b6 100644 --- a/usr/src/uts/common/fs/sockfs/sockstr.c +++ b/usr/src/uts/common/fs/sockfs/sockstr.c @@ -137,21 +137,23 @@ so_sock2stream(struct sonode *so) ASSERT(so->so_version != SOV_STREAM); - /* tell the transport below that sockmod is being popped */ - if ((so->so_state & SS_TCP_FAST_ACCEPT) != 0) { - int rval; - mblk_t **mpp; + if (so->so_state & SS_DIRECT) { + mblk_t **mpp; + int rval; + /* + * Tell the transport below that sockmod is being popped + */ mutex_exit(&so->so_lock); - error = strioctl(vp, SIOCPOPSOCKFS, NULL, 0, K_TO_K, CRED(), + error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K, CRED(), &rval); mutex_enter(&so->so_lock); if (error != 0) { - dprintso(so, 0, - ("so_sock2stream(%p): SIOCPOPSOCKFS failed\n", so)); + dprintso(so, 0, ("so_sock2stream(%p): " + "_SIOCSOCKFALLBACK failed\n", so)); goto exit; } - so->so_state &= ~SS_TCP_FAST_ACCEPT; + so->so_state &= ~SS_DIRECT; for (mpp = &so->so_conn_ind_head; (mp = *mpp) != NULL; mpp = &mp->b_next) { @@ -412,7 +414,7 @@ so_strinit(struct sonode *so, struct sonode *tso) /* the following do_tcapability may update so->so_mode */ if ((tso->so_serv_type != T_CLTS) && - ((so->so_state & SS_TCP_FAST_ACCEPT) == 0)) { + !(so->so_state & SS_DIRECT)) { error = do_tcapability(so, TC1_ACCEPTOR_ID); if (error) return (error); diff --git a/usr/src/uts/common/fs/sockfs/socktpi.c b/usr/src/uts/common/fs/sockfs/socktpi.c index 7b895f99a7..6a5e48464e 100644 --- a/usr/src/uts/common/fs/sockfs/socktpi.c +++ b/usr/src/uts/common/fs/sockfs/socktpi.c @@ -57,6 +57,7 @@ #include <sys/socket.h> #include <sys/socketvar.h> +#include <sys/sockio.h> #include <netinet/in.h> #include 
<sys/un.h> #include <sys/strsun.h> @@ -72,6 +73,7 @@ #include <inet/ip.h> #include <inet/ip6.h> #include <inet/tcp.h> +#include <inet/udp_impl.h> #include <fs/sockfs/nl7c.h> #include <sys/zone.h> @@ -185,6 +187,10 @@ static int sotpi_sendmsg(struct sonode *, struct nmsghdr *, struct uio *); static int sotpi_shutdown(struct sonode *, int); static int sotpi_getsockname(struct sonode *); +static int sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t, + struct uio *, void *, t_uscalar_t, int); +static int sodgram_direct(struct sonode *, struct sockaddr *, + socklen_t, struct uio *, int); sonodeops_t sotpi_sonodeops = { sotpi_accept, /* sop_accept */ @@ -222,16 +228,40 @@ sotpi_create(vnode_t *accessvp, int domain, int type, int protocol, int version, so = VTOSO(vp); flags = FREAD|FWRITE; - if (tso != NULL) { - if ((tso->so_state & (SS_TCP_FAST_ACCEPT)) != 0) { - flags |= SO_ACCEPTOR|SO_SOCKSTR; - so->so_state |= SS_TCP_FAST_ACCEPT; - } - } else { - if ((so->so_type == SOCK_STREAM) && - (so->so_family == AF_INET || so->so_family == AF_INET6)) { - flags |= SO_SOCKSTR; - so->so_state |= SS_TCP_FAST_ACCEPT; + + if ((type == SOCK_STREAM || type == SOCK_DGRAM) && + (domain == AF_INET || domain == AF_INET6) && + (protocol == IPPROTO_TCP || protocol == IPPROTO_UDP || + protocol == IPPROTO_IP)) { + /* Tell tcp or udp that it's talking to sockets */ + flags |= SO_SOCKSTR; + + /* + * Here we indicate to socktpi_open() our attempt to + * make direct calls between sockfs and transport. + * The final decision is left to socktpi_open(). + */ + so->so_state |= SS_DIRECT; + + ASSERT(so->so_type != SOCK_DGRAM || tso == NULL); + if (so->so_type == SOCK_STREAM && tso != NULL) { + if (tso->so_state & SS_DIRECT) { + /* + * Inherit SS_DIRECT from listener and pass + * SO_ACCEPTOR open flag to tcp, indicating + * that this is an accept fast-path instance. + */ + flags |= SO_ACCEPTOR; + } else { + /* + * SS_DIRECT is not set on listener, meaning + * that the listener has been converted from + * a socket to a stream. Ensure that the + * acceptor inherits these settings. + */ + so->so_state &= ~SS_DIRECT; + flags &= ~SO_SOCKSTR; + } } } @@ -1052,7 +1082,7 @@ done: } /* bind the socket */ -int +static int sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen, int flags) { @@ -1372,7 +1402,7 @@ again: case AF_INET: case AF_INET6: if ((optlen == sizeof (intptr_t)) && - ((so->so_state & SS_TCP_FAST_ACCEPT) != 0)) { + ((so->so_state & SS_DIRECT) != 0)) { bcopy(mp->b_rptr + conn_ind->OPT_offset, &opt, conn_ind->OPT_length); } else { @@ -1385,7 +1415,19 @@ again: * problems when sockfs sends a normal T_CONN_RES * message down the new stream. */ - so->so_state &= ~SS_TCP_FAST_ACCEPT; + if (so->so_state & SS_DIRECT) { + int rval; + /* + * For consistency we inform tcp to disable + * direct interface on the listener, though + * we can certainly live without doing this + * because no data will ever travel upstream + * on the listening socket. + */ + so->so_state &= ~SS_DIRECT; + (void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK, + 0, 0, K_TO_K, CRED(), &rval); + } opt = NULL; optlen = 0; } @@ -1554,9 +1596,10 @@ again: if (nso->so_options & SO_LINGER) nso->so_linger = so->so_linger; - if ((so->so_state & SS_TCP_FAST_ACCEPT) != 0) { + if ((so->so_state & SS_DIRECT) != 0) { mblk_t *ack_mp; + ASSERT(nso->so_state & SS_DIRECT); ASSERT(opt != NULL); conn_res->OPT_length = optlen; @@ -3308,13 +3351,8 @@ err: * Assumes caller has verified that SS_ISBOUND etc. are set. 
*/ static int -sosend_dgramcmsg(struct sonode *so, - struct sockaddr *name, - t_uscalar_t namelen, - struct uio *uiop, - void *control, - t_uscalar_t controllen, - int flags) +sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen, + struct uio *uiop, void *control, t_uscalar_t controllen, int flags) { struct T_unitdata_req tudr; mblk_t *mp; @@ -3636,11 +3674,8 @@ sosend_svccmsg(struct sonode *so, * name and the source address is passed as an option. */ int -sosend_dgram(struct sonode *so, - struct sockaddr *name, - socklen_t namelen, - struct uio *uiop, - int flags) +sosend_dgram(struct sonode *so, struct sockaddr *name, socklen_t namelen, + struct uio *uiop, int flags) { struct T_unitdata_req tudr; mblk_t *mp; @@ -3651,7 +3686,7 @@ sosend_dgram(struct sonode *so, socklen_t srclen; ssize_t len; - ASSERT(name && namelen); + ASSERT(name != NULL && namelen != 0); len = uiop->uio_resid; if (len > so->so_tidu_size) { @@ -3659,14 +3694,14 @@ sosend_dgram(struct sonode *so, goto done; } - /* - * Length and family checks. - */ + /* Length and family checks */ error = so_addr_verify(so, name, namelen); - if (error) { - eprintsoline(so, error); + if (error != 0) goto done; - } + + if (so->so_state & SS_DIRECT) + return (sodgram_direct(so, name, namelen, uiop, flags)); + if (so->so_family == AF_UNIX) { if (so->so_state & SS_FADDR_NOXLATE) { /* @@ -4061,8 +4096,7 @@ sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) if (msg->msg_controllen != 0) { if (!(so_mode & SM_CONNREQUIRED)) { error = sosend_dgramcmsg(so, name, namelen, uiop, - msg->msg_control, msg->msg_controllen, - flags); + msg->msg_control, msg->msg_controllen, flags); } else { if (flags & MSG_OOB) { /* Can't generate T_EXDATA_REQ with options */ @@ -4080,7 +4114,7 @@ sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) if (!(so_mode & SM_CONNREQUIRED)) { /* * If there is no SO_DONTROUTE to turn off return immediately - * from sosend_dgram. This can allow tail-call optimizations. + * from send_dgram. This can allow tail-call optimizations. */ if (!dontroute) { return (sosend_dgram(so, name, namelen, uiop, flags)); @@ -4104,13 +4138,16 @@ sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop) dprintso(so, 1, ("sotpi_sendmsg: write\n")); /* - * If there is no SO_DONTROUTE to turn off - * return immediately from strwrite. This can - * allow tail-call optimizations. + * If there is no SO_DONTROUTE to turn off, + * SS_DIRECT is on, and there is no flow + * control, we can take the fast path. */ - if (!dontroute) - return (strwrite(SOTOV(so), uiop, - CRED())); + if (!dontroute && + (so_state & SS_DIRECT) && + canputnext(SOTOV(so)->v_stream->sd_wrq)) { + return (sostream_direct(so, uiop, + NULL, CRED())); + } error = strwrite(SOTOV(so), uiop, CRED()); goto done; } @@ -4140,6 +4177,206 @@ done: } /* + * Sending data on a datagram socket. + * Assumes caller has verified that SS_ISBOUND etc. are set. 
+ */ +/* ARGSUSED */ +static int +sodgram_direct(struct sonode *so, struct sockaddr *name, + socklen_t namelen, struct uio *uiop, int flags) +{ + struct T_unitdata_req tudr; + mblk_t *mp; + int error = 0; + void *addr; + socklen_t addrlen; + ssize_t len; + struct stdata *stp = SOTOV(so)->v_stream; + int so_state; + queue_t *udp_wq; + + ASSERT(name != NULL && namelen != 0); + ASSERT(!(so->so_mode & SM_CONNREQUIRED)); + ASSERT(!(so->so_mode & SM_EXDATA)); + ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6); + ASSERT(SOTOV(so)->v_type == VSOCK); + + /* Caller checked for proper length */ + len = uiop->uio_resid; + ASSERT(len <= so->so_tidu_size); + + /* Length and family checks have been done by caller */ + ASSERT(name->sa_family == so->so_family); + ASSERT(so->so_family == AF_INET || + (namelen == (socklen_t)sizeof (struct sockaddr_in6))); + ASSERT(so->so_family == AF_INET6 || + (namelen == (socklen_t)sizeof (struct sockaddr_in))); + + addr = name; + addrlen = namelen; + + if (stp->sd_sidp != NULL && + (error = straccess(stp, JCWRITE)) != 0) + goto done; + + so_state = so->so_state; + + /* + * For UDP we don't break up the copyin into smaller pieces + * as in the TCP case. That means if ENOMEM is returned by + * mcopyinuio() then the uio vector has not been modified at + * all and we fallback to either strwrite() or kstrputmsg() + * below. Note also that we never generate priority messages + * from here. + */ + udp_wq = stp->sd_wrq->q_next; + if (canput(udp_wq) && + (mp = mcopyinuio(stp, uiop, -1, -1, &error)) != NULL) { + ASSERT(DB_TYPE(mp) == M_DATA); + ASSERT(uiop->uio_resid == 0); +#ifdef C2_AUDIT + if (audit_active) + audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0); +#endif /* C2_AUDIT */ + udp_wput_data(udp_wq, mp, addr, addrlen); + return (0); + } + if (error != 0 && error != ENOMEM) + return (error); + + /* + * For connected, let strwrite() handle the blocking case. + * Otherwise we fall thru and use kstrputmsg(). + */ + if (so_state & SS_ISCONNECTED) + return (strwrite(SOTOV(so), uiop, CRED())); + + tudr.PRIM_type = T_UNITDATA_REQ; + tudr.DEST_length = addrlen; + tudr.DEST_offset = (t_scalar_t)sizeof (tudr); + tudr.OPT_length = 0; + tudr.OPT_offset = 0; + + mp = soallocproto2(&tudr, sizeof (tudr), addr, addrlen, 0, _ALLOC_INTR); + if (mp == NULL) { + /* + * Caught a signal waiting for memory. + * Let send* return EINTR. + */ + error = EINTR; + goto done; + } + +#ifdef C2_AUDIT + if (audit_active) + audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0); +#endif /* C2_AUDIT */ + + error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0); +done: +#ifdef SOCK_DEBUG + if (error != 0) { + eprintsoline(so, error); + } +#endif /* SOCK_DEBUG */ + return (error); +} + +int +sostream_direct(struct sonode *so, struct uio *uiop, mblk_t *mp, cred_t *cr) +{ + struct stdata *stp = SOTOV(so)->v_stream; + ssize_t iosize, rmax, maxblk; + queue_t *tcp_wq = stp->sd_wrq->q_next; + int error = 0, wflag = 0; + + ASSERT(so->so_mode & SM_BYTESTREAM); + ASSERT(SOTOV(so)->v_type == VSOCK); + + if (stp->sd_sidp != NULL && + (error = straccess(stp, JCWRITE)) != 0) + return (error); + + if (uiop == NULL) { + /* + * kstrwritemp() should have checked sd_flag and + * flow-control before coming here. If we end up + * here it means that we can simply pass down the + * data to tcp. 
+ */ + ASSERT(mp != NULL); + tcp_wput(tcp_wq, mp); + return (0); + } + + /* Fallback to strwrite() to do proper error handling */ + if (stp->sd_flag & (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY)) + return (strwrite(SOTOV(so), uiop, cr)); + + rmax = stp->sd_qn_maxpsz; + ASSERT(rmax >= 0 || rmax == INFPSZ); + if (rmax == 0 || uiop->uio_resid <= 0) + return (0); + + if (rmax == INFPSZ) + rmax = uiop->uio_resid; + + maxblk = stp->sd_maxblk; + + for (;;) { + iosize = MIN(uiop->uio_resid, rmax); + + mp = mcopyinuio(stp, uiop, iosize, maxblk, &error); + if (mp == NULL) { + /* + * Fallback to strwrite() for ENOMEM; if this + * is our first time in this routine and the uio + * vector has not been modified, we will end up + * calling strwrite() without any flag set. + */ + if (error == ENOMEM) + goto slow_send; + else + return (error); + } + ASSERT(uiop->uio_resid >= 0); + /* + * If mp is non-NULL and ENOMEM is set, it means that + * mcopyinuio() was able to break down some of the user + * data into one or more mblks. Send the partial data + * to tcp and let the rest be handled in strwrite(). + */ + ASSERT(error == 0 || error == ENOMEM); + tcp_wput(tcp_wq, mp); + + wflag |= NOINTR; + + if (uiop->uio_resid == 0) { /* No more data; we're done */ + ASSERT(error == 0); + break; + } else if (error == ENOMEM || !canput(tcp_wq) || (stp->sd_flag & + (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))) { +slow_send: + /* + * We were able to send down partial data using + * the direct call interface, but are now relying + * on strwrite() to handle the non-fastpath cases. + * If the socket is blocking we will sleep in + * strwaitq() until write is permitted, otherwise, + * we will need to return the amount of bytes + * written so far back to the app. This is the + * reason why we pass NOINTR flag to strwrite() + * for non-blocking socket, because we don't want + * to return EAGAIN when portion of the user data + * has actually been sent down. + */ + return (strwrite_common(SOTOV(so), uiop, cr, wflag)); + } + } + return (0); +} + +/* * Update so_faddr by asking the transport (unless AF_UNIX). */ int diff --git a/usr/src/uts/common/fs/sockfs/sockvnops.c b/usr/src/uts/common/fs/sockfs/sockvnops.c index b783dc77ac..817e9b5968 100644 --- a/usr/src/uts/common/fs/sockfs/sockvnops.c +++ b/usr/src/uts/common/fs/sockfs/sockvnops.c @@ -53,6 +53,7 @@ #include <sys/stropts.h> #include <sys/stream.h> #include <sys/strsubr.h> +#include <sys/strsun.h> #include <sys/suntpi.h> #include <sys/ioctl.h> #include <sys/sockio.h> @@ -87,6 +88,9 @@ #include <fs/sockfs/nl7c.h> +#include <inet/udp_impl.h> +#include <inet/tcp_impl.h> + static int socktpi_close(struct vnode *, int, int, offset_t, struct cred *); static int socktpi_read(struct vnode *, struct uio *, int, struct cred *, struct caller_context *); @@ -140,6 +144,15 @@ const fs_operation_def_t socknca_vnodeops_template[] = { }; /* + * Do direct function call to the transport layer below; this would + * also allow the transport to utilize read-side synchronous stream + * interface if necessary. This is a /etc/system tunable that must + * not be modified on a running system. By default this is enabled + * for performance reasons and may be disabled for debugging purposes. + */ +boolean_t socktpi_direct = B_TRUE; + +/* * Open routine used by socket() call. Note that vn_open checks for * VSOCK and fails the open (and VOP_OPEN is fs_nosys). 
The VSOCK check is * needed since VSOCK type vnodes exist in various underlying filesystems as @@ -205,6 +218,56 @@ socktpi_open(struct vnode **vpp, int flag, struct cred *cr) ASSERT(stp->sd_wrq != NULL); so->so_provinfo = tpi_findprov(stp->sd_wrq); + + /* + * If caller is interested in doing direct function call + * interface to/from transport module, probe the module + * directly beneath the streamhead to see if it qualifies. + * + * We turn off direct interface when qualifications fail; + * note that we do these checks for everything other than + * the tcp acceptor case, because the acceptor inherits + * the capabilities of the listener and we've already done + * the checks against the listening socket. + */ + if (!(flag & SO_ACCEPTOR) && (so->so_state & SS_DIRECT)) { + queue_t *tq = stp->sd_wrq->q_next; + + /* + * SS_DIRECT is currently supported and tested + * only for tcp/udp; this is the main reason to + * have the following assertions. + */ + ASSERT(so->so_family == AF_INET || + so->so_family == AF_INET6); + ASSERT(so->so_protocol == IPPROTO_UDP || + so->so_protocol == IPPROTO_TCP || + so->so_protocol == IPPROTO_IP); + ASSERT(so->so_type == SOCK_DGRAM || + so->so_type == SOCK_STREAM); + + /* + * Abort direct call interface if the module directly + * underneath the stream head is not defined with the + * _D_DIRECT flag. This could happen in the tcp or + * udp case, when some other module is autopushed + * above it, or for some reasons the expected module + * isn't purely D_MP (which is the main requirement). + */ + if (!socktpi_direct || !(tq->q_flag & _QDIRECT) || + !(_OTHERQ(tq)->q_flag & _QDIRECT)) { + int rval; + + /* Continue on without direct calls */ + so->so_state &= ~SS_DIRECT; + if ((error = strioctl(vp, _SIOCSOCKFALLBACK, + 0, 0, K_TO_K, CRED(), &rval)) != 0) { + (void) socktpi_close(vp, flag, 1, + (offset_t)0, cr); + return (error); + } + } + } } else { /* * While the same socket can not be reopened (unlike specfs) @@ -436,6 +499,11 @@ socktpi_write( /* Give NL7C some data */ nl7c_data(so, uiop); } + + if ((so_state & SS_DIRECT) && + canputnext(vp->v_stream->sd_wrq)) { + return (sostream_direct(so, uiop, NULL, cr)); + } return (strwrite(vp, uiop, cr)); } else { /* Send T_DATA_REQ messages without MORE_flag set */ @@ -631,7 +699,7 @@ socktpi_ioctl(struct vnode *vp, int cmd, intptr_t arg, int mode, case I_SENDFD: case I_RECVFD: case I_ATMARK: - case SIOCPOPSOCKFS: + case _SIOCSOCKFALLBACK: /* * These ioctls do not apply to sockets. I_FDINSERT can be * used to send M_PROTO messages without modifying the socket @@ -639,8 +707,9 @@ socktpi_ioctl(struct vnode *vp, int cmd, intptr_t arg, int mode, * descriptor passing since they assume a twisted stream. * SIOCATMARK must be used instead of I_ATMARK. * - * SIOCPOPSOCKFS from an application should never be - * processed. It is always generated in response to I_POP. + * _SIOCSOCKFALLBACK from an application should never be + * processed. It is only generated by socktpi_open() or + * in response to I_POP or I_PUSH. */ #ifdef DEBUG cmn_err(CE_WARN, "Unsupported STREAMS ioctl 0x%x on socket. 
" @@ -724,6 +793,24 @@ socktpi_plumbioctl(struct vnode *vp, int cmd, intptr_t arg, int mode, switch (cmd) { case I_PUSH: + if (so->so_state & SS_DIRECT) { + mutex_enter(&so->so_lock); + so_lock_single(so); + mutex_exit(&so->so_lock); + + error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K, + CRED(), rvalp); + + mutex_enter(&so->so_lock); + if (error == 0) + so->so_state &= ~SS_DIRECT; + so_unlock_single(so, SOLOCKED); + mutex_exit(&so->so_lock); + + if (error != 0) + return (error); + } + error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp); if (error == 0) so->so_pushcnt++; diff --git a/usr/src/uts/common/inet/Makefile b/usr/src/uts/common/inet/Makefile index f43759686a..88afebe947 100644 --- a/usr/src/uts/common/inet/Makefile +++ b/usr/src/uts/common/inet/Makefile @@ -34,7 +34,7 @@ HDRS= arp.h common.h ipclassifier.h ip.h ip6.h ipdrop.h ipsecah.h ipsecesp.h \ ipsec_info.h ip6_asp.h ip_if.h ip_ire.h ip_multi.h ip_ndp.h ip_rts.h \ ipsec_impl.h keysock.h led.h mi.h mib2.h nd.h optcom.h sadb.h \ sctp_itf.h snmpcom.h tcp.h tcp_sack.h tun.h udp_impl.h arp_impl.h \ - rawip_impl.h ipp_common.h + rawip_impl.h ipp_common.h ip_impl.h tcp_impl.h ROOTDIRS= $(ROOT)/usr/include/inet diff --git a/usr/src/uts/common/inet/arp/arp.c b/usr/src/uts/common/inet/arp/arp.c index 59dbbd6808..d4889559e2 100644 --- a/usr/src/uts/common/inet/arp/arp.c +++ b/usr/src/uts/common/inet/arp/arp.c @@ -194,7 +194,6 @@ static int ar_entry_add(queue_t *q, mblk_t *mp); static int ar_entry_delete(queue_t *q, mblk_t *mp); static int ar_entry_query(queue_t *q, mblk_t *mp); static int ar_entry_squery(queue_t *q, mblk_t *mp); -static void ar_freemsg(mblk_t *mp); static int ar_interface_up(queue_t *q, mblk_t *mp); static int ar_interface_down(queue_t *q, mblk_t *mp); static int ar_interface_on(queue_t *q, mblk_t *mp); @@ -1231,7 +1230,7 @@ ar_cmd_done(arl_t *arl) ar_ip->ar_arl_ip_assoc = ar_arl; } } - ar_freemsg(mp); + inet_freemsg(mp); } /* @@ -1745,19 +1744,6 @@ ar_entry_squery(queue_t *q, mblk_t *mp_orig) return (0); } -/* Make sure b_next and b_prev are null and then free the message */ -static void -ar_freemsg(mblk_t *mp) -{ - mblk_t *mp1; - - for (mp1 = mp; mp1; mp1 = mp1->b_cont) { - mp1->b_prev = mp1->b_next = NULL; - mp1->b_queue = NULL; - } - freemsg(mp); -} - /* Process an interface down causing us to detach and unbind. */ /* ARGSUSED */ static int @@ -1936,7 +1922,7 @@ ar_ll_cleanup_arl_queue(queue_t *q) BUMP_IRE_STATS(ire_stats_v4, ire_stats_freed); } - ar_freemsg(mp); + inet_freemsg(mp); } else { prev = mp; } @@ -2587,7 +2573,7 @@ ar_query_delete(ace_t *ace, uchar_t *ar) *(uint32_t *)mp->b_rptr == AR_ENTRY_QUERY) { BUMP_IRE_STATS(ire_stats_v4, ire_stats_freed); } - ar_freemsg(mp); + inet_freemsg(mp); } else { mpp = &mp->b_next; } @@ -2657,7 +2643,7 @@ ar_query_reply(ace_t *ace, int ret_val, uchar_t *proto_addr, } else { if (ret_val != 0) { /* TODO: find some way to let the guy know? */ - ar_freemsg(mp); + inet_freemsg(mp); BUMP_IRE_STATS(ire_stats_v4, ire_stats_freed); continue; } @@ -2849,7 +2835,7 @@ ar_rput(queue_t *q, mblk_t *mp) "arp_rput_end: q %p (%S)", q, "proto"); return; default: - ar_freemsg(mp); + inet_freemsg(mp); return; } if ((mp->b_wptr - mp->b_rptr) < sizeof (dl_unitdata_ind_t) || diff --git a/usr/src/uts/common/inet/common.h b/usr/src/uts/common/inet/common.h index 63c630718b..5ac15b3c4e 100644 --- a/usr/src/uts/common/inet/common.h +++ b/usr/src/uts/common/inet/common.h @@ -20,7 +20,7 @@ * CDDL HEADER END */ /* - * Copyright 1992-2001, 2003 Sun Microsystems, Inc. All rights reserved. 
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -97,13 +97,13 @@ typedef void (*pfv_t)(); #define INET_MAXMINOR MAXMIN /* maximum device minor number */ #ifdef _KERNEL +#include <sys/stream.h> -extern void inet_init(void); -extern void inet_destroy(void); extern void *inet_minor_create(char *, dev_t, int); extern void inet_minor_destroy(void *); extern dev_t inet_minor_alloc(void *); extern void inet_minor_free(void *, dev_t); +extern void inet_freemsg(mblk_t *); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/inet/inet_common.c b/usr/src/uts/common/inet/inet_common.c index 0900852a64..e55abc6c01 100644 --- a/usr/src/uts/common/inet/inet_common.c +++ b/usr/src/uts/common/inet/inet_common.c @@ -20,7 +20,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -103,3 +103,21 @@ inet_minor_free(void *a, dev_t dev) ASSERT((dev != OPENFAIL) && (dev != 0) && (dev <= inet_maxminor)); vmem_free(((inet_arena_t *)a)->ineta_arena, (void *)dev, 1); } + +/* + * This function is used to free a message that has gone through + * mi_copyin processing which modifies the M_IOCTL mblk's b_next + * and b_prev pointers. We use this function to set b_next/b_prev + * to NULL and free them. + */ +void +inet_freemsg(mblk_t *mp) +{ + mblk_t *bp = mp; + + for (; bp != NULL; bp = bp->b_cont) { + bp->b_prev = NULL; + bp->b_next = NULL; + } + freemsg(mp); +} diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h index 9caf225c41..23e3069934 100644 --- a/usr/src/uts/common/inet/ip.h +++ b/usr/src/uts/common/inet/ip.h @@ -52,6 +52,7 @@ extern "C" { #include <sys/vmem.h> #include <sys/squeue.h> #include <sys/systm.h> +#include <sys/multidata.h> #ifdef DEBUG #define ILL_DEBUG @@ -67,7 +68,19 @@ extern "C" { * of flags. */ #define IP_DEVMTFLAGS D_MP -#endif +#endif /* _KERNEL */ + +#define IP_MOD_NAME "ip" +#define IP_DEV_NAME "/dev/ip" +#define IP6_DEV_NAME "/dev/ip6" + +#define UDP_MOD_NAME "udp" +#define UDP_DEV_NAME "/dev/udp" +#define UDP6_DEV_NAME "/dev/udp6" + +#define TCP_MOD_NAME "tcp" +#define TCP_DEV_NAME "/dev/tcp" +#define TCP6_DEV_NAME "/dev/tcp6" /* Minor numbers */ #define IPV4_MINOR 0 @@ -101,8 +114,6 @@ typedef uint32_t ipaddr_t; #define ILL_FRAG_HASH_TBL_COUNT ((unsigned int)64) #define ILL_FRAG_HASH_TBL_SIZE (ILL_FRAG_HASH_TBL_COUNT * sizeof (ipfb_t)) -#define IP_DEV_NAME "/dev/ip" -#define IP_MOD_NAME "ip" #define IPV4_ADDR_LEN 4 #define IP_ADDR_LEN IPV4_ADDR_LEN #define IP_ARP_PROTO_TYPE 0x0800 @@ -236,6 +247,7 @@ typedef struct ipoptp_s #define Q_TO_CONN(q) ((conn_t *)(q)->q_ptr) #define Q_TO_TCP(q) (Q_TO_CONN((q))->conn_tcp) +#define Q_TO_UDP(q) (Q_TO_CONN((q))->conn_udp) /* * The following two macros are used by IP to get the appropriate @@ -244,13 +256,10 @@ typedef struct ipoptp_s * from a conn directly if it knows that the conn is not TCP. */ #define CONNP_TO_WQ(connp) \ - (((connp)->conn_tcp == NULL) ? (connp)->conn_wq : \ - (connp)->conn_tcp->tcp_wq) + (IPCL_IS_TCP(connp) ? 
(connp)->conn_tcp->tcp_wq : (connp)->conn_wq) #define CONNP_TO_RQ(connp) RD(CONNP_TO_WQ(connp)) -#define IS_TCP_CONN(connp) (((connp)->conn_flags & IPCL_TCP) != 0) - #define GRAB_CONN_LOCK(q) { \ if (q != NULL && CONN_Q(q)) \ mutex_enter(&(Q_TO_CONN(q))->conn_lock); \ @@ -302,9 +311,8 @@ typedef struct ipoptp_s */ #define IP6_NO_IPPOLICY 0x800 /* Don't do IPQoS processing */ #define IP6_IN_LLMCAST 0x1000 /* Multicast */ -#define IP6_IN_NOCKSUM 0x2000 /* Don't compute checksum */ -#define IP_FF_LOOPBACK 0x4000 /* Loopback fanout */ +#define IP_FF_LOOPBACK 0x2000 /* Loopback fanout */ #ifndef IRE_DB_TYPE #define IRE_DB_TYPE M_SIG @@ -357,6 +365,8 @@ typedef struct ipf_s { uint_t ipf_prev_nexthdr_offset; /* Offset for nexthdr value */ uint8_t ipf_ecn; /* ECN info for the fragments */ uint8_t ipf_num_dups; /* Number of times dup frags recvd */ + uint16_t ipf_checksum_flags; /* Hardware checksum flags */ + uint32_t ipf_checksum; /* Partial checksum of fragment data */ } ipf_t; #define ipf_src V4_PART_OF_V6(ipf_v6src) @@ -623,9 +633,10 @@ typedef struct ip_m_s { * depends on the atomic 32 bit access to that field. */ #define CONN_CLOSING 0x01 /* ip_close waiting for ip_wsrv */ -#define CONN_IPSEC_LOAD_WAIT 0x10 /* waiting for load */ -#define CONN_CONDEMNED 0x40 /* conn is closing, no more refs */ -#define CONN_INCIPIENT 0x80 /* conn not yet visible, no refs */ +#define CONN_IPSEC_LOAD_WAIT 0x02 /* waiting for load */ +#define CONN_CONDEMNED 0x04 /* conn is closing, no more refs */ +#define CONN_INCIPIENT 0x08 /* conn not yet visible, no refs */ +#define CONN_QUIESCED 0x10 /* conn is now quiescent */ /* * Parameter to ip_output giving the identity of the caller. @@ -2593,6 +2604,7 @@ extern ipparam_t *ip_param_arr; extern int ip_g_forward; extern int ipv6_forward; +extern vmem_t *ip_minor_arena; #define ip_respond_to_address_mask_broadcast ip_param_arr[0].ip_param_value #define ip_g_send_redirects ip_param_arr[5].ip_param_value @@ -2697,18 +2709,11 @@ extern uint32_t ipsechw_debug; #define ip1dbg(a) if (ip_debug > 2) printf a #define ip2dbg(a) if (ip_debug > 3) printf a #define ip3dbg(a) if (ip_debug > 4) printf a - -#define ipcsumdbg(a, b) \ - if (ip_debug == 1) \ - prom_printf(a); \ - else if (ip_debug > 1) \ - { prom_printf("%smp=%p\n", a, (void *)b); } #else #define ip0dbg(a) /* */ #define ip1dbg(a) /* */ #define ip2dbg(a) /* */ #define ip3dbg(a) /* */ -#define ipcsumdbg(a, b) /* */ #endif /* IP_DEBUG */ extern const char *dlpi_prim_str(int); @@ -2717,7 +2722,6 @@ extern void ill_frag_timer(void *); extern ill_t *ill_first(int, int, ill_walk_context_t *); extern ill_t *ill_next(ill_walk_context_t *, ill_t *); extern void ill_frag_timer_start(ill_t *); -extern void ip_ioctl_freemsg(mblk_t *); extern mblk_t *ip_carve_mp(mblk_t **, ssize_t); extern mblk_t *ip_dlpi_alloc(size_t, t_uscalar_t); extern char *ip_dot_addr(ipaddr_t, char *); @@ -2749,6 +2753,9 @@ extern void ip_input(ill_t *, ill_rx_ring_t *, mblk_t *, size_t); extern void ip_rput_dlpi(queue_t *, mblk_t *); extern void ip_rput_forward(ire_t *, ipha_t *, mblk_t *, ill_t *); extern void ip_rput_forward_multicast(ipaddr_t, mblk_t *, ipif_t *); + +extern int ip_snmpmod_close(queue_t *); +extern void ip_snmpmod_wput(queue_t *, mblk_t *); extern void ip_udp_input(queue_t *, mblk_t *, ipha_t *, ire_t *, ill_t *); extern void ip_proto_input(queue_t *, mblk_t *, ipha_t *, ire_t *, ill_t *); extern void ip_rput_other(ipsq_t *, queue_t *, mblk_t *, void *); @@ -2821,6 +2828,7 @@ extern int ipsec_req_from_conn(conn_t *, ipsec_req_t *, int); 
extern int ip_snmp_get(queue_t *q, mblk_t *mctl); extern int ip_snmp_set(queue_t *q, int, int, uchar_t *, int); extern void ip_process_ioctl(ipsq_t *, queue_t *, mblk_t *, void *); +extern void ip_quiesce_conn(conn_t *); extern void ip_reprocess_ioctl(ipsq_t *, queue_t *, mblk_t *, void *); extern void ip_restart_optmgmt(ipsq_t *, queue_t *, mblk_t *, void *); extern void ip_ioctl_finish(queue_t *, mblk_t *, int, int, ipif_t *, @@ -2842,6 +2850,7 @@ extern boolean_t ip_md_hcksum_attr(struct multidata_s *, struct pdesc_s *, uint32_t, uint32_t, uint32_t, uint32_t); extern boolean_t ip_md_zcopy_attr(struct multidata_s *, struct pdesc_s *, uint_t); +extern mblk_t *ip_unbind(queue_t *, mblk_t *); /* Hooks for CGTP (multirt routes) filtering module */ #define CGTP_FILTER_REV_1 1 @@ -2925,17 +2934,6 @@ struct ill_mdt_capab_s { uint_t ill_mdt_span_limit; /* maximum payload span per packet */ }; -/* - * ioctl identifier and structure for Multidata Transmit update - * private M_CTL communication from IP to ULP. - */ -#define MDT_IOC_INFO_UPDATE (('M' << 8) + 1020) - -typedef struct ip_mdt_info_s { - uint_t mdt_info_id; /* MDT_IOC_INFO_UPDATE */ - ill_mdt_capab_t mdt_capab; /* ILL MDT capabilities */ -} ip_mdt_info_t; - struct ill_hcksum_capab_s { uint_t ill_hcksum_version; /* interface version */ uint_t ill_hcksum_txflags; /* capabilities on transmit */ @@ -2991,35 +2989,6 @@ struct ill_poll_capab_s { }; /* - * Macro that determines whether or not a given ILL is allowed for MDT. - */ -#define ILL_MDT_USABLE(ill) \ - ((ill->ill_capabilities & ILL_CAPAB_MDT) != 0 && \ - ill->ill_mdt_capab != NULL && \ - ill->ill_mdt_capab->ill_mdt_version == MDT_VERSION_2 && \ - ill->ill_mdt_capab->ill_mdt_on != 0) - -/* - * Macro that determines whether or not a given CONN may be considered - * for fast path prior to proceeding further with Multidata. - */ -#define CONN_IS_MD_FASTPATH(connp) \ - ((connp)->conn_dontroute == 0 && /* SO_DONTROUTE */ \ - (connp)->conn_nofailover_ill == NULL && /* IPIF_NOFAILOVER */ \ - (connp)->conn_xmit_if_ill == NULL && /* IP_XMIT_IF */ \ - (connp)->conn_outgoing_pill == NULL && /* IP{V6}_BOUND_PIF */ \ - (connp)->conn_outgoing_ill == NULL) /* IP{V6}_BOUND_IF */ - -/* - * Macro that determines whether or not a given IPC requires - * outbound IPSEC processing. - */ -#define CONN_IPSEC_OUT_ENCAPSULATED(connp) \ - ((connp)->conn_out_enforce_policy || \ - ((connp)->conn_latch != NULL && \ - (connp)->conn_latch->ipl_out_policy != NULL)) - -/* * IP squeues exports */ extern int ip_squeue_profile; @@ -3049,12 +3018,15 @@ extern void ip_squeue_get_pkts(squeue_t *); extern int ip_squeue_bind_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); extern int ip_squeue_bind_get(queue_t *, mblk_t *, caddr_t, cred_t *); extern void ip_squeue_clean(void *, mblk_t *, void *); +extern void ip_resume_tcp_bind(void *, mblk_t *, void *); + +extern void tcp_wput(queue_t *, mblk_t *); -extern void ip_resume_tcp_bind(void *, mblk_t *mp, void *); extern int ip_fill_mtuinfo(struct in6_addr *, in_port_t, struct ip6_mtuinfo *); +extern ipif_t *conn_get_held_ipif(conn_t *, ipif_t **, int *); -typedef void (*ipsq_func_t)(ipsq_t *, queue_t *, mblk_t *, void *); +typedef void (*ipsq_func_t)(ipsq_t *, queue_t *, mblk_t *, void *); /* * Squeue tags. 
Tags only need to be unique when the callback function is the @@ -3091,6 +3063,11 @@ typedef void (*ipsq_func_t)(ipsq_t *, queue_t *, mblk_t *, void *); #define SQTAG_TCP_WPUT_OTHER 28 #define SQTAG_TCP_CONN_REQ_UNBOUND 29 #define SQTAG_TCP_SEND_PENDING 30 +#define SQTAG_BIND_RETRY 31 +#define SQTAG_UDP_FANOUT 32 +#define SQTAG_UDP_INPUT 33 +#define SQTAG_UDP_WPUT 34 +#define SQTAG_UDP_OUTPUT 35 #endif /* _KERNEL */ diff --git a/usr/src/uts/common/inet/ip/igmp.c b/usr/src/uts/common/inet/ip/igmp.c index e09aed5736..435f085d24 100644 --- a/usr/src/uts/common/inet/ip/igmp.c +++ b/usr/src/uts/common/inet/ip/igmp.c @@ -1925,6 +1925,8 @@ igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr) igmpa->igmpa_group = ilm->ilm_addr; igmpa->igmpa_cksum = 0; igmpa->igmpa_cksum = IP_CSUM(mp, hdrlen, 0); + if (igmpa->igmpa_cksum == 0) + igmpa->igmpa_cksum = 0xffff; rtralert[0] = IPOPT_COPY & IPOPT_RTRALERT; rtralert[1] = RTRALERT_LEN; diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c index 89a5fdfaf8..a988b67cbb 100644 --- a/usr/src/uts/common/inet/ip/ip.c +++ b/usr/src/uts/common/inet/ip/ip.c @@ -75,9 +75,11 @@ #include <netinet/sctp.h> #include <inet/ip.h> +#include <inet/ip_impl.h> #include <inet/ip6.h> #include <inet/ip6_asp.h> #include <inet/tcp.h> +#include <inet/tcp_impl.h> #include <inet/ip_multi.h> #include <inet/ip_if.h> #include <inet/ip_ire.h> @@ -110,6 +112,7 @@ #include <inet/ipclassifier.h> #include <inet/sctp_ip.h> +#include <inet/udp_impl.h> /* * Values for squeue switch: @@ -122,7 +125,8 @@ squeue_func_t ip_input_proc; /* * IP statistics. */ -#define IP_STAT(x) (ip_statistics.x.value.ui64++) +#define IP_STAT(x) (ip_statistics.x.value.ui64++) +#define IP_STAT_UPDATE(x, n) (ip_statistics.x.value.ui64 += (n)) typedef struct ip_stat { kstat_named_t ipsec_fanout_proto; @@ -158,42 +162,68 @@ typedef struct ip_stat { kstat_named_t ip_ire_redirect_timer_expired; kstat_named_t ip_ire_pmtu_timer_expired; kstat_named_t ip_input_multi_squeue; + kstat_named_t ip_tcp_in_full_hw_cksum_err; + kstat_named_t ip_tcp_in_part_hw_cksum_err; + kstat_named_t ip_tcp_in_sw_cksum_err; + kstat_named_t ip_tcp_out_sw_cksum_bytes; + kstat_named_t ip_udp_in_full_hw_cksum_err; + kstat_named_t ip_udp_in_part_hw_cksum_err; + kstat_named_t ip_udp_in_sw_cksum_err; + kstat_named_t ip_udp_out_sw_cksum_bytes; + kstat_named_t ip_frag_mdt_pkt_out; + kstat_named_t ip_frag_mdt_discarded; + kstat_named_t ip_frag_mdt_allocfail; + kstat_named_t ip_frag_mdt_addpdescfail; + kstat_named_t ip_frag_mdt_allocd; } ip_stat_t; static ip_stat_t ip_statistics = { - { "ipsec_fanout_proto", KSTAT_DATA_UINT64 }, - { "ip_udp_fannorm", KSTAT_DATA_UINT64 }, - { "ip_udp_fanmb", KSTAT_DATA_UINT64 }, - { "ip_udp_fanothers", KSTAT_DATA_UINT64 }, - { "ip_udp_fast_path", KSTAT_DATA_UINT64 }, - { "ip_udp_slow_path", KSTAT_DATA_UINT64 }, - { "ip_udp_input_err", KSTAT_DATA_UINT64 }, - { "ip_tcppullup", KSTAT_DATA_UINT64 }, - { "ip_tcpoptions", KSTAT_DATA_UINT64 }, - { "ip_multipkttcp", KSTAT_DATA_UINT64 }, - { "ip_tcp_fast_path", KSTAT_DATA_UINT64 }, - { "ip_tcp_slow_path", KSTAT_DATA_UINT64 }, - { "ip_tcp_input_error", KSTAT_DATA_UINT64 }, - { "ip_db_ref", KSTAT_DATA_UINT64 }, - { "ip_notaligned1", KSTAT_DATA_UINT64 }, - { "ip_notaligned2", KSTAT_DATA_UINT64 }, - { "ip_multimblk3", KSTAT_DATA_UINT64 }, - { "ip_multimblk4", KSTAT_DATA_UINT64 }, - { "ip_ipoptions", KSTAT_DATA_UINT64 }, - { "ip_classify_fail", KSTAT_DATA_UINT64 }, - { "ip_opt", KSTAT_DATA_UINT64 }, - { "ip_udp_rput_local", KSTAT_DATA_UINT64 }, - { 
"ipsec_proto_ahesp", KSTAT_DATA_UINT64 }, - { "ip_conn_flputbq", KSTAT_DATA_UINT64 }, - { "ip_conn_walk_drain", KSTAT_DATA_UINT64 }, - { "ip_out_sw_cksum", KSTAT_DATA_UINT64 }, - { "ip_in_sw_cksum", KSTAT_DATA_UINT64 }, - { "ip_trash_ire_reclaim_calls", KSTAT_DATA_UINT64 }, + { "ipsec_fanout_proto", KSTAT_DATA_UINT64 }, + { "ip_udp_fannorm", KSTAT_DATA_UINT64 }, + { "ip_udp_fanmb", KSTAT_DATA_UINT64 }, + { "ip_udp_fanothers", KSTAT_DATA_UINT64 }, + { "ip_udp_fast_path", KSTAT_DATA_UINT64 }, + { "ip_udp_slow_path", KSTAT_DATA_UINT64 }, + { "ip_udp_input_err", KSTAT_DATA_UINT64 }, + { "ip_tcppullup", KSTAT_DATA_UINT64 }, + { "ip_tcpoptions", KSTAT_DATA_UINT64 }, + { "ip_multipkttcp", KSTAT_DATA_UINT64 }, + { "ip_tcp_fast_path", KSTAT_DATA_UINT64 }, + { "ip_tcp_slow_path", KSTAT_DATA_UINT64 }, + { "ip_tcp_input_error", KSTAT_DATA_UINT64 }, + { "ip_db_ref", KSTAT_DATA_UINT64 }, + { "ip_notaligned1", KSTAT_DATA_UINT64 }, + { "ip_notaligned2", KSTAT_DATA_UINT64 }, + { "ip_multimblk3", KSTAT_DATA_UINT64 }, + { "ip_multimblk4", KSTAT_DATA_UINT64 }, + { "ip_ipoptions", KSTAT_DATA_UINT64 }, + { "ip_classify_fail", KSTAT_DATA_UINT64 }, + { "ip_opt", KSTAT_DATA_UINT64 }, + { "ip_udp_rput_local", KSTAT_DATA_UINT64 }, + { "ipsec_proto_ahesp", KSTAT_DATA_UINT64 }, + { "ip_conn_flputbq", KSTAT_DATA_UINT64 }, + { "ip_conn_walk_drain", KSTAT_DATA_UINT64 }, + { "ip_out_sw_cksum", KSTAT_DATA_UINT64 }, + { "ip_in_sw_cksum", KSTAT_DATA_UINT64 }, + { "ip_trash_ire_reclaim_calls", KSTAT_DATA_UINT64 }, { "ip_trash_ire_reclaim_success", KSTAT_DATA_UINT64 }, - { "ip_ire_arp_timer_expired", KSTAT_DATA_UINT64 }, + { "ip_ire_arp_timer_expired", KSTAT_DATA_UINT64 }, { "ip_ire_redirect_timer_expired", KSTAT_DATA_UINT64 }, - { "ip_ire_pmtu_timer_expired", KSTAT_DATA_UINT64 }, - { "ip_input_multi_squeue", KSTAT_DATA_UINT64 }, + { "ip_ire_pmtu_timer_expired", KSTAT_DATA_UINT64 }, + { "ip_input_multi_squeue", KSTAT_DATA_UINT64 }, + { "ip_tcp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 }, + { "ip_tcp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 }, + { "ip_tcp_in_sw_cksum_err", KSTAT_DATA_UINT64 }, + { "ip_tcp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, + { "ip_udp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 }, + { "ip_udp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 }, + { "ip_udp_in_sw_cksum_err", KSTAT_DATA_UINT64 }, + { "ip_udp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, + { "ip_frag_mdt_pkt_out", KSTAT_DATA_UINT64 }, + { "ip_frag_mdt_discarded", KSTAT_DATA_UINT64 }, + { "ip_frag_mdt_allocfail", KSTAT_DATA_UINT64 }, + { "ip_frag_mdt_addpdescfail", KSTAT_DATA_UINT64 }, + { "ip_frag_mdt_allocd", KSTAT_DATA_UINT64 }, }; static kstat_t *ip_kstat; @@ -591,28 +621,12 @@ uint_t ip_max_frag_dups = 10; /* RFC1122 Conformance */ #define IP_FORWARD_DEFAULT IP_FORWARD_NEVER -#ifdef _BIG_ENDIAN -#define IP_HDR_CSUM_TTL_ADJUST 256 -#define IP_TCP_CSUM_COMP IPPROTO_TCP -#define IP_UDP_CSUM_COMP IPPROTO_UDP -#else -#define IP_HDR_CSUM_TTL_ADJUST 1 -#define IP_TCP_CSUM_COMP (IPPROTO_TCP << 8) -#define IP_UDP_CSUM_COMP (IPPROTO_UDP << 8) -#endif - -#define TCP_CHECKSUM_OFFSET 16 -#define UDP_CHECKSUM_OFFSET 6 - #define ILL_MAX_NAMELEN LIFNAMSIZ -#define UDPH_SIZE 8 - /* Leave room for ip_newroute to tack on the src and target addresses */ #define OK_RESOLVER_MP(mp) \ ((mp) && ((mp)->b_wptr - (mp)->b_rptr) >= (2 * IP_ADDR_LEN)) -static ipif_t *conn_get_held_ipif(conn_t *, ipif_t **, int *); static int conn_set_held_ipif(conn_t *, ipif_t **, ipif_t *); static mblk_t *ip_wput_attach_llhdr(mblk_t *, ire_t *, ip_proc_t, uint32_t); @@ -668,6 +682,8 @@ static int 
ip_rput_forward_options(mblk_t *, ipha_t *, ire_t *); static boolean_t ip_rput_local_options(queue_t *, mblk_t *, ipha_t *, ire_t *); static int ip_rput_options(queue_t *, mblk_t *, ipha_t *, ipaddr_t *); +static boolean_t ip_rput_fragment(queue_t *, mblk_t **, ipha_t *, uint32_t *, + uint16_t *); int ip_snmp_get(queue_t *, mblk_t *); static mblk_t *ip_snmp_get_mib2_ip(queue_t *, mblk_t *); static mblk_t *ip_snmp_get_mib2_ip6(queue_t *, mblk_t *); @@ -692,7 +708,6 @@ int ip_snmp_set(queue_t *, int, int, uchar_t *, int); static boolean_t ip_source_routed(ipha_t *); static boolean_t ip_source_route_included(ipha_t *); -static void ip_unbind(queue_t *, mblk_t *); static void ip_wput_frag(ire_t *, mblk_t *, ip_pkt_t, uint32_t, uint32_t); static mblk_t *ip_wput_frag_copyhdr(uchar_t *, int, int); static void ip_wput_local_options(ipha_t *); @@ -767,6 +782,15 @@ uint_t icmp_pkt_err_sent = 0; /* Number of packets sent in burst */ time_t ip_g_frag_timeout = IP_FRAG_TIMEOUT; clock_t ip_g_frag_timo_ms = IP_FRAG_TIMEOUT * 1000; +/* + * Threshold which determines whether MDT should be used when + * generating IP fragments; payload size must be greater than + * this threshold for MDT to take place. + */ +#define IP_WPUT_FRAG_MDT_MIN 32768 + +int ip_wput_frag_mdt_min = IP_WPUT_FRAG_MDT_MIN; + /* Protected by ip_mi_lock */ static void *ip_g_head; /* Instance Data List Head */ kmutex_t ip_mi_lock; /* Lock for list of instances */ @@ -1431,7 +1455,7 @@ static ipha_t icmp_ipha = { }; struct module_info ip_mod_info = { - 5701, "ip", 1, INFPSZ, 65536, 1024 + IP_MOD_ID, IP_MOD_NAME, 1, INFPSZ, 65536, 1024 }; static struct qinit rinit = { @@ -1930,6 +1954,8 @@ icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill, /* Send out an ICMP packet */ icmph->icmph_checksum = 0; icmph->icmph_checksum = IP_CSUM(mp, iph_hdr_length, 0); + if (icmph->icmph_checksum == 0) + icmph->icmph_checksum = 0xFFFF; if (broadcast || CLASSD(ipha->ipha_dst)) { ipif_t *ipif_chosen; /* @@ -3204,6 +3230,8 @@ icmp_pkt(queue_t *q, mblk_t *mp, void *stuff, size_t len, bcopy(stuff, icmph, len); icmph->icmph_checksum = 0; icmph->icmph_checksum = IP_CSUM(mp, (int32_t)sizeof (ipha_t), 0); + if (icmph->icmph_checksum == 0) + icmph->icmph_checksum = 0xFFFF; BUMP_MIB(&icmp_mib, icmpOutMsgs); put(q, ipsec_mp); } @@ -3704,7 +3732,7 @@ ip_bind_v4(queue_t *q, mblk_t *mp, conn_t *connp) ASSERT(!connp->conn_af_isv6); connp->conn_pkt_isv6 = B_FALSE; - len = mp->b_wptr - mp->b_rptr; + len = MBLKL(mp); if (len < (sizeof (*tbr) + 1)) { (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, "ip_bind: bogus msg, len %ld", len); @@ -3716,7 +3744,7 @@ ip_bind_v4(queue_t *q, mblk_t *mp, conn_t *connp) protocol = *mp->b_wptr & 0xFF; tbr = (struct T_bind_req *)mp->b_rptr; /* Reset the message type in preparation for shipping it back. 
*/ - mp->b_datap->db_type = M_PCPROTO; + DB_TYPE(mp) = M_PCPROTO; connp->conn_ulp = (uint8_t)protocol; @@ -3762,8 +3790,8 @@ ip_bind_v4(queue_t *q, mblk_t *mp, conn_t *connp) */ mp1 = mp->b_cont; - ire_requested = (mp1 && mp1->b_datap->db_type == IRE_DB_REQ_TYPE); - ipsec_policy_set = (mp1 && mp1->b_datap->db_type == IPSEC_POLICY_SET); + ire_requested = (mp1 != NULL && DB_TYPE(mp1) == IRE_DB_REQ_TYPE); + ipsec_policy_set = (mp1 != NULL && DB_TYPE(mp1) == IPSEC_POLICY_SET); switch (tbr->ADDR_length) { default: @@ -4169,7 +4197,7 @@ ip_bind_connected(conn_t *connp, mblk_t *mp, ipaddr_t *src_addrp, if (ip_multidata_outbound && !ipsec_policy_set && dst_ire != NULL && !(dst_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST)) && (md_ill = ire_to_ill(dst_ire), md_ill != NULL) && - (md_ill->ill_capabilities & ILL_CAPAB_MDT)) { + ILL_MDT_CAPABLE(md_ill)) { md_dst_ire = dst_ire; IRE_REFHOLD(md_dst_ire); } @@ -4689,43 +4717,19 @@ ip_modclose(ill_t *ill) } /* - * IP has been configured as _D_QNEXTLESS for the client side i.e the driver - * instance. This implies that - * 1. IP cannot access the read side q_next pointer directly - it must - * use routines like putnext and canputnext. - * 2. ip_close must ensure that all sources of messages being putnext upstream - * are gone before qprocsoff is called. - * - * #2 is handled by having ip_close do the ipcl_hash_remove and wait for - * conn_ref to drop to zero before calling qprocsoff. + * This is called as part of close() for both IP and UDP + * in order to quiesce the conn. */ - -/* ARGSUSED */ -int -ip_close(queue_t *q, int flags) +void +ip_quiesce_conn(conn_t *connp) { - conn_t *connp; boolean_t drain_cleanup_reqd = B_FALSE; boolean_t conn_ioctl_cleanup_reqd = B_FALSE; boolean_t ilg_cleanup_reqd = B_FALSE; - TRACE_1(TR_FAC_IP, TR_IP_CLOSE, "ip_close: q %p", q); + ASSERT(!IPCL_IS_TCP(connp)); /* - * Call the appropriate delete routine depending on whether this is - * a module or device. - */ - if (WR(q)->q_next != NULL) { - /* This is a module close */ - return (ip_modclose((ill_t *)q->q_ptr)); - } - - connp = Q_TO_CONN(q); - ASSERT(connp->conn_tcp == NULL); - - /* - * We are being closed as /dev/ip or /dev/ip6. - * * Mark the conn as closing, and this conn must not be * inserted in future into any list. Eg. conn_drain_insert(), * won't insert this conn into the conn_drain_list. @@ -4736,6 +4740,7 @@ ip_close(queue_t *q, int flags) * cannot get set henceforth. */ mutex_enter(&connp->conn_lock); + ASSERT(!(connp->conn_state_flags & CONN_QUIESCED)); connp->conn_state_flags |= CONN_CLOSING; if (connp->conn_idl != NULL) drain_cleanup_reqd = B_TRUE; @@ -4745,17 +4750,17 @@ ip_close(queue_t *q, int flags) ilg_cleanup_reqd = B_TRUE; mutex_exit(&connp->conn_lock); + if (IPCL_IS_UDP(connp)) + udp_quiesce_conn(connp); + if (conn_ioctl_cleanup_reqd) conn_ioctl_cleanup(connp); /* * Remove this conn from any fanout list it is on. - * Then wait until the number of pending putnexts from - * the fanout code drops to zero, before calling qprocsoff. - * This is the guarantee a QNEXTLESS driver provides to - * STREAMS, and is mentioned at the top of this function. + * and then wait for any threads currently operating + * on this endpoint to finish */ - ipcl_hash_remove(connp); /* @@ -4776,7 +4781,6 @@ ip_close(queue_t *q, int flags) conn_delete_ire(connp, NULL); - /* * Now conn refcnt can increase only thru CONN_INC_REF_LOCKED. 
* callers from write side can't be there now because close @@ -4787,7 +4791,29 @@ ip_close(queue_t *q, int flags) connp->conn_state_flags |= CONN_CONDEMNED; while (connp->conn_ref != 1) cv_wait(&connp->conn_cv, &connp->conn_lock); + connp->conn_state_flags |= CONN_QUIESCED; mutex_exit(&connp->conn_lock); +} + +/* ARGSUSED */ +int +ip_close(queue_t *q, int flags) +{ + conn_t *connp; + + TRACE_1(TR_FAC_IP, TR_IP_CLOSE, "ip_close: q %p", q); + + /* + * Call the appropriate delete routine depending on whether this is + * a module or device. + */ + if (WR(q)->q_next != NULL) { + /* This is a module close */ + return (ip_modclose((ill_t *)q->q_ptr)); + } + + connp = q->q_ptr; + ip_quiesce_conn(connp); qprocsoff(q); @@ -4801,6 +4827,15 @@ ip_close(queue_t *q, int flags) * has completed, and service has completed or won't run in * future. */ + ASSERT(connp->conn_ref == 1); + + /* + * A conn which was previously marked as IPCL_UDP cannot + * retain the flag because it would have been cleared by + * udp_close(). + */ + ASSERT(!IPCL_IS_UDP(connp)); + if (connp->conn_latch != NULL) { IPLATCH_REFRELE(connp->conn_latch); connp->conn_latch = NULL; @@ -4827,6 +4862,83 @@ ip_close(queue_t *q, int flags) return (0); } +int +ip_snmpmod_close(queue_t *q) +{ + conn_t *connp = Q_TO_CONN(q); + ASSERT(connp->conn_flags & (IPCL_TCPMOD | IPCL_UDPMOD)); + + qprocsoff(q); + + if (connp->conn_flags & IPCL_UDPMOD) + udp_close_free(connp); + + if (connp->conn_cred != NULL) { + crfree(connp->conn_cred); + connp->conn_cred = NULL; + } + CONN_DEC_REF(connp); + q->q_ptr = WR(q)->q_ptr = NULL; + return (0); +} + +/* + * Write side put procedure for TCP module or UDP module instance. TCP/UDP + * as a module is only used for MIB browsers that push TCP/UDP over IP or ARP. + * The only supported primitives are T_SVR4_OPTMGMT_REQ and T_OPTMGMT_REQ. + * M_FLUSH messages and ioctls are only passed downstream; we don't flush our + * queues as we never enqueue messages there and we don't handle any ioctls. + * Everything else is freed. + */ +void +ip_snmpmod_wput(queue_t *q, mblk_t *mp) +{ + conn_t *connp = q->q_ptr; + pfi_t setfn; + pfi_t getfn; + + ASSERT(connp->conn_flags & (IPCL_TCPMOD | IPCL_UDPMOD)); + + switch (DB_TYPE(mp)) { + case M_PROTO: + case M_PCPROTO: + if ((MBLKL(mp) >= sizeof (t_scalar_t)) && + ((((union T_primitives *)mp->b_rptr)->type == + T_SVR4_OPTMGMT_REQ) || + (((union T_primitives *)mp->b_rptr)->type == + T_OPTMGMT_REQ))) { + /* + * This is the only TPI primitive supported. Its + * handling does not require tcp_t, but it does require + * conn_t to check permissions. + */ + cred_t *cr = DB_CREDDEF(mp, connp->conn_cred); + + if (connp->conn_flags & IPCL_TCPMOD) { + setfn = tcp_snmp_set; + getfn = tcp_snmp_get; + } else { + setfn = udp_snmp_set; + getfn = udp_snmp_get; + } + if (!snmpcom_req(q, mp, setfn, getfn, cr)) { + freemsg(mp); + return; + } + } else if ((mp = mi_tpi_err_ack_alloc(mp, TPROTO, ENOTSUP)) + != NULL) + qreply(q, mp); + break; + case M_FLUSH: + case M_IOCTL: + putnext(q, mp); + break; + default: + freemsg(mp); + break; + } +} + /* Return the IP checksum for the IP header at "iph". */ uint16_t ip_csum_hdr(ipha_t *ipha) @@ -5081,7 +5193,7 @@ ip_dot_saddr(uchar_t *addr, char *buf) * Send an ICMP error after patching up the packet appropriately. Returns * non-zero if the appropriate MIB should be bumped; zero otherwise. 
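*/

/*
 * For reference, the TPI primitive test in ip_snmpmod_wput() above can
 * be read as the following condensed sketch; is_optmgmt_req() is a
 * hypothetical helper, not part of this change:
 */
static boolean_t
is_optmgmt_req(mblk_t *mp)
{
	t_scalar_t prim;

	/* The TPI primitive type is the leading t_scalar_t of the mblk */
	if (MBLKL(mp) < sizeof (t_scalar_t))
		return (B_FALSE);
	prim = ((union T_primitives *)mp->b_rptr)->type;
	return (prim == T_SVR4_OPTMGMT_REQ || prim == T_OPTMGMT_REQ);
}

/*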
*/ -static int +static boolean_t ip_fanout_send_icmp(queue_t *q, mblk_t *mp, uint_t flags, uint_t icmp_type, uint_t icmp_code, boolean_t mctl_present, zoneid_t zoneid) { @@ -5103,8 +5215,8 @@ ip_fanout_send_icmp(queue_t *q, mblk_t *mp, uint_t flags, * ipsec_check_global_policy() assumes M_DATA as clear * and M_CTL as secure. */ - db_type = mp->b_datap->db_type; - mp->b_datap->db_type = M_DATA; + db_type = DB_TYPE(mp); + DB_TYPE(mp) = M_DATA; secure = B_FALSE; } /* @@ -5119,17 +5231,17 @@ ip_fanout_send_icmp(queue_t *q, mblk_t *mp, uint_t flags, first_mp = ipsec_check_global_policy(first_mp, NULL, ipha, NULL, mctl_present); if (first_mp == NULL) - return (0); + return (B_FALSE); } if (!mctl_present) - mp->b_datap->db_type = db_type; + DB_TYPE(mp) = db_type; if (flags & IP_FF_SEND_ICMP) { if (flags & IP_FF_HDR_COMPLETE) { if (ip_hdr_complete(ipha, zoneid)) { freemsg(first_mp); - return (1); + return (B_TRUE); } } if (flags & IP_FF_CKSUM) { @@ -5152,10 +5264,10 @@ ip_fanout_send_icmp(queue_t *q, mblk_t *mp, uint_t flags, } } else { freemsg(first_mp); - return (0); + return (B_FALSE); } - return (1); + return (B_TRUE); } #ifdef DEBUG @@ -5592,7 +5704,7 @@ ip_fanout_tcp(queue_t *q, mblk_t *mp, ill_t *recv_ill, ipha_t *ipha, } mp->b_datap->db_struioflag |= STRUIO_EAGER; - mp->b_datap->db_cksumstart = (intptr_t)sqp; + DB_CKSUMSTART(mp) = (intptr_t)sqp; syn_present = B_TRUE; } } @@ -5720,7 +5832,6 @@ ip_fanout_udp_conn(conn_t *connp, mblk_t *first_mp, mblk_t *mp, boolean_t secure, ipha_t *ipha, uint_t flags, ill_t *recv_ill, boolean_t ip_policy) { - queue_t *rq = connp->conn_rq; boolean_t mctl_present = (first_mp != NULL); uint32_t in_flags = 0; /* set to IP_RECVSLLA and/or IP_RECVIF */ uint32_t ill_index; @@ -5730,7 +5841,7 @@ ip_fanout_udp_conn(conn_t *connp, mblk_t *first_mp, mblk_t *mp, else first_mp = mp; - if (!canputnext(rq)) { + if (CONN_UDP_FLOWCTLD(connp)) { BUMP_MIB(&ip_mib, udpInOverflows); freemsg(first_mp); return; @@ -5776,7 +5887,9 @@ ip_fanout_udp_conn(conn_t *connp, mblk_t *first_mp, mblk_t *mp, mp = ip_add_info(mp, recv_ill, in_flags); } BUMP_MIB(&ip_mib, ipInDelivers); - putnext(rq, mp); + + /* Send it upstream */ + CONN_UDP_RECV(connp, mp); } /* @@ -8454,7 +8567,6 @@ ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) return (ip_modopen(q, devp, flag, sflag, credp)); } - /* * We are opening as a device. This is an IP client stream, and we * allocate an conn_t as the instance data. @@ -8463,6 +8575,9 @@ ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) connp->conn_upq = q; q->q_ptr = WR(q)->q_ptr = connp; + if (flag & SO_SOCKSTR) + connp->conn_flags |= IPCL_SOCKET; + /* Minor tells us which /dev entry was opened */ if (geteminor(*devp) == IPV6_MINOR) { connp->conn_flags |= IPCL_ISV6; @@ -8474,9 +8589,7 @@ ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) connp->conn_pkt_isv6 = B_FALSE; } - - if ((connp->conn_dev = - inet_minor_alloc(ip_minor_arena)) == 0) { + if ((connp->conn_dev = inet_minor_alloc(ip_minor_arena)) == 0) { q->q_ptr = WR(q)->q_ptr = NULL; CONN_DEC_REF(connp); return (EBUSY); @@ -10734,381 +10847,455 @@ ip_udp_check(queue_t *q, conn_t *connp, ill_t *ill, ipha_t *ipha, } /* - * Do fragmentation reassembly. - * returns B_TRUE if successful else B_FALSE. + * Fragmentation reassembly. Each ILL has a hash table for + * queuing packets undergoing reassembly for all IPIFs + * associated with the ILL. The hash is based on the packet + * IP ident field. 
The ILL frag hash table was allocated + * as a timer block at the time the ILL was created. Whenever + * there is anything on the reassembly queue, the timer will + * be running. Returns B_TRUE if successful else B_FALSE; * frees mp on failure. */ static boolean_t -ip_rput_fragment(queue_t *q, mblk_t **mpp, ipha_t *ipha) +ip_rput_fragment(queue_t *q, mblk_t **mpp, ipha_t *ipha, + uint32_t *cksum_val, uint16_t *cksum_flags) { uint32_t frag_offset_flags; - ill_t *ill = (ill_t *)q->q_ptr; - mblk_t *mp = *mpp; - mblk_t *t_mp; + ill_t *ill = (ill_t *)q->q_ptr; + mblk_t *mp = *mpp; + mblk_t *t_mp; ipaddr_t dst; + uint8_t proto = ipha->ipha_protocol; + uint32_t sum_val; + uint16_t sum_flags; + ipf_t *ipf; + ipf_t **ipfp; + ipfb_t *ipfb; + uint16_t ident; + uint32_t offset; + ipaddr_t src; + uint_t hdr_length; + uint32_t end; + mblk_t *mp1; + mblk_t *tail_mp; + size_t count; + size_t msg_len; + uint8_t ecn_info = 0; + uint32_t packet_size; + boolean_t pruned = B_FALSE; + + if (cksum_val != NULL) + *cksum_val = 0; + if (cksum_flags != NULL) + *cksum_flags = 0; /* * Drop the fragment as early as possible if * we don't have the resources to reassemble it. */ - if (ip_reass_queue_bytes == 0) { freemsg(mp); return (B_FALSE); } - dst = ipha->ipha_dst; - - /* Clear hardware checksumming flag if set */ - mp->b_datap->db_struioun.cksum.flags = 0; + /* Check for fragmentation offset; return if there's none */ + if ((frag_offset_flags = ntohs(ipha->ipha_fragment_offset_and_flags) & + (IPH_MF | IPH_OFFSET)) == 0) + return (B_TRUE); - /* Check for fragmentation offset. */ - frag_offset_flags = ntohs(ipha->ipha_fragment_offset_and_flags) & - (IPH_MF | IPH_OFFSET); - if (frag_offset_flags) { - ipf_t *ipf; - ipf_t **ipfp; - ipfb_t *ipfb; - uint16_t ident; - uint32_t offset; - ipaddr_t src; - uint_t hdr_length; - uint32_t end; - uint8_t proto; - mblk_t *mp1; - mblk_t *tail_mp; - size_t count; - size_t msg_len; - uint8_t ecn_info = 0; - uint32_t packet_size; - boolean_t pruned = B_FALSE; - - ident = ipha->ipha_ident; - offset = (frag_offset_flags << 3) & 0xFFFF; - src = ipha->ipha_src; - hdr_length = IPH_HDR_LENGTH(ipha); - end = ntohs(ipha->ipha_length) - hdr_length; + /* + * We utilize hardware computed checksum info only for UDP since + * IP fragmentation is a normal occurrence for the protocol. In + * addition, checksum offload support for IP fragments carrying + * UDP payload is commonly implemented across network adapters. + */ + ASSERT(ill != NULL); + if (proto == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(ill) && + (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) { + mblk_t *mp1 = mp->b_cont; + int32_t len; + + /* Record checksum information from the packet */ + sum_val = (uint32_t)DB_CKSUM16(mp); + sum_flags = DB_CKSUMFLAGS(mp); + + /* IP payload offset from beginning of mblk */ + offset = ((uchar_t *)ipha + IPH_HDR_LENGTH(ipha)) - mp->b_rptr; + + if ((sum_flags & HCK_PARTIALCKSUM) && + (mp1 == NULL || mp1->b_cont == NULL) && + offset >= DB_CKSUMSTART(mp) && + ((len = offset - DB_CKSUMSTART(mp)) & 1) == 0) { + uint32_t adj; + /* + * Partial checksum has been calculated by hardware + * and attached to the packet; in addition, any + * prepended extraneous data is even byte aligned. + * If any such data exists, we adjust the checksum; + * this would also handle any postpended data. + */ + IP_ADJCKSUM_PARTIAL(mp->b_rptr + DB_CKSUMSTART(mp), + mp, mp1, len, adj); - /* - * if end == 0 then we have a packet with no data, so just - * free it.
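*/

/*
 * A sketch of the one's-complement accumulation used for the recorded
 * per-fragment hardware checksums; fold32() is a hypothetical helper,
 * the in-line code in this change simply folds twice, exactly as below:
 */
static uint32_t
fold32(uint32_t sum)
{
	/* two folds reduce any 32-bit sum to 16 bits with no carry left */
	sum = (sum & 0xFFFF) + (sum >> 16);
	sum = (sum & 0xFFFF) + (sum >> 16);
	return (sum);
}
/* accumulating a new fragment: ipf->ipf_checksum = fold32(ipf->ipf_checksum + sum_val); */

/*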
- */ - if (end == 0) { - freemsg(mp); - return (B_FALSE); + /* One's complement subtract extraneous checksum */ + if (adj >= sum_val) + sum_val = ~(adj - sum_val) & 0xFFFF; + else + sum_val -= adj; } - proto = ipha->ipha_protocol; + } else { + sum_val = 0; + sum_flags = 0; + } - /* - * Fragmentation reassembly. Each ILL has a hash table for - * queuing packets undergoing reassembly for all IPIFs - * associated with the ILL. The hash is based on the packet - * IP ident field. The ILL frag hash table was allocated - * as a timer block at the time the ILL was created. Whenever - * there is anything on the reassembly queue, the timer will - * be running. - */ - ASSERT(ill != NULL); + /* Clear hardware checksumming flag */ + DB_CKSUMFLAGS(mp) = 0; - /* Record the ECN field info. */ - ecn_info = (ipha->ipha_type_of_service & 0x3); - if (offset != 0) { - /* - * If this isn't the first piece, strip the header, and - * add the offset to the end value. - */ - mp->b_rptr += hdr_length; - end += offset; - } + ident = ipha->ipha_ident; + offset = (frag_offset_flags << 3) & 0xFFFF; + src = ipha->ipha_src; + dst = ipha->ipha_dst; + hdr_length = IPH_HDR_LENGTH(ipha); + end = ntohs(ipha->ipha_length) - hdr_length; - msg_len = mp->b_datap->db_lim - mp->b_datap->db_base; - tail_mp = mp; - while (tail_mp->b_cont != NULL) { - tail_mp = tail_mp->b_cont; - msg_len += tail_mp->b_datap->db_lim - - tail_mp->b_datap->db_base; - } + /* If end == 0 then we have a packet with no data, so just free it */ + if (end == 0) { + freemsg(mp); + return (B_FALSE); + } + /* Record the ECN field info. */ + ecn_info = (ipha->ipha_type_of_service & 0x3); + if (offset != 0) { /* - * If the reassembly list for this ILL will get too big - * prune it. + * If this isn't the first piece, strip the header, and + * add the offset to the end value. */ - if ((msg_len + sizeof (*ipf) + ill->ill_frag_count) >= - ip_reass_queue_bytes) { - ill_frag_prune(ill, - (ip_reass_queue_bytes < msg_len) ? 0 : - (ip_reass_queue_bytes - msg_len)); - pruned = B_TRUE; - } + mp->b_rptr += hdr_length; + end += offset; + } - ipfb = &ill->ill_frag_hash_tbl[ILL_FRAG_HASH(src, ident)]; - mutex_enter(&ipfb->ipfb_lock); + msg_len = MBLKSIZE(mp); + tail_mp = mp; + while (tail_mp->b_cont != NULL) { + tail_mp = tail_mp->b_cont; + msg_len += MBLKSIZE(tail_mp); + } - ipfp = &ipfb->ipfb_ipf; - /* Try to find an existing fragment queue for this packet. */ - for (;;) { - ipf = ipfp[0]; - if (ipf != NULL) { - /* - * It has to match on ident and src/dst address. - */ - if (ipf->ipf_ident == ident && - ipf->ipf_src == src && - ipf->ipf_dst == dst && - ipf->ipf_protocol == proto) { - /* - * If we have received too many - * duplicate fragments for this packet - * free it. - */ - if (ipf->ipf_num_dups > - ip_max_frag_dups) { - ill_frag_free_pkts(ill, ipfb, - ipf, 1); - freemsg(mp); - mutex_exit(&ipfb->ipfb_lock); - return (B_FALSE); - } - /* Found it. */ - break; - } - ipfp = &ipf->ipf_hash_next; - continue; - } + /* If the reassembly list for this ILL will get too big, prune it */ + if ((msg_len + sizeof (*ipf) + ill->ill_frag_count) >= + ip_reass_queue_bytes) { + ill_frag_prune(ill, + (ip_reass_queue_bytes < msg_len) ? 0 : + (ip_reass_queue_bytes - msg_len)); + pruned = B_TRUE; + } + + ipfb = &ill->ill_frag_hash_tbl[ILL_FRAG_HASH(src, ident)]; + mutex_enter(&ipfb->ipfb_lock); + ipfp = &ipfb->ipfb_ipf; + /* Try to find an existing fragment queue for this packet. 
*/ + for (;;) { + ipf = ipfp[0]; + if (ipf != NULL) { /* - * If we pruned the list, do we want to store this new - * fragment?. We apply an optimization here based on the - * fact that most fragments will be received in order. - * So if the offset of this incoming fragment is zero, - * it is the first fragment of a new packet. We will - * keep it. Otherwise drop the fragment, as we have - * probably pruned the packet already (since the - * packet cannot be found). + * It has to match on ident and src/dst address. */ - if (pruned && offset != 0) { - mutex_exit(&ipfb->ipfb_lock); - freemsg(mp); - return (B_FALSE); - } - - if (ipfb->ipfb_frag_pkts >= MAX_FRAG_PKTS) { + if (ipf->ipf_ident == ident && + ipf->ipf_src == src && + ipf->ipf_dst == dst && + ipf->ipf_protocol == proto) { /* - * Too many fragmented packets in this hash - * bucket. Free the oldest. + * If we have received too many + * duplicate fragments for this packet + * free it. */ - ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, - 1); - } - - /* New guy. Allocate a frag message. */ - mp1 = allocb(sizeof (*ipf), BPRI_MED); - if (mp1 == NULL) { - BUMP_MIB(&ip_mib, ipInDiscards); - freemsg(mp); -reass_done: - mutex_exit(&ipfb->ipfb_lock); - return (B_FALSE); + if (ipf->ipf_num_dups > ip_max_frag_dups) { + ill_frag_free_pkts(ill, ipfb, ipf, 1); + freemsg(mp); + mutex_exit(&ipfb->ipfb_lock); + return (B_FALSE); + } + /* Found it. */ + break; } + ipfp = &ipf->ipf_hash_next; + continue; + } + /* + * If we pruned the list, do we want to store this new + * fragment? We apply an optimization here based on the + * fact that most fragments will be received in order. + * So if the offset of this incoming fragment is zero, + * it is the first fragment of a new packet. We will + * keep it. Otherwise drop the fragment, as we have + * probably pruned the packet already (since the + * packet cannot be found). + */ + if (pruned && offset != 0) { + mutex_exit(&ipfb->ipfb_lock); + freemsg(mp); + return (B_FALSE); + } - BUMP_MIB(&ip_mib, ipReasmReqds); - mp1->b_cont = mp; - - /* Initialize the fragment header. */ - ipf = (ipf_t *)mp1->b_rptr; - ipf->ipf_mp = mp1; - ipf->ipf_ptphn = ipfp; - ipfp[0] = ipf; - ipf->ipf_hash_next = NULL; - ipf->ipf_ident = ident; - ipf->ipf_protocol = proto; - ipf->ipf_src = src; - ipf->ipf_dst = dst; - ipf->ipf_nf_hdr_len = 0; - /* Record reassembly start time. */ - ipf->ipf_timestamp = gethrestime_sec(); - /* Record ipf generation and account for frag header */ - ipf->ipf_gen = ill->ill_ipf_gen++; - ipf->ipf_count = mp1->b_datap->db_lim - - mp1->b_datap->db_base; - ipf->ipf_last_frag_seen = B_FALSE; - ipf->ipf_ecn = ecn_info; - ipf->ipf_num_dups = 0; - ipfb->ipfb_frag_pkts++; - + if (ipfb->ipfb_frag_pkts >= MAX_FRAG_PKTS) { /* - * We handle reassembly two ways. In the easy case, - * where all the fragments show up in order, we do - * minimal bookkeeping, and just clip new pieces on - * the end. If we ever see a hole, then we go off - * to ip_reassemble which has to mark the pieces and - * keep track of the number of holes, etc. Obviously, - * the point of having both mechanisms is so we can - * handle the easy case as efficiently as possible. + * Too many fragmented packets in this hash + * bucket. Free the oldest. */ - if (offset == 0) { - /* Easy case, in-order reassembly so far. */ - ipf->ipf_count += msg_len; - ipf->ipf_tail_mp = tail_mp; - /* - * Keep track of next expected offset in - * ipf_end. - */ - ipf->ipf_end = end; - ipf->ipf_nf_hdr_len = hdr_length; - } else { - /* Hard case, hole at the beginning.
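*/

/*
 * Worked example of the two reassembly modes described above, assuming
 * fragments that each carry 1480 payload bytes:
 *
 *	arrivals at offsets 0, 1480, 2960 (in order): each new piece is
 *	    clipped onto ipf_tail_mp and ipf_end advances to 1480, 2960,
 *	    then 4440; no hole bookkeeping is ever needed.
 *
 *	arrival at offset 1480 first: the datagram starts with a hole,
 *	    so ipf_end is set to 0 to give up on the easy case and
 *	    ip_reassemble() tracks holes (ipf_hole_cnt) from then on.
 */

/*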
*/ - ipf->ipf_tail_mp = NULL; - /* - * ipf_end == 0 means that we have given up - * on easy reassembly. - */ - ipf->ipf_end = 0; - /* - * ipf_hole_cnt is set by ip_reassemble. - * ipf_count is updated by ip_reassemble. - * No need to check for return value here - * as we don't expect reassembly to complete - * or fail for the first fragment itself. - */ - (void) ip_reassemble(mp, ipf, - (frag_offset_flags & IPH_OFFSET) << 3, - (frag_offset_flags & IPH_MF), ill, msg_len); - } - /* Update per ipfb and ill byte counts */ - ipfb->ipfb_count += ipf->ipf_count; - ASSERT(ipfb->ipfb_count > 0); /* Wraparound */ - ill->ill_frag_count += ipf->ipf_count; - ASSERT(ill->ill_frag_count > 0); /* Wraparound */ - /* If the frag timer wasn't already going, start it. */ - mutex_enter(&ill->ill_lock); - ill_frag_timer_start(ill); - mutex_exit(&ill->ill_lock); - goto reass_done; + ill_frag_free_pkts(ill, ipfb, ipfb->ipfb_ipf, 1); } - /* - * We have a new piece of a datagram which is already being - * reassembled. Update the ECN info if all IP fragments - * are ECN capable. If there is one which is not, clear - * all the info. If there is at least one which has CE - * code point, IP needs to report that up to transport. - */ - if (ecn_info != IPH_ECN_NECT && ipf->ipf_ecn != IPH_ECN_NECT) { - if (ecn_info == IPH_ECN_CE) - ipf->ipf_ecn = IPH_ECN_CE; - } else { - ipf->ipf_ecn = IPH_ECN_NECT; + /* New guy. Allocate a frag message. */ + mp1 = allocb(sizeof (*ipf), BPRI_MED); + if (mp1 == NULL) { + BUMP_MIB(&ip_mib, ipInDiscards); + freemsg(mp); +reass_done: + mutex_exit(&ipfb->ipfb_lock); + return (B_FALSE); } - if (offset && ipf->ipf_end == offset) { - /* The new fragment fits at the end */ - ipf->ipf_tail_mp->b_cont = mp; - /* Update the byte count */ + + + BUMP_MIB(&ip_mib, ipReasmReqds); + mp1->b_cont = mp; + + /* Initialize the fragment header. */ + ipf = (ipf_t *)mp1->b_rptr; + ipf->ipf_mp = mp1; + ipf->ipf_ptphn = ipfp; + ipfp[0] = ipf; + ipf->ipf_hash_next = NULL; + ipf->ipf_ident = ident; + ipf->ipf_protocol = proto; + ipf->ipf_src = src; + ipf->ipf_dst = dst; + ipf->ipf_nf_hdr_len = 0; + /* Record reassembly start time. */ + ipf->ipf_timestamp = gethrestime_sec(); + /* Record ipf generation and account for frag header */ + ipf->ipf_gen = ill->ill_ipf_gen++; + ipf->ipf_count = MBLKSIZE(mp1); + ipf->ipf_last_frag_seen = B_FALSE; + ipf->ipf_ecn = ecn_info; + ipf->ipf_num_dups = 0; + ipfb->ipfb_frag_pkts++; + ipf->ipf_checksum = 0; + ipf->ipf_checksum_flags = 0; + + /* Store checksum value in fragment header */ + if (sum_flags != 0) { + sum_val = (sum_val & 0xFFFF) + (sum_val >> 16); + sum_val = (sum_val & 0xFFFF) + (sum_val >> 16); + ipf->ipf_checksum = sum_val; + ipf->ipf_checksum_flags = sum_flags; + } + + /* + * We handle reassembly two ways. In the easy case, + * where all the fragments show up in order, we do + * minimal bookkeeping, and just clip new pieces on + * the end. If we ever see a hole, then we go off + * to ip_reassemble which has to mark the pieces and + * keep track of the number of holes, etc. Obviously, + * the point of having both mechanisms is so we can + * handle the easy case as efficiently as possible. + */ + if (offset == 0) { + /* Easy case, in-order reassembly so far. */ ipf->ipf_count += msg_len; - /* Update per ipfb and ill byte counts */ - ipfb->ipfb_count += msg_len; - ASSERT(ipfb->ipfb_count > 0); /* Wraparound */ - ill->ill_frag_count += msg_len; - ASSERT(ill->ill_frag_count > 0); /* Wraparound */ - if (frag_offset_flags & IPH_MF) { - /* More to come. 
*/ - ipf->ipf_end = end; - ipf->ipf_tail_mp = tail_mp; - goto reass_done; - } + ipf->ipf_tail_mp = tail_mp; + /* + * Keep track of next expected offset in + * ipf_end. + */ + ipf->ipf_end = end; + ipf->ipf_nf_hdr_len = hdr_length; } else { - /* Go do the hard cases. */ - int ret; + /* Hard case, hole at the beginning. */ + ipf->ipf_tail_mp = NULL; + /* + * ipf_end == 0 means that we have given up + * on easy reassembly. + */ + ipf->ipf_end = 0; - if (offset == 0) - ipf->ipf_nf_hdr_len = hdr_length; + /* Forget checksum offload from now on */ + ipf->ipf_checksum_flags = 0; - /* Save current byte count */ - count = ipf->ipf_count; - ret = ip_reassemble(mp, ipf, + /* + * ipf_hole_cnt is set by ip_reassemble. + * ipf_count is updated by ip_reassemble. + * No need to check for return value here + * as we don't expect reassembly to complete + * or fail for the first fragment itself. + */ + (void) ip_reassemble(mp, ipf, (frag_offset_flags & IPH_OFFSET) << 3, (frag_offset_flags & IPH_MF), ill, msg_len); - /* Count of bytes added and subtracted (freeb()ed) */ - count = ipf->ipf_count - count; - if (count) { - /* Update per ipfb and ill byte counts */ - ipfb->ipfb_count += count; - ASSERT(ipfb->ipfb_count > 0); /* Wraparound */ - ill->ill_frag_count += count; - ASSERT(ill->ill_frag_count > 0); - } - if (ret == IP_REASS_PARTIAL) { - goto reass_done; - } else if (ret == IP_REASS_FAILED) { - /* Reassembly failed. Free up all resources */ - ill_frag_free_pkts(ill, ipfb, ipf, 1); - for (t_mp = mp; t_mp != NULL; - t_mp = t_mp->b_cont) { - IP_REASS_SET_START(t_mp, 0); - IP_REASS_SET_END(t_mp, 0); - } - freemsg(mp); - goto reass_done; - } - /* We will reach here iff 'ret' is IP_REASS_COMPLETE */ } - /* - * We have completed reassembly. Unhook the frag header from - * the reassembly list. - * - * Before we free the frag header, record the ECN info - * to report back to the transport. - */ - ecn_info = ipf->ipf_ecn; - BUMP_MIB(&ip_mib, ipReasmOKs); - ipfp = ipf->ipf_ptphn; - mp1 = ipf->ipf_mp; - count = ipf->ipf_count; - ipf = ipf->ipf_hash_next; - if (ipf) - ipf->ipf_ptphn = ipfp; - ipfp[0] = ipf; - ill->ill_frag_count -= count; - ASSERT(ipfb->ipfb_count >= count); - ipfb->ipfb_count -= count; - ipfb->ipfb_frag_pkts--; - mutex_exit(&ipfb->ipfb_lock); - /* Ditch the frag header. */ - mp = mp1->b_cont; + /* Update per ipfb and ill byte counts */ + ipfb->ipfb_count += ipf->ipf_count; + ASSERT(ipfb->ipfb_count > 0); /* Wraparound */ + ill->ill_frag_count += ipf->ipf_count; + ASSERT(ill->ill_frag_count > 0); /* Wraparound */ + /* If the frag timer wasn't already going, start it. */ + mutex_enter(&ill->ill_lock); + ill_frag_timer_start(ill); + mutex_exit(&ill->ill_lock); + goto reass_done; + } - freeb(mp1); + /* + * If the packet's flag has changed (it could be coming up + * from an interface different than the previous, therefore + * possibly different checksum capability), then forget about + * any stored checksum states. Otherwise add the value to + * the existing one stored in the fragment header. + */ + if (sum_flags != 0 && sum_flags == ipf->ipf_checksum_flags) { + sum_val += ipf->ipf_checksum; + sum_val = (sum_val & 0xFFFF) + (sum_val >> 16); + sum_val = (sum_val & 0xFFFF) + (sum_val >> 16); + ipf->ipf_checksum = sum_val; + } else if (ipf->ipf_checksum_flags != 0) { + /* Forget checksum offload from now on */ + ipf->ipf_checksum_flags = 0; + } - /* Restore original IP length in header. 
*/ - packet_size = (uint32_t)msgdsize(mp); - if (packet_size > IP_MAXPACKET) { - freemsg(mp); - BUMP_MIB(&ip_mib, ipInHdrErrors); - return (B_FALSE); + /* + * We have a new piece of a datagram which is already being + * reassembled. Update the ECN info if all IP fragments + * are ECN capable. If there is one which is not, clear + * all the info. If there is at least one which has CE + * code point, IP needs to report that up to transport. + */ + if (ecn_info != IPH_ECN_NECT && ipf->ipf_ecn != IPH_ECN_NECT) { + if (ecn_info == IPH_ECN_CE) + ipf->ipf_ecn = IPH_ECN_CE; + } else { + ipf->ipf_ecn = IPH_ECN_NECT; + } + if (offset && ipf->ipf_end == offset) { + /* The new fragment fits at the end */ + ipf->ipf_tail_mp->b_cont = mp; + /* Update the byte count */ + ipf->ipf_count += msg_len; + /* Update per ipfb and ill byte counts */ + ipfb->ipfb_count += msg_len; + ASSERT(ipfb->ipfb_count > 0); /* Wraparound */ + ill->ill_frag_count += msg_len; + ASSERT(ill->ill_frag_count > 0); /* Wraparound */ + if (frag_offset_flags & IPH_MF) { + /* More to come. */ + ipf->ipf_end = end; + ipf->ipf_tail_mp = tail_mp; + goto reass_done; } + } else { + /* Go do the hard cases. */ + int ret; - if (mp->b_datap->db_ref > 1) { - mblk_t *mp2; + if (offset == 0) + ipf->ipf_nf_hdr_len = hdr_length; - mp2 = copymsg(mp); - freemsg(mp); - if (!mp2) { - BUMP_MIB(&ip_mib, ipInDiscards); - return (B_FALSE); + /* Save current byte count */ + count = ipf->ipf_count; + ret = ip_reassemble(mp, ipf, + (frag_offset_flags & IPH_OFFSET) << 3, + (frag_offset_flags & IPH_MF), ill, msg_len); + /* Count of bytes added and subtracted (freeb()ed) */ + count = ipf->ipf_count - count; + if (count) { + /* Update per ipfb and ill byte counts */ + ipfb->ipfb_count += count; + ASSERT(ipfb->ipfb_count > 0); /* Wraparound */ + ill->ill_frag_count += count; + ASSERT(ill->ill_frag_count > 0); + } + if (ret == IP_REASS_PARTIAL) { + goto reass_done; + } else if (ret == IP_REASS_FAILED) { + /* Reassembly failed. Free up all resources */ + ill_frag_free_pkts(ill, ipfb, ipf, 1); + for (t_mp = mp; t_mp != NULL; t_mp = t_mp->b_cont) { + IP_REASS_SET_START(t_mp, 0); + IP_REASS_SET_END(t_mp, 0); } - mp = mp2; + freemsg(mp); + goto reass_done; } - ipha = (ipha_t *)mp->b_rptr; + /* We will reach here iff 'ret' is IP_REASS_COMPLETE */ + } + /* + * We have completed reassembly. Unhook the frag header from + * the reassembly list. + * + * Before we free the frag header, record the ECN info + * to report back to the transport. + */ + ecn_info = ipf->ipf_ecn; + BUMP_MIB(&ip_mib, ipReasmOKs); + ipfp = ipf->ipf_ptphn; - ipha->ipha_length = htons((uint16_t)packet_size); - /* We're now complete, zip the frag state */ - ipha->ipha_fragment_offset_and_flags = 0; - /* Record the ECN info. */ - ipha->ipha_type_of_service &= 0xFC; - ipha->ipha_type_of_service |= ecn_info; - *mpp = mp; + /* We need to supply these to caller */ + if ((sum_flags = ipf->ipf_checksum_flags) != 0) + sum_val = ipf->ipf_checksum; + else + sum_val = 0; + + mp1 = ipf->ipf_mp; + count = ipf->ipf_count; + ipf = ipf->ipf_hash_next; + if (ipf != NULL) + ipf->ipf_ptphn = ipfp; + ipfp[0] = ipf; + ill->ill_frag_count -= count; + ASSERT(ipfb->ipfb_count >= count); + ipfb->ipfb_count -= count; + ipfb->ipfb_frag_pkts--; + mutex_exit(&ipfb->ipfb_lock); + /* Ditch the frag header. */ + mp = mp1->b_cont; + + freeb(mp1); + + /* Restore original IP length in header. 
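*/

/*
 * The ECN update above reduces to a simple merge rule over the two-bit
 * codepoints; a condensed sketch with the same logic (hypothetical
 * helper, not part of this change):
 */
static uint8_t
ecn_merge(uint8_t frag_ecn, uint8_t ipf_ecn)
{
	/* any not-ECN-capable fragment poisons the whole datagram */
	if (frag_ecn == IPH_ECN_NECT || ipf_ecn == IPH_ECN_NECT)
		return (IPH_ECN_NECT);
	/* otherwise a single CE mark must survive reassembly */
	if (frag_ecn == IPH_ECN_CE || ipf_ecn == IPH_ECN_CE)
		return (IPH_ECN_CE);
	return (ipf_ecn);
}

/*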
*/ + packet_size = (uint32_t)msgdsize(mp); + if (packet_size > IP_MAXPACKET) { + freemsg(mp); + BUMP_MIB(&ip_mib, ipInHdrErrors); + return (B_FALSE); + } + if (DB_REF(mp) > 1) { + mblk_t *mp2 = copymsg(mp); + + freemsg(mp); + if (mp2 == NULL) { + BUMP_MIB(&ip_mib, ipInDiscards); + return (B_FALSE); + } + mp = mp2; } + ipha = (ipha_t *)mp->b_rptr; + + ipha->ipha_length = htons((uint16_t)packet_size); + /* We're now complete, zip the frag state */ + ipha->ipha_fragment_offset_and_flags = 0; + /* Record the ECN info. */ + ipha->ipha_type_of_service &= 0xFC; + ipha->ipha_type_of_service |= ecn_info; + *mpp = mp; + + /* Reassembly is successful; return checksum information if needed */ + if (cksum_val != NULL) + *cksum_val = sum_val; + if (cksum_flags != NULL) + *cksum_flags = sum_flags; + return (B_TRUE); } @@ -11156,16 +11343,12 @@ ip_udp_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, { uint32_t sum; uint32_t u1; - uint32_t u2; boolean_t mctl_present; conn_t *connp; mblk_t *first_mp; - mblk_t *mp1; - dblk_t *dp; uint16_t *up; ill_t *ill = (ill_t *)q->q_ptr; - uint32_t ports; - boolean_t cksum_computed = B_FALSE; + uint16_t reass_hck_flags = 0; #define rptr ((uchar_t *)ipha) @@ -11182,19 +11365,13 @@ ip_udp_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, IP_SIMPLE_HDR_LENGTH_IN_WORDS); /* IP options present */ - if (u1) + if (u1 != 0) goto ipoptions; -#define IS_IPHDR_HWCKSUM(mctl_present, mp, ill) \ - ((!mctl_present) && (mp->b_datap->db_struioun.cksum.flags & \ - HCK_IPV4_HDRCKSUM) && (ill->ill_capabilities & \ - ILL_CAPAB_HCKSUM) && dohwcksum) - /* Check the IP header checksum. */ - if (IS_IPHDR_HWCKSUM(mctl_present, mp, ill)) { + if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) { /* Clear the IP header h/w cksum flag */ - mp->b_datap->db_struioun.cksum.flags &= - ~HCK_IPV4_HDRCKSUM; + DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; } else { #define uph ((uint16_t *)ipha) sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + uph[5] + @@ -11207,7 +11384,7 @@ ip_udp_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, * Don't verify header checksum if this packet is coming * back from AH/ESP as we already did it. */ - if (!mctl_present && (sum && sum != 0xFFFF)) { + if (!mctl_present && sum != 0 && sum != 0xFFFF) { BUMP_MIB(&ip_mib, ipInCksumErrs); freemsg(first_mp); return; @@ -11236,133 +11413,52 @@ ip_udp_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, /* packet does not contain complete IP & UDP headers */ if ((mp->b_wptr - rptr) < (IP_SIMPLE_HDR_LENGTH + UDPH_SIZE)) goto udppullup; + /* up points to UDP header */ up = (uint16_t *)((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH); #define iphs ((uint16_t *)ipha) -#define IP_CKSUM_RECV(len, u1, u2, mp, mp1, error, dp) { \ - boolean_t doswcksum = B_TRUE; \ - uint_t hcksumflags = 0; \ - \ - hcksumflags = dp->db_struioun.cksum.flags; \ - \ - /* Clear the hardware checksum flags; they have been consumed */\ - dp->db_struioun.cksum.flags = 0; \ - if (hcksumflags && (ill->ill_capabilities & ILL_CAPAB_HCKSUM) &&\ - dohwcksum) { \ - if (hcksumflags & HCK_FULLCKSUM) { \ - /* \ - * Full checksum has been computed by the \ - * hardware and has been attached. 
\ - */ \ - doswcksum = B_FALSE; \ - if (!(hcksumflags & HCK_FULLCKSUM_OK) && \ - (dp->db_cksum16 != 0xffff)) { \ - ipcsumdbg("full hwcksumerr\n", mp); \ - goto error; \ - } \ - } else if ((hcksumflags & HCK_PARTIALCKSUM) && \ - (((len = (IP_SIMPLE_HDR_LENGTH - dp->db_cksumstart))\ - & 1) == 0)) { \ - uint32_t tot_len = 0; \ - \ - doswcksum = B_FALSE; \ - /* Partial checksum computed */ \ - u1 += dp->db_cksum16; \ - tot_len = mp->b_wptr - mp->b_rptr; \ - if (!mp1) \ - mp1 = mp; \ - else \ - tot_len += mp1->b_wptr - mp1->b_rptr; \ - if (len > 0) { \ - /* \ - * Prepended extraneous data. Adjust \ - * checksum. \ - */ \ - u2 = IP_BCSUM_PARTIAL((uchar_t *)(rptr +\ - dp->db_cksumstart), (int32_t)len, \ - 0); \ - } else \ - u2 = 0; \ - if ((len = (dp->db_cksumend - tot_len)) > 0) { \ - /* \ - * Postpended extraneous data. Adjust \ - * checksum. \ - */ \ - uint32_t u3; \ - \ - u3 = IP_BCSUM_PARTIAL(mp1->b_wptr, \ - (int32_t)len, 0); \ - if ((uintptr_t)mp1->b_wptr & 1) \ - /* \ - * Postpended extraneous data \ - * was odd byte aligned, so \ - * swap resulting checksum \ - * bytes. \ - */ \ - u2 += ((u3 << 8) & 0xffff) | \ - (u3 >> 8); \ - else \ - u2 += u3; \ - u2 = (u2 & 0xFFFF) + ((int)(u2) >> 16); \ - } \ - /* \ - * One's complement subtract extraneous checksum\ - */ \ - if (u2 >= u1) \ - u1 = ~(u2 - u1) & 0xFFFF; \ - else \ - u1 -= u2; \ - u1 = (u1 & 0xFFFF) + ((int)u1 >> 16); \ - if (~(u1) & 0xFFFF) { \ - ipcsumdbg("partial hwcksumerr\n", mp); \ - goto error; \ - } \ - } \ - } \ - if (doswcksum) { \ - IP_STAT(ip_in_sw_cksum); \ - if ((IP_CSUM(mp, (int32_t)((uchar_t *)up - \ - (uchar_t *)ipha), u1)) != 0) { \ - ipcsumdbg("swcksumerr\n", mp); \ - goto error; \ - } \ - } \ -} - - dp = mp->b_datap; /* if udp hdr cksum != 0, then need to checksum udp packet */ - if (up[3]) { - cksum_computed = B_TRUE; - /* multiple mblks of udp data? */ - if ((mp1 = mp->b_cont) != NULL) { - /* more than two? */ - if (mp1->b_cont) - goto multipktudp; - } + if (up[3] != 0) { + mblk_t *mp1 = mp->b_cont; + boolean_t cksum_err; + uint16_t hck_flags = 0; /* Pseudo-header checksum */ u1 = IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] + iphs[9] + up[2]; - if (!mctl_present) { - ssize_t len = 0; - IP_CKSUM_RECV(len, u1, u2, mp, mp1, udpcksumerr, dp); - } else { -multipktudp: + /* + * Revert to software checksum calculation if the interface + * isn't capable of checksum offload or if IPsec is present. + */ + if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum) + hck_flags = DB_CKSUMFLAGS(mp); + + if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0) IP_STAT(ip_in_sw_cksum); - if ((IP_CSUM(mp, (int32_t)((uchar_t *)up - - (uchar_t *)ipha), u1)) != 0) { -udpcksumerr: - ip1dbg(("ip_udp_input: bad udp checksum\n")); - BUMP_MIB(&ip_mib, udpInCksumErrs); - freemsg(first_mp); - return; - } + + IP_CKSUM_RECV(hck_flags, u1, + (uchar_t *)(rptr + DB_CKSUMSTART(mp)), + (int32_t)((uchar_t *)up - rptr), + mp, mp1, cksum_err); + + if (cksum_err) { + BUMP_MIB(&ip_mib, udpInCksumErrs); + + if (hck_flags & HCK_FULLCKSUM) + IP_STAT(ip_udp_in_full_hw_cksum_err); + else if (hck_flags & HCK_PARTIALCKSUM) + IP_STAT(ip_udp_in_part_hw_cksum_err); + else + IP_STAT(ip_udp_in_sw_cksum_err); + + freemsg(first_mp); + return; } } - /* broadcast IP packet? */ + /* Non-fragmented broadcast or multicast packet? 
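*/

/*
 * The pseudo-header sum built above, written out long-hand. iphs[6..9]
 * are the 16-bit halves of ipha_src and ipha_dst, up[2] is the UDP
 * length field, and IP_UDP_CSUM_COMP supplies the protocol term; this
 * is the standard UDP pseudo-header of source, destination, protocol
 * and length. Shown for illustration only:
 */
	u1 = IP_UDP_CSUM_COMP +		/* protocol term (IPPROTO_UDP) */
	    iphs[6] + iphs[7] +		/* source address halves */
	    iphs[8] + iphs[9] +		/* destination address halves */
	    up[2];			/* UDP length */

/*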
*/ if (ire->ire_type == IRE_BROADCAST) goto udpslowpath; @@ -11371,7 +11467,7 @@ udpcksumerr: ASSERT(connp->conn_upq != NULL); IP_STAT(ip_udp_fast_path); - if (!canputnext(connp->conn_upq)) { + if (CONN_UDP_FLOWCTLD(connp)) { freemsg(mp); BUMP_MIB(&ip_mib, udpInOverflows); } else { @@ -11383,7 +11479,8 @@ udpcksumerr: */ if (ip_udp_check(q, connp, recv_ill, ipha, &mp, &first_mp, mctl_present)) { - putnext(connp->conn_upq, mp); + /* Send it upstream */ + CONN_UDP_RECV(connp, mp); } } /* @@ -11416,9 +11513,13 @@ ipoptions: u1 = ntohs(ipha->ipha_fragment_offset_and_flags); if (u1 & (IPH_MF | IPH_OFFSET)) { fragmented: - if (!ip_rput_fragment(q, &mp, ipha)) { + /* + * "sum" and "reass_hck_flags" are non-zero if the + * reassembled packet has a valid hardware computed + * checksum information associated with it. + */ + if (!ip_rput_fragment(q, &mp, ipha, &sum, &reass_hck_flags)) goto slow_done; - } /* * Make sure that first_mp points back to mp as * the mp we came in with could have changed in @@ -11432,7 +11533,7 @@ fragmented: /* Now we have a complete datagram, destined for this machine. */ u1 = IPH_HDR_LENGTH(ipha); /* Pull up the UDP header, if necessary. */ - if ((mp->b_wptr - mp->b_rptr) < (u1 + UDPH_SIZE)) { + if ((MBLKL(mp)) < (u1 + UDPH_SIZE)) { udppullup: if (!pullupmsg(mp, u1 + UDPH_SIZE)) { BUMP_MIB(&ip_mib, ipInDiscards); @@ -11441,30 +11542,43 @@ udppullup: } ipha = (ipha_t *)mp->b_rptr; } + /* - * Validate the checksum. This code is a bit funny looking - * but may help out the compiler in this crucial spot. + * Validate the checksum for the reassembled packet; for the + * pullup case we calculate the payload checksum in software. */ up = (uint16_t *)((uchar_t *)ipha + u1 + UDP_PORTS_OFFSET); - if (!cksum_computed && up[3]) { - IP_STAT(ip_in_sw_cksum); - sum = IP_CSUM(mp, (int32_t)((uchar_t *)up - (uchar_t *)ipha), - IP_UDP_CSUM_COMP + iphs[6] + - iphs[7] + iphs[8] + - iphs[9] + up[2]); - if (sum != 0) { - ip1dbg(("ip_udp_input: bad udp checksum\n")); - BUMP_MIB(&ip_mib, udpInCksumErrs); - freemsg(first_mp); - goto slow_done; + if (up[3] != 0) { + boolean_t cksum_err; + + if ((reass_hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0) + IP_STAT(ip_in_sw_cksum); + + IP_CKSUM_RECV_REASS(reass_hck_flags, + (int32_t)((uchar_t *)up - (uchar_t *)ipha), + IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] + + iphs[9] + up[2], sum, cksum_err); + + if (cksum_err) { + BUMP_MIB(&ip_mib, udpInCksumErrs); + + if (reass_hck_flags & HCK_FULLCKSUM) + IP_STAT(ip_udp_in_full_hw_cksum_err); + else if (reass_hck_flags & HCK_PARTIALCKSUM) + IP_STAT(ip_udp_in_part_hw_cksum_err); + else + IP_STAT(ip_udp_in_sw_cksum_err); + + freemsg(first_mp); + goto slow_done; } } udpslowpath: - ports = *(uint32_t *)up; - /* Clear hardware checksum flag */ - mp->b_datap->db_struioun.cksum.flags = 0; - ip_fanout_udp(q, first_mp, ill, ipha, ports, + /* Clear hardware checksum flag to be safe */ + DB_CKSUMFLAGS(mp) = 0; + + ip_fanout_udp(q, first_mp, ill, ipha, *(uint32_t *)up, (ire->ire_type == IRE_BROADCAST), IP_FF_SEND_ICMP | IP_FF_CKSUM | IP_FF_IP6INFO, mctl_present, B_TRUE, recv_ill, ire->ire_zoneid); @@ -11473,6 +11587,7 @@ slow_done: IP_STAT(ip_udp_slow_path); return; +#undef iphs #undef rptr } @@ -11485,17 +11600,17 @@ ip_tcp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present, conn_t *connp; uint32_t sum; uint32_t u1; - uint32_t u2; uint16_t *up; int offset; ssize_t len; mblk_t *mp1; - dblk_t *dp; boolean_t syn_present = B_FALSE; tcph_t *tcph; uint_t ip_hdr_len; ill_t *ill = (ill_t *)q->q_ptr; 
zoneid_t zoneid = ire->ire_zoneid; + boolean_t cksum_err; + uint16_t hck_flags = 0; #define rptr ((uchar_t *)ipha) @@ -11514,10 +11629,9 @@ ip_tcp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present, goto ipoptions; } else { /* Check the IP header checksum. */ - if (IS_IPHDR_HWCKSUM(mctl_present, mp, ill)) { + if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) { /* Clear the IP header h/w cksum flag */ - mp->b_datap->db_struioun.cksum.flags &= - ~HCK_IPV4_HDRCKSUM; + DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; } else { #define uph ((uint16_t *)ipha) sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + @@ -11596,30 +11710,32 @@ ip_tcp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present, #endif u1 += iphs[6] + iphs[7] + iphs[8] + iphs[9]; - /* - * If the packet has gone through AH/ESP, do the checksum here - * itself. - * - * If it has not gone through IPSEC processing and not a duped - * mblk, then look for driver checksummed mblk. We validate or - * postpone the checksum to TCP for single copy checksum. - * - * Note that we only honor HW cksum in the fastpath. + * Revert to software checksum calculation if the interface + * isn't capable of checksum offload or if IPsec is present. */ - dp = mp->b_datap; - if (!mctl_present) { - IP_CKSUM_RECV(len, u1, u2, mp, mp1, tcpcksumerr, dp); - } else { + if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum) + hck_flags = DB_CKSUMFLAGS(mp); + + if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0) IP_STAT(ip_in_sw_cksum); - if ((IP_CSUM(mp, (int32_t)((uchar_t *)up - rptr), - u1)) != 0) { -tcpcksumerr: - BUMP_MIB(&ip_mib, tcpInErrs); - ip1dbg(("ip_tcp_input: bad tcp checksum \n")); - freemsg(first_mp); - goto slow_done; - } + + IP_CKSUM_RECV(hck_flags, u1, + (uchar_t *)(rptr + DB_CKSUMSTART(mp)), + (int32_t)((uchar_t *)up - rptr), + mp, mp1, cksum_err); + + if (cksum_err) { + BUMP_MIB(&ip_mib, tcpInErrs); + + if (hck_flags & HCK_FULLCKSUM) + IP_STAT(ip_tcp_in_full_hw_cksum_err); + else if (hck_flags & HCK_PARTIALCKSUM) + IP_STAT(ip_tcp_in_part_hw_cksum_err); + else + IP_STAT(ip_tcp_in_sw_cksum_err); + + goto error; } try_again: @@ -11654,7 +11770,7 @@ try_again: if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) { if (IPCL_IS_TCP(connp)) { mp->b_datap->db_struioflag |= STRUIO_EAGER; - mp->b_datap->db_cksumstart = + DB_CKSUMSTART(mp) = (intptr_t)ip_squeue_get(ill_ring); if (IPCL_IS_FULLY_BOUND(connp) && !mctl_present && !CONN_INBOUND_POLICY_PRESENT(connp)) { @@ -11800,7 +11916,7 @@ ipoptions: u1 = ntohs(ipha->ipha_fragment_offset_and_flags); if (u1 & (IPH_MF | IPH_OFFSET)) { fragmented: - if (!ip_rput_fragment(q, &mp, ipha)) { + if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) { if (mctl_present) freeb(first_mp); goto slow_done; @@ -11876,9 +11992,10 @@ multipkttcp: * ICMP's back, then this flag may need to be cleared in * other places as well. */ - mp->b_datap->db_struioun.cksum.flags = 0; + DB_CKSUMFLAGS(mp) = 0; up = (uint16_t *)(rptr + u1 + TCP_PORTS_OFFSET); + u1 = (uint32_t)(len - u1); /* TCP datagram length. */ #ifdef _BIG_ENDIAN u1 += IPPROTO_TCP; @@ -11890,7 +12007,7 @@ multipkttcp: * Not M_DATA mblk or its a dup, so do the checksum now. */ IP_STAT(ip_in_sw_cksum); - if (IP_CSUM(mp, (int32_t)((uchar_t *)up - rptr), u1)) { + if (IP_CSUM(mp, (int32_t)((uchar_t *)up - rptr), u1) != 0) { BUMP_MIB(&ip_mib, tcpInErrs); goto error; } @@ -11937,12 +12054,12 @@ ip_sctp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present, goto ipoptions; } else { /* Check the IP header checksum. 
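*/

/*
 * Long-hand form of the software header-checksum test used here: the
 * one's-complement sum over the ten 16-bit words of an option-less
 * IPv4 header (checksum field included) must fold to 0 or 0xFFFF,
 * which encode the same value. Hypothetical helper, mirroring the
 * in-line test:
 */
static boolean_t
ip_hdr_cksum_ok(ipha_t *ipha)
{
	uint16_t *uph = (uint16_t *)ipha;
	uint32_t sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] +
	    uph[5] + uph[6] + uph[7] + uph[8] + uph[9];

	sum = (sum & 0xFFFF) + (sum >> 16);
	sum = (sum & 0xFFFF) + (sum >> 16);
	return (sum == 0 || sum == 0xFFFF);
}

/*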
*/ - if (IS_IPHDR_HWCKSUM(mctl_present, mp, ill)) { + if (IS_IP_HDR_HWCKSUM(mctl_present, mp, ill)) { /* * Since there is no SCTP h/w cksum support yet, just * clear the flag. */ - mp->b_datap->db_struioun.cksum.flags = 0; + DB_CKSUMFLAGS(mp) = 0; } else { #define uph ((uint16_t *)ipha) sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + @@ -12031,7 +12148,7 @@ no_conn: return; ipoptions: - mp->b_datap->db_struioun.cksum.flags = 0; + DB_CKSUMFLAGS(mp) = 0; if (!ip_options_cksum(q, first_mp, ipha, ire)) goto slow_done; @@ -12041,7 +12158,7 @@ ipoptions: u1 = ntohs(ipha->ipha_fragment_offset_and_flags); if (u1 & (IPH_MF | IPH_OFFSET)) { fragmented: - if (!ip_rput_fragment(q, &mp, ipha)) + if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) goto slow_done; /* * Make sure that first_mp points back to mp as @@ -12183,7 +12300,7 @@ ip_rput_noire(queue_t *q, ill_t *in_ill, mblk_t *mp, int ll_multicast, * Clear the indication that this may have a hardware checksum * as we are not using it */ - mp->b_datap->db_struioun.cksum.flags = 0; + DB_CKSUMFLAGS(mp) = 0; /* * Now hand the packet to ip_newroute. @@ -12351,7 +12468,7 @@ ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha, * Clear the indication that this may have * hardware checksum as we are not using it. */ - mp->b_datap->db_struioun.cksum.flags = 0; + DB_CKSUMFLAGS(mp) = 0; icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED); ire_refrele(ire); @@ -12361,7 +12478,7 @@ ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha, } /* Packet is being forwarded. Turning off hwcksum flag. */ - mp->b_datap->db_struioun.cksum.flags = 0; + DB_CKSUMFLAGS(mp) = 0; if (ip_g_send_redirects) { /* * Check whether the incoming interface and outgoing @@ -12435,15 +12552,17 @@ ip_rput_process_broadcast(queue_t **qp, mblk_t *mp, ire_t **irep, ipha_t *ipha, { queue_t *q; ire_t *ire; + uint16_t hcksumflags; q = *qp; ire = *irep; /* * Clear the indication that this may have hardware - * checksum as we are not using it. + * checksum as we are not using it for forwarding. */ - mp->b_datap->db_struioun.cksum.flags = 0; + hcksumflags = DB_CKSUMFLAGS(mp); + DB_CKSUMFLAGS(mp) = 0; /* * Directed broadcast forwarding: if the packet came in over a @@ -12613,6 +12732,9 @@ ip_rput_process_broadcast(queue_t **qp, mblk_t *mp, ire_t **irep, ipha_t *ipha, } *irep = ire; + + /* Restore any hardware checksum flags */ + DB_CKSUMFLAGS(mp) = hcksumflags; return (B_FALSE); } @@ -12632,7 +12754,7 @@ ip_rput_process_multicast(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, * Clear the indication that this may have hardware * checksum as we are not using it. */ - mp->b_datap->db_struioun.cksum.flags = 0; + DB_CKSUMFLAGS(mp) = 0; retval = ip_mforward(ill, ipha, mp); /* ip_mforward updates mib variables if needed */ /* clear b_prev - used by ip_mroute_decap */ @@ -12951,7 +13073,7 @@ ip_rput(queue_t *q, mblk_t *mp) /* * Also SIOC[GS]TUN* ioctls can come here. */ - ip_ioctl_freemsg(mp); + inet_freemsg(mp); TRACE_2(TR_FAC_IP, TR_IP_RPUT_END, "ip_input_end: q %p (%S)", q, "uninit"); return; @@ -13300,9 +13422,20 @@ ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain, size_t hdrlen) continue; } - /* broadcast? 
*/ + /* + * Broadcast IRE may indicate either broadcast or + * multicast packet + */ if (ire->ire_type == IRE_BROADCAST) { - if (ip_rput_process_broadcast(&q, mp, &ire, ipha, ill, + /* + * Skip broadcast checks if packet is UDP multicast; + * we'd rather not enter ip_rput_process_broadcast() + * unless the packet is broadcast for real, since + * that routine is a no-op for multicast. + */ + if ((ipha->ipha_protocol != IPPROTO_UDP || + !CLASSD(ipha->ipha_dst)) && + ip_rput_process_broadcast(&q, mp, &ire, ipha, ill, dst, cgtp_flt_pkt, ll_multicast)) { continue; } @@ -13533,24 +13666,6 @@ ip_rput_dlpi(queue_t *q, mblk_t *mp) } /* - * This function is used to free a message that has gone through - * mi_copyin processing which modifies the M_IOCTL mblk's b_next - * and b_prev pointers. We use this function to set b_next/b_prev - * to NULL and free them. - */ -void -ip_ioctl_freemsg(mblk_t *mp) -{ - mblk_t *bp = mp; - - for (; bp != NULL; bp = bp->b_cont) { - bp->b_prev = NULL; - bp->b_next = NULL; - } - freemsg(mp); -} - -/* * Handling of DLPI messages that require exclusive access to the ipsq. * * Need to do ill_pending_mp_release on ioctl completion, which could @@ -14483,7 +14598,7 @@ ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) mp->b_cont->b_prev = mp1->b_cont->b_prev; } - ip_ioctl_freemsg(mp1); + inet_freemsg(mp1); ASSERT(ipsq->ipsq_current_ipif != NULL); ASSERT(connp != NULL); ip_ioctl_finish(CONNP_TO_WQ(connp), mp, @@ -14515,7 +14630,7 @@ ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) mp->b_cont->b_prev = mp1->b_cont->b_prev; } - ip_ioctl_freemsg(mp1); + inet_freemsg(mp1); if (iocp->ioc_error == 0) mp->b_datap->db_type = M_IOCDATA; ASSERT(connp != NULL); @@ -14596,7 +14711,7 @@ ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) mp->b_cont->b_prev = mp1->b_cont->b_prev; } - ip_ioctl_freemsg(mp1); + inet_freemsg(mp1); if (iocp->ioc_error == 0) iocp->ioc_error = EINVAL; ASSERT(connp != NULL); @@ -15321,7 +15436,7 @@ ip_proto_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, */ ASSERT(!mctl_present); ASSERT(first_mp == mp); - if (!ip_rput_fragment(q, &mp, ipha)) { + if (!ip_rput_fragment(q, &mp, ipha, NULL, NULL)) { return; } /* @@ -15337,7 +15452,7 @@ ip_proto_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, * Clear hardware checksumming flag as it is currently only * used by TCP and UDP. */ - mp->b_datap->db_struioun.cksum.flags = 0; + DB_CKSUMFLAGS(mp) = 0; /* Now we have a complete datagram, destined for this machine. */ u1 = IPH_HDR_LENGTH(ipha); @@ -15839,7 +15954,7 @@ ip_rput_local_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire) bad_src_route: q = WR(q); /* make sure we clear any indication of a hardware checksum */ - mp->b_datap->db_struioun.cksum.flags = 0; + DB_CKSUMFLAGS(mp) = 0; icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED); return (B_FALSE); @@ -16022,14 +16137,14 @@ ip_rput_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ipaddr_t *dstp) param_prob: q = WR(q); /* make sure we clear any indication of a hardware checksum */ - mp->b_datap->db_struioun.cksum.flags = 0; + DB_CKSUMFLAGS(mp) = 0; icmp_param_problem(q, mp, (uint8_t)code); return (-1); bad_src_route: q = WR(q); /* make sure we clear any indication of a hardware checksum */ - mp->b_datap->db_struioun.cksum.flags = 0; + DB_CKSUMFLAGS(mp) = 0; icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED); return (-1); } @@ -17571,7 +17686,7 @@ ip_trash_ire_reclaim(void *args) * upper level protocol. 
We remove this conn from any fanout hash list it is * on, and zero out the bind information. No reply is expected up above. */ -static void +mblk_t * ip_unbind(queue_t *q, mblk_t *mp) { conn_t *connp = Q_TO_CONN(q); @@ -17591,7 +17706,7 @@ ip_unbind(queue_t *q, mblk_t *mp) * original message. */ if (mp == NULL) - return; + return (NULL); /* * Don't bzero the ports if its TCP since TCP still needs the @@ -17601,7 +17716,7 @@ ip_unbind(queue_t *q, mblk_t *mp) if (!IPCL_IS_TCP(connp)) bzero(&connp->u_port, sizeof (connp->u_port)); - qreply(q, mp); + return (mp); } /* @@ -17657,7 +17772,9 @@ ip_output(void *arg, mblk_t *mp, void *arg2, int caller) /* is queue flow controlled? */ if ((q->q_first != NULL || connp->conn_draining) && (caller == IP_WPUT)) { - goto doputq; + ASSERT(!need_decref); + (void) putq(q, mp); + return; } /* Multidata transmit? */ @@ -17992,11 +18109,6 @@ standard_path: CONN_DEC_REF(connp); return; -doputq: - ASSERT(!need_decref); - (void) putq(q, mp); - return; - qnext: /* * Upper Level Protocols pass down complete IP datagrams @@ -18933,7 +19045,7 @@ ip_wput(queue_t *q, mblk_t *mp) * the ipif refcnt has gone to zero and holding the ill_g_lock and ill_lock * the above holds. */ -static ipif_t * +ipif_t * conn_get_held_ipif(conn_t *connp, ipif_t **ipifp, int *err) { ipif_t *ipif; @@ -19414,7 +19526,6 @@ ip_wput_ire(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, int caller) boolean_t multirt_send = B_FALSE; int err; zoneid_t zoneid; - boolean_t iphdrhwcksum = B_FALSE; TRACE_1(TR_FAC_IP, TR_IP_WPUT_IRE_START, "ip_wput_ire_start: q %p", q); @@ -19749,102 +19860,6 @@ another:; /* pseudo checksum (do it in parts for IP header checksum) */ cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); -#define FRAGMENT_NEEDED(mtu, size) \ - (((mtu) < (unsigned int)(size)) ? B_TRUE : B_FALSE) - -#define IS_FASTPATH(ire, bp) \ - ((ire)->ire_fp_mp != NULL && \ - (MBLKHEAD((bp)) >= (MBLKL((ire)->ire_fp_mp)))) \ - -#define IPH_UDPH_CHECKSUMP(ipha, hlen) \ - ((uint16_t *)(((uchar_t *)ipha)+(hlen + UDP_CHECKSUM_OFFSET))) -#define IPH_TCPH_CHECKSUMP(ipha, hlen) \ - ((uint16_t *)(((uchar_t *)ipha)+(hlen+TCP_CHECKSUM_OFFSET))) - -#define IP_CKSUM_XMIT(ill, ire, mp, up, proto, hlen, max_frag, \ - ipsec_len) { \ - uint32_t sum; \ - uint32_t xmit_capab = HCKSUM_INET_FULL_V4 | \ - HCKSUM_INET_PARTIAL | HCKSUM_IPHDRCKSUM; \ - boolean_t cksum_offload = B_FALSE; \ - \ - /* \ - * The ire fp mp can change due to the arrival of a \ - * DL_NOTE_FASTPATH_FLUSH in the case of IRE_BROADCAST \ - * and IRE_MIPRTUN. Hence the ire_fp_mp has to be accessed \ - * only under the ire_lock in such cases. \ - */ \ - LOCK_IRE_FP_MP(ire); \ - if ((ill) && (ill->ill_capabilities & ILL_CAPAB_HCKSUM) && \ - (ill->ill_hcksum_capab->ill_hcksum_txflags & \ - xmit_capab) && (!FRAGMENT_NEEDED(max_frag, \ - (LENGTH + ipsec_len))) && (!(ire->ire_flags & \ - RTF_MULTIRT)) && (ipsec_len == 0) && \ - IS_FASTPATH((ire), (mp)) && (dohwcksum)) { \ - /* \ - * Underlying interface supports hardware checksumming. \ - * So postpone the checksum to the interface driver \ - */ \ - \ - if ((hlen) == IP_SIMPLE_HDR_LENGTH) { \ - if (ill->ill_hcksum_capab->ill_hcksum_txflags & \ - HCKSUM_IPHDRCKSUM) { \ - mp->b_datap->db_struioun.cksum.flags |= \ - HCK_IPV4_HDRCKSUM; \ - /* seed the cksum field to 0 */ \ - ipha->ipha_hdr_checksum = 0; \ - iphdrhwcksum = B_TRUE; \ - } \ - /* \ - * If underlying h/w supports full h/w checksumming \ - * and no IP options are present, then offload \ - * full checksumming to the hardware. 
\ - * \ - * If h/w can do partial checksumming then offload \ - * unless the startpoint offset, including mac-header, \ - * is too big for the interface to some of our \ - * hardware (CE and ERI) which have 6 bit fields. \ - * Sigh. \ - * Unhappily we don't have the mac-header size here \ - * so punt for any options. \ - */ \ - if (ill->ill_hcksum_capab->ill_hcksum_txflags & \ - HCKSUM_INET_FULL_V4) { \ - UNLOCK_IRE_FP_MP(ire); \ - /* Seed the checksum field to 0 */ \ - *up = 0; \ - mp->b_datap->db_struioun.cksum.flags |= \ - HCK_FULLCKSUM; \ - cksum_offload = B_TRUE; \ - } else if (ill->ill_hcksum_capab->ill_hcksum_txflags & \ - HCKSUM_INET_PARTIAL) { \ - UNLOCK_IRE_FP_MP(ire); \ - sum = *up + cksum + proto; \ - sum = (sum & 0xFFFF) + (sum >> 16); \ - *up = (sum & 0xFFFF) + (sum >> 16); \ - /* \ - * All offsets are relative to the beginning \ - * of the IP header. \ - */ \ - mp->b_datap->db_cksumstart = hlen; \ - mp->b_datap->db_cksumstuff = \ - (PROTO == IPPROTO_UDP) ? \ - (hlen) + UDP_CHECKSUM_OFFSET : \ - (hlen) + TCP_CHECKSUM_OFFSET; \ - mp->b_datap->db_cksumend = ipha->ipha_length; \ - mp->b_datap->db_struioun.cksum.flags |= \ - HCK_PARTIALCKSUM; \ - cksum_offload = B_TRUE; \ - } \ - } \ - } \ - if (!cksum_offload) { \ - UNLOCK_IRE_FP_MP(ire); \ - IP_STAT(ip_out_sw_cksum); \ - (sum) = IP_CSUM((mp), (hlen), cksum + proto); \ - *(up) = (uint16_t)((sum) ? (sum) : ~(sum)); \ - } \ -} if (!IP_FLOW_CONTROLLED_ULP(PROTO)) { queue_t *dev_q = stq->q_next; @@ -19856,10 +19871,16 @@ another:; (ip_hdr_included != IP_HDR_INCLUDED)) { hlen = (V_HLEN & 0xF) << 2; up = IPH_UDPH_CHECKSUMP(ipha, hlen); - if (*up) { - IP_CKSUM_XMIT(ill, ire, mp, up, - IP_UDP_CSUM_COMP, hlen, max_frag, - ipsec_len); + if (*up != 0) { + IP_CKSUM_XMIT(ill, ire, mp, ipha, up, PROTO, + hlen, LENGTH, max_frag, ipsec_len, cksum); + /* Software checksum? */ + if (DB_CKSUMFLAGS(mp) == 0) { + IP_STAT(ip_out_sw_cksum); + IP_STAT_UPDATE( + ip_udp_out_sw_cksum_bytes, + LENGTH - hlen); + } } } } else if (ip_hdr_included != IP_HDR_INCLUDED) { @@ -19873,8 +19894,14 @@ another:; * replicated via several interfaces, and not all of * them may have this capability. */ - IP_CKSUM_XMIT(ill, ire, mp, up, - IP_TCP_CSUM_COMP, hlen, max_frag, ipsec_len); + IP_CKSUM_XMIT(ill, ire, mp, ipha, up, PROTO, hlen, + LENGTH, max_frag, ipsec_len, cksum); + /* Software checksum? 
*/ + if (DB_CKSUMFLAGS(mp) == 0) { + IP_STAT(ip_out_sw_cksum); + IP_STAT_UPDATE(ip_tcp_out_sw_cksum_bytes, + LENGTH - hlen); + } } else { sctp_hdr_t *sctph; @@ -19904,7 +19931,7 @@ another:; cksum += ttl_protocol; /* fragment the packet */ - if (FRAGMENT_NEEDED(max_frag, (LENGTH + ipsec_len))) + if (max_frag < (uint_t)(LENGTH + ipsec_len)) goto fragmentit; /* * Don't use frag_flag if packet is pre-built or source @@ -19918,8 +19945,8 @@ another:; ipha->ipha_fragment_offset_and_flags |= htons(ire->ire_frag_flag); - if (!iphdrhwcksum) { - /* checksum */ + if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) { + /* calculate IP header checksum */ cksum += ipha->ipha_ident; cksum += (v_hlen_tos_len >> 16)+(v_hlen_tos_len & 0xFFFF); cksum += ipha->ipha_fragment_offset_and_flags; @@ -20258,7 +20285,11 @@ broadcast: hlen = (V_HLEN & 0xF) << 2; up = IPH_TCPH_CHECKSUMP(ipha, hlen); IP_STAT(ip_out_sw_cksum); + IP_STAT_UPDATE(ip_tcp_out_sw_cksum_bytes, + LENGTH - hlen); *up = IP_CSUM(mp, hlen, cksum + IP_TCP_CSUM_COMP); + if (*up == 0) + *up = 0xFFFF; } else if (PROTO == IPPROTO_SCTP && (ip_hdr_included != IP_HDR_INCLUDED)) { sctp_hdr_t *sctph; @@ -20338,17 +20369,18 @@ broadcast: */ hlen = (V_HLEN & 0xF) << 2; up = IPH_UDPH_CHECKSUMP(ipha, hlen); - if (*up) { - uint_t sum; - - /* - * NOTE: watch out for compiler high - * bits - */ - IP_STAT(ip_out_sw_cksum); - sum = IP_CSUM(mp, hlen, - cksum + IP_UDP_CSUM_COMP); - *up = (uint16_t)(sum ? sum : ~sum); + max_frag = ire->ire_max_frag; + if (*up != 0) { + IP_CKSUM_XMIT(ire_ill, ire, mp, ipha, + up, PROTO, hlen, LENGTH, max_frag, + ipsec_len, cksum); + /* Software checksum? */ + if (DB_CKSUMFLAGS(mp) == 0) { + IP_STAT(ip_out_sw_cksum); + IP_STAT_UPDATE( + ip_udp_out_sw_cksum_bytes, + LENGTH - hlen); + } } } } @@ -20369,9 +20401,7 @@ broadcast: conn_multicast_loop)); /* Forget header checksum offload */ - mp->b_datap->db_struioun.cksum.flags &= - ~HCK_IPV4_HDRCKSUM; - iphdrhwcksum = B_FALSE; + DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; /* * Local loopback of multicasts? Check the @@ -20459,10 +20489,8 @@ broadcast: } max_frag = ire->ire_max_frag; cksum += ttl_protocol; - if (!FRAGMENT_NEEDED(max_frag, (LENGTH + ipsec_len))) { + if (max_frag >= (uint_t)(LENGTH + ipsec_len)) { /* No fragmentation required for this one. */ - /* Complete the IP header checksum. */ - cksum += ipha->ipha_ident; /* * Don't use frag_flag if packet is pre-built or source * routed or if multicast (since multicast packets do @@ -20475,26 +20503,32 @@ broadcast: ipha->ipha_fragment_offset_and_flags |= htons(ire->ire_frag_flag); - cksum += (v_hlen_tos_len >> 16)+ - (v_hlen_tos_len & 0xFFFF); - cksum += ipha->ipha_fragment_offset_and_flags; - hlen = (V_HLEN & 0xF) - IP_SIMPLE_HDR_LENGTH_IN_WORDS; - if (hlen) { - checksumoptions: - /* - * Account for the IP Options in the IP - * header checksum. - */ - up = (uint16_t *)(rptr+IP_SIMPLE_HDR_LENGTH); - do { - cksum += up[0]; - cksum += up[1]; - up += 2; - } while (--hlen); - } - cksum = ((cksum & 0xFFFF) + (cksum >> 16)); - cksum = ~(cksum + (cksum >> 16)); - ipha->ipha_hdr_checksum = (uint16_t)cksum; + if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) { + /* Complete the IP header checksum. */ + cksum += ipha->ipha_ident; + cksum += (v_hlen_tos_len >> 16)+ + (v_hlen_tos_len & 0xFFFF); + cksum += ipha->ipha_fragment_offset_and_flags; + hlen = (V_HLEN & 0xF) - + IP_SIMPLE_HDR_LENGTH_IN_WORDS; + if (hlen) { + checksumoptions: + /* + * Account for the IP Options in the IP + * header checksum. 
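*/

/*
 * Equivalent helper form of the option-word accounting above (a
 * hypothetical refactoring, shown only to make the loop's intent
 * explicit): each 32-bit option word adds two 16-bit terms.
 */
static uint32_t
cksum_add_options(uint32_t cksum, uint16_t *up, int option_words)
{
	while (option_words-- > 0) {
		cksum += up[0];
		cksum += up[1];
		up += 2;
	}
	return (cksum);
}

/*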
+					 */
+					up = (uint16_t *)(rptr+
+					    IP_SIMPLE_HDR_LENGTH);
+					do {
+						cksum += up[0];
+						cksum += up[1];
+						up += 2;
+					} while (--hlen);
+				}
+				cksum = ((cksum & 0xFFFF) + (cksum >> 16));
+				cksum = ~(cksum + (cksum >> 16));
+				ipha->ipha_hdr_checksum = (uint16_t)cksum;
+			}
 			if (ipsec_len != 0) {
 				ipsec_out_process(q, first_mp, ire, ill_index);
 				if (!next_mp) {
@@ -20991,6 +21025,298 @@ ip_md_zcopy_attr(multidata_t *mmd, pdesc_t *pd, uint_t flags)
 }
 
 /*
+ * Check if ip_wput_frag_mdt() and ip_wput_frag_mdt_v6() can handle a message
+ * block chain. We could rewrite to handle arbitrary message block chains but
+ * that would make the code complicated and slow. Right now there are three
+ * restrictions:
+ *
+ *	1. The first message block must contain the complete IP header and
+ *	   at least 1 byte of payload data.
+ *	2. At most MULTIDATA_MAX_PBUFS non-empty message blocks are allowed
+ *	   so that we can use a single Multidata message.
+ *	3. No frag must be distributed over two or more message blocks so
+ *	   that we don't need more than two packet descriptors per frag.
+ *
+ * The above restrictions allow us to support userland applications (which
+ * will send down a single message block) and NFS over UDP (which will
+ * send down a chain of at most three message blocks).
+ *
+ * We also don't use MDT for payloads with less than or equal to
+ * ip_wput_frag_mdt_min bytes because it would cause too much overhead.
+ */
+boolean_t
+ip_can_frag_mdt(mblk_t *mp, ssize_t hdr_len, ssize_t len)
+{
+	int	blocks;
+	ssize_t	total, missing, size;
+
+	ASSERT(mp != NULL);
+	ASSERT(hdr_len > 0);
+
+	size = MBLKL(mp) - hdr_len;
+	if (size <= 0)
+		return (B_FALSE);
+
+	/* The first mblk contains the header and some payload. */
+	blocks = 1;
+	total = size;
+	size %= len;
+	missing = (size == 0) ? 0 : (len - size);
+	mp = mp->b_cont;
+
+	while (mp != NULL) {
+		/*
+		 * Give up if we encounter a zero length message block.
+		 * In practice, this should rarely happen and is therefore
+		 * not worth the trouble of freeing and re-linking the
+		 * mblk from the chain to handle such a case.
+		 */
+		if ((size = MBLKL(mp)) == 0)
+			return (B_FALSE);
+
+		/* Too many payload buffers for a single Multidata message? */
+		if (++blocks > MULTIDATA_MAX_PBUFS)
+			return (B_FALSE);
+
+		total += size;
+		/* Is a frag distributed over two or more message blocks? */
+		if (missing > size)
+			return (B_FALSE);
+		size -= missing;
+
+		size %= len;
+		missing = (size == 0) ? 0 : (len - size);
+
+		mp = mp->b_cont;
+	}
+
+	return (total > ip_wput_frag_mdt_min);
+}
+
+/*
+ * Outbound IPv4 fragmentation routine using MDT.
+ */
+static void
+ip_wput_frag_mdt(ire_t *ire, mblk_t *mp, ip_pkt_t pkt_type, int len,
+    uint32_t frag_flag, int offset)
+{
+	ipha_t		*ipha_orig;
+	int		i1, ip_data_end;
+	uint_t		pkts, wroff, hdr_chunk_len, pbuf_idx;
+	mblk_t		*hdr_mp, *md_mp = NULL;
+	unsigned char	*hdr_ptr, *pld_ptr;
+	multidata_t	*mmd;
+	ip_pdescinfo_t	pdi;
+
+	ASSERT(DB_TYPE(mp) == M_DATA);
+	ASSERT(MBLKL(mp) > sizeof (ipha_t));
+
+	ipha_orig = (ipha_t *)mp->b_rptr;
+	mp->b_rptr += sizeof (ipha_t);
+
+	/* Calculate how many packets we will send out */
+	i1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgsize(mp);
+	pkts = (i1 + len - 1) / len;
+	ASSERT(pkts > 1);
+
+	/* Allocate a message block which will hold all the IP Headers. */
+	wroff = ip_wroff_extra;
+	hdr_chunk_len = wroff + IP_SIMPLE_HDR_LENGTH;
+
+	i1 = pkts * hdr_chunk_len;
+	/*
+	 * Create the header buffer, Multidata and destination address
+	 * and SAP attribute that should be associated with it.
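
The eligibility walk in ip_can_frag_mdt() above reduces to modular arithmetic over the per-buffer payload sizes. A stand-alone sketch of the same invariant (the helper name and sizes are hypothetical; the MULTIDATA_MAX_PBUFS and minimum-payload checks are omitted, and len is assumed positive):

#include <stddef.h>

/*
 * Mirror of the ip_can_frag_mdt() walk: 'sizes' holds the payload
 * byte count of each buffer in the chain (header already excluded
 * from the first); return 0 if any fragment of length 'len' would
 * straddle two buffers, 1 otherwise.
 */
static int
frags_fit(const size_t *sizes, int nbufs, size_t len)
{
	size_t size, missing = 0;
	int i;

	for (i = 0; i < nbufs; i++) {
		size = sizes[i];
		if (size == 0 || missing > size)
			return (0);
		size -= missing;	/* finish the straddling frag */
		size %= len;		/* whole frags consume the rest */
		missing = (size == 0) ? 0 : (len - size);
	}
	return (1);
}
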
+ */ + if ((hdr_mp = allocb(i1, BPRI_HI)) == NULL || + ((hdr_mp->b_wptr += i1), + (mmd = mmd_alloc(hdr_mp, &md_mp, KM_NOSLEEP)) == NULL) || + !ip_md_addr_attr(mmd, NULL, ire->ire_dlureq_mp)) { + freemsg(mp); + if (md_mp == NULL) { + freemsg(hdr_mp); + } else { +free_mmd: IP_STAT(ip_frag_mdt_discarded); + freemsg(md_mp); + } + IP_STAT(ip_frag_mdt_allocfail); + UPDATE_MIB(&ip_mib, ipOutDiscards, pkts); + return; + } + IP_STAT(ip_frag_mdt_allocd); + + /* + * Add a payload buffer to the Multidata; this operation must not + * fail, or otherwise our logic in this routine is broken. There + * is no memory allocation done by the routine, so any returned + * failure simply tells us that we've done something wrong. + * + * A failure tells us that either we're adding the same payload + * buffer more than once, or we're trying to add more buffers than + * allowed. None of the above cases should happen, and we panic + * because either there's horrible heap corruption, and/or + * programming mistake. + */ + if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) + goto pbuf_panic; + + hdr_ptr = hdr_mp->b_rptr; + pld_ptr = mp->b_rptr; + + /* Establish the ending byte offset, based on the starting offset. */ + offset <<= 3; + ip_data_end = offset + ntohs(ipha_orig->ipha_length) - + IP_SIMPLE_HDR_LENGTH; + + pdi.flags = PDESC_HBUF_REF | PDESC_PBUF_REF; + + while (pld_ptr < mp->b_wptr) { + ipha_t *ipha; + uint16_t offset_and_flags; + uint16_t ip_len; + int error; + + ASSERT((hdr_ptr + hdr_chunk_len) <= hdr_mp->b_wptr); + ipha = (ipha_t *)(hdr_ptr + wroff); + ASSERT(OK_32PTR(ipha)); + *ipha = *ipha_orig; + + if (ip_data_end - offset > len) { + offset_and_flags = IPH_MF; + } else { + /* + * Last frag. Set len to the length of this last piece. + */ + len = ip_data_end - offset; + /* A frag of a frag might have IPH_MF non-zero */ + offset_and_flags = + ntohs(ipha->ipha_fragment_offset_and_flags) & + IPH_MF; + } + offset_and_flags |= (uint16_t)(offset >> 3); + offset_and_flags |= (uint16_t)frag_flag; + /* Store the offset and flags in the IP header. */ + ipha->ipha_fragment_offset_and_flags = htons(offset_and_flags); + + /* Store the length in the IP header. */ + ip_len = (uint16_t)(len + IP_SIMPLE_HDR_LENGTH); + ipha->ipha_length = htons(ip_len); + + /* + * Set the IP header checksum. Note that mp is just + * the header, so this is easy to pass to ip_csum. + */ + ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); + + /* + * Record offset and size of header and data of the next packet + * in the multidata message. + */ + PDESC_HDR_ADD(&pdi, hdr_ptr, wroff, IP_SIMPLE_HDR_LENGTH, 0); + PDESC_PLD_INIT(&pdi); + i1 = MIN(mp->b_wptr - pld_ptr, len); + ASSERT(i1 > 0); + PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, pld_ptr, i1); + if (i1 == len) { + pld_ptr += len; + } else { + i1 = len - i1; + mp = mp->b_cont; + ASSERT(mp != NULL); + ASSERT(MBLKL(mp) >= i1); + /* + * Attach the next payload message block to the + * multidata message. + */ + if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) + goto pbuf_panic; + PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, mp->b_rptr, i1); + pld_ptr = mp->b_rptr + i1; + } + + if ((mmd_addpdesc(mmd, (pdescinfo_t *)&pdi, &error, + KM_NOSLEEP)) == NULL) { + /* + * Any failure other than ENOMEM indicates that we + * have passed in invalid pdesc info or parameters + * to mmd_addpdesc, which must not happen. + * + * EINVAL is a result of failure on boundary checks + * against the pdesc info contents. It should not + * happen, and we panic because either there's + * horrible heap corruption, and/or programming + * mistake. 
+ */ + if (error != ENOMEM) { + cmn_err(CE_PANIC, "ip_wput_frag_mdt: " + "pdesc logic error detected for " + "mmd %p pinfo %p (%d)\n", + (void *)mmd, (void *)&pdi, error); + /* NOTREACHED */ + } + IP_STAT(ip_frag_mdt_addpdescfail); + /* Free unattached payload message blocks as well */ + md_mp->b_cont = mp->b_cont; + goto free_mmd; + } + + /* Advance fragment offset. */ + offset += len; + + /* Advance to location for next header in the buffer. */ + hdr_ptr += hdr_chunk_len; + + /* Did we reach the next payload message block? */ + if (pld_ptr == mp->b_wptr && mp->b_cont != NULL) { + mp = mp->b_cont; + /* + * Attach the next message block with payload + * data to the multidata message. + */ + if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) + goto pbuf_panic; + pld_ptr = mp->b_rptr; + } + } + + ASSERT(hdr_mp->b_wptr == hdr_ptr); + ASSERT(mp->b_wptr == pld_ptr); + + /* Update IP statistics */ + UPDATE_MIB(&ip_mib, ipFragCreates, pkts); + BUMP_MIB(&ip_mib, ipFragOKs); + IP_STAT_UPDATE(ip_frag_mdt_pkt_out, pkts); + + if (pkt_type == OB_PKT) { + ire->ire_ob_pkt_count += pkts; + if (ire->ire_ipif != NULL) + atomic_add_32(&ire->ire_ipif->ipif_ob_pkt_count, pkts); + } else { + /* + * The type is IB_PKT in the forwarding path and in + * the mobile IP case when the packet is being reverse- + * tunneled to the home agent. + */ + ire->ire_ib_pkt_count += pkts; + ASSERT(!IRE_IS_LOCAL(ire)); + if (ire->ire_type & IRE_BROADCAST) + atomic_add_32(&ire->ire_ipif->ipif_ib_pkt_count, pkts); + else + atomic_add_32(&ire->ire_ipif->ipif_fo_pkt_count, pkts); + } + ire->ire_last_used_time = lbolt; + /* Send it down */ + putnext(ire->ire_stq, md_mp); + return; + +pbuf_panic: + cmn_err(CE_PANIC, "ip_wput_frag_mdt: payload buffer logic " + "error for mmd %p pbuf %p (%d)", (void *)mmd, (void *)mp, + pbuf_idx); + /* NOTREACHED */ +} + +/* * Outbound IP fragmentation routine. 
* * NOTE : This routine does not ire_refrele the ire that is passed in @@ -21000,29 +21326,30 @@ static void ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, uint32_t frag_flag) { - int i1; - mblk_t *ll_hdr_mp; - int ll_hdr_len; - int hdr_len; - mblk_t *hdr_mp; - ipha_t *ipha; - int ip_data_end; - int len; - mblk_t *mp = mp_orig; - int offset; - queue_t *q; + int i1; + mblk_t *ll_hdr_mp; + int ll_hdr_len; + int hdr_len; + mblk_t *hdr_mp; + ipha_t *ipha; + int ip_data_end; + int len; + mblk_t *mp = mp_orig; + int offset; + queue_t *q; uint32_t v_hlen_tos_len; - mblk_t *first_mp; - boolean_t mctl_present; - mblk_t *xmit_mp; - mblk_t *carve_mp; - ire_t *ire1 = NULL; - ire_t *save_ire = NULL; - mblk_t *next_mp = NULL; - boolean_t last_frag = B_FALSE; - boolean_t multirt_send = B_FALSE; - ire_t *first_ire = NULL; - irb_t *irb = NULL; + mblk_t *first_mp; + boolean_t mctl_present; + ill_t *ill; + mblk_t *xmit_mp; + mblk_t *carve_mp; + ire_t *ire1 = NULL; + ire_t *save_ire = NULL; + mblk_t *next_mp = NULL; + boolean_t last_frag = B_FALSE; + boolean_t multirt_send = B_FALSE; + ire_t *first_ire = NULL; + irb_t *irb = NULL; TRACE_0(TR_FAC_IP, TR_IP_WPUT_FRAG_START, "ip_wput_frag_start:"); @@ -21036,6 +21363,7 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, mctl_present = B_FALSE; } + ASSERT(MBLKL(mp) >= sizeof (ipha_t)); ipha = (ipha_t *)mp->b_rptr; /* @@ -21079,8 +21407,37 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, } hdr_len = (V_HLEN & 0xF) << 2; + ipha->ipha_hdr_checksum = 0; + /* + * Establish the number of bytes maximum per frag, after putting + * in the header. + */ + len = (max_frag - hdr_len) & ~7; + + /* Check if we can use MDT to send out the frags. */ + ASSERT(!IRE_IS_LOCAL(ire)); + if (hdr_len == IP_SIMPLE_HDR_LENGTH && ip_multidata_outbound && + !(ire->ire_flags & RTF_MULTIRT) && !IPP_ENABLED(IPP_LOCAL_OUT) && + (ill = ire_to_ill(ire)) != NULL && ILL_MDT_CAPABLE(ill) && + IP_CAN_FRAG_MDT(mp, IP_SIMPLE_HDR_LENGTH, len)) { + ASSERT(ill->ill_mdt_capab != NULL); + if (!ill->ill_mdt_capab->ill_mdt_on) { + /* + * If MDT has been previously turned off in the past, + * and we currently can do MDT (due to IPQoS policy + * removal, etc.) then enable it for this interface. + */ + ill->ill_mdt_capab->ill_mdt_on = 1; + ip1dbg(("ip_wput_frag: enabled MDT for interface %s\n", + ill->ill_name)); + } + ip_wput_frag_mdt(ire, mp, pkt_type, len, frag_flag, + offset); + return; + } + /* Get a copy of the header for the trailing frags */ hdr_mp = ip_wput_frag_copyhdr((uchar_t *)ipha, hdr_len, offset); if (!hdr_mp) { @@ -21100,12 +21457,6 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, offset <<= 3; ip_data_end = offset + ntohs(ipha->ipha_length) - hdr_len; - /* - * Establish the number of bytes maximum per frag, after putting - * in the header. - */ - len = (max_frag - hdr_len) & ~7; - /* Store the length of the first fragment in the IP header. */ i1 = len + hdr_len; ASSERT(i1 <= IP_MAXPACKET); @@ -22565,8 +22916,6 @@ ip_wput_ipsec_out(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, ill_t *ill, zoneid_t zoneid; uint32_t cksum; uint16_t *up; - /* Hack until the UDP merge into IP happens. 
*/ - extern boolean_t udp_compute_checksum(void); #ifdef _BIG_ENDIAN #define LENGTH (v_hlen_tos_len & 0xFFFF) #else @@ -22741,6 +23090,8 @@ send: offset = IP_SIMPLE_HDR_LENGTH + UDP_CHECKSUM_OFFSET; IP_STAT(ip_out_sw_cksum); + IP_STAT_UPDATE(ip_udp_out_sw_cksum_bytes, + ntohs(htons(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH)); #define iphs ((uint16_t *)ipha) cksum = IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] + iphs[9] + ntohs(htons(ipha->ipha_length) - @@ -23790,10 +24141,10 @@ ip_ioctl_finish(queue_t *q, mblk_t *mp, int err, int mode, void ip_resume_tcp_bind(void *arg, mblk_t *mp, void *arg2) { - conn_t *connp = (conn_t *)arg; + conn_t *connp = arg; tcp_t *tcp; - ASSERT(connp != NULL && connp->conn_tcp != NULL); + ASSERT(connp != NULL && IPCL_IS_TCP(connp) && connp->conn_tcp != NULL); tcp = connp->conn_tcp; if (connp->conn_tcp->tcp_state == TCPS_CLOSED) @@ -23801,7 +24152,6 @@ ip_resume_tcp_bind(void *arg, mblk_t *mp, void *arg2) else tcp_rput_other(tcp, mp); CONN_OPER_PENDING_DONE(connp); - } /* Called from ip_wput for all non data messages */ @@ -24031,31 +24381,48 @@ nak: case T_BIND_REQ: { /* Request can get queued in bind */ ASSERT(connp != NULL); + /* + * Both TCP and UDP call ip_bind_{v4,v6}() directly + * instead of going through this path. We only get + * here in the following cases: + * + * a. Bind retries, where ipsq is non-NULL. + * b. T_BIND_REQ is issued from non TCP/UDP + * transport, e.g. icmp for raw socket, + * in which case ipsq will be NULL. + */ + ASSERT(ipsq != NULL || + (!IPCL_IS_TCP(connp) && !IPCL_IS_UDP(connp))); + /* Don't increment refcnt if this is a re-entry */ if (ipsq == NULL) CONN_INC_REF(connp); - mp = connp->conn_af_isv6 ? - ip_bind_v6(q, mp, connp, NULL) : - ip_bind_v4(q, mp, connp); - if (mp != NULL) { - tcp_t *tcp; - - tcp = connp->conn_tcp; - if (tcp != NULL) { - if (ipsq == NULL) { - tcp_rput_other(tcp, mp); - } else { - CONN_INC_REF(connp); - squeue_fill(connp->conn_sqp, mp, - ip_resume_tcp_bind, - connp, SQTAG_TCP_RPUTOTHER); - return; - } - } else { - qreply(q, mp); - } - CONN_OPER_PENDING_DONE(connp); + mp = connp->conn_af_isv6 ? 
ip_bind_v6(q, mp, + connp, NULL) : ip_bind_v4(q, mp, connp); + if (mp == NULL) + return; + if (IPCL_IS_TCP(connp)) { + /* + * In the case of TCP endpoint we + * come here only for bind retries + */ + ASSERT(ipsq != NULL); + CONN_INC_REF(connp); + squeue_fill(connp->conn_sqp, mp, + ip_resume_tcp_bind, connp, + SQTAG_BIND_RETRY); + return; + } else if (IPCL_IS_UDP(connp)) { + /* + * In the case of UDP endpoint we + * come here only for bind retries + */ + ASSERT(ipsq != NULL); + udp_resume_bind(connp, mp); + return; } + qreply(q, mp); + CONN_OPER_PENDING_DONE(connp); return; } case T_SVR4_OPTMGMT_REQ: @@ -24111,7 +24478,8 @@ nak: } return; case T_UNBIND_REQ: - ip_unbind(q, mp); + mp = ip_unbind(q, mp); + qreply(q, mp); return; default: /* diff --git a/usr/src/uts/common/inet/ip/ip6.c b/usr/src/uts/common/inet/ip/ip6.c index d701d170d1..8788d12aa6 100644 --- a/usr/src/uts/common/inet/ip/ip6.c +++ b/usr/src/uts/common/inet/ip/ip6.c @@ -58,6 +58,7 @@ #include <sys/policy.h> #include <net/if.h> #include <net/if_arp.h> +#include <net/if_types.h> #include <net/route.h> #include <net/if_dl.h> #include <sys/sockio.h> @@ -74,9 +75,12 @@ #include <inet/snmpcom.h> #include <inet/ip.h> +#include <inet/ip_impl.h> #include <inet/ip6.h> #include <inet/ip6_asp.h> #include <inet/tcp.h> +#include <inet/tcp_impl.h> +#include <inet/udp_impl.h> #include <inet/ipp_common.h> #include <inet/ip_multi.h> @@ -103,20 +107,51 @@ extern squeue_func_t ip_input_proc; /* * IP statistics. */ -#define IP6_STAT(x) (ip6_statistics.x.value.ui64++) +#define IP6_STAT(x) (ip6_statistics.x.value.ui64++) +#define IP6_STAT_UPDATE(x, n) (ip6_statistics.x.value.ui64 += (n)) typedef struct ip6_stat { kstat_named_t ip6_udp_fast_path; kstat_named_t ip6_udp_slow_path; kstat_named_t ip6_udp_fannorm; kstat_named_t ip6_udp_fanmb; + kstat_named_t ip6_out_sw_cksum; + kstat_named_t ip6_in_sw_cksum; + kstat_named_t ip6_tcp_in_full_hw_cksum_err; + kstat_named_t ip6_tcp_in_part_hw_cksum_err; + kstat_named_t ip6_tcp_in_sw_cksum_err; + kstat_named_t ip6_tcp_out_sw_cksum_bytes; + kstat_named_t ip6_udp_in_full_hw_cksum_err; + kstat_named_t ip6_udp_in_part_hw_cksum_err; + kstat_named_t ip6_udp_in_sw_cksum_err; + kstat_named_t ip6_udp_out_sw_cksum_bytes; + kstat_named_t ip6_frag_mdt_pkt_out; + kstat_named_t ip6_frag_mdt_discarded; + kstat_named_t ip6_frag_mdt_allocfail; + kstat_named_t ip6_frag_mdt_addpdescfail; + kstat_named_t ip6_frag_mdt_allocd; } ip6_stat_t; static ip6_stat_t ip6_statistics = { - { "ip6_udp_fast_path", KSTAT_DATA_UINT64 }, - { "ip6_udp_slow_path", KSTAT_DATA_UINT64 }, - { "ip6_udp_fannorm", KSTAT_DATA_UINT64 }, - { "ip6_udp_fanmb", KSTAT_DATA_UINT64 }, + { "ip6_udp_fast_path", KSTAT_DATA_UINT64 }, + { "ip6_udp_slow_path", KSTAT_DATA_UINT64 }, + { "ip6_udp_fannorm", KSTAT_DATA_UINT64 }, + { "ip6_udp_fanmb", KSTAT_DATA_UINT64 }, + { "ip6_out_sw_cksum", KSTAT_DATA_UINT64 }, + { "ip6_in_sw_cksum", KSTAT_DATA_UINT64 }, + { "ip6_tcp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 }, + { "ip6_tcp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 }, + { "ip6_tcp_in_sw_cksum_err", KSTAT_DATA_UINT64 }, + { "ip6_tcp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, + { "ip6_udp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 }, + { "ip6_udp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 }, + { "ip6_udp_in_sw_cksum_err", KSTAT_DATA_UINT64 }, + { "ip6_udp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, + { "ip6_frag_mdt_pkt_out", KSTAT_DATA_UINT64 }, + { "ip6_frag_mdt_discarded", KSTAT_DATA_UINT64 }, + { "ip6_frag_mdt_allocfail", KSTAT_DATA_UINT64 }, + { "ip6_frag_mdt_addpdescfail", 
KSTAT_DATA_UINT64 }, + { "ip6_frag_mdt_allocd", KSTAT_DATA_UINT64 }, }; static kstat_t *ip6_kstat; @@ -221,7 +256,7 @@ static void ip_fanout_udp_v6(queue_t *, mblk_t *, ip6_t *, uint32_t, static int ip_process_options_v6(queue_t *, mblk_t *, ip6_t *, uint8_t *, uint_t, uint8_t); static mblk_t *ip_rput_frag_v6(queue_t *, mblk_t *, ip6_t *, - ip6_frag_t *, uint_t, uint_t *); + ip6_frag_t *, uint_t, uint_t *, uint32_t *, uint16_t *); static boolean_t ip_source_routed_v6(ip6_t *, mblk_t *); static void ip_wput_ire_v6(queue_t *, mblk_t *, ire_t *, int, int, conn_t *, int, int, int); @@ -2302,7 +2337,8 @@ ip_bind_v6(queue_t *q, mblk_t *mp, conn_t *connp, ip6_pkt_t *ipp) connp->conn_recv = tcp_input; } /* Update qinfo if v4/v6 changed */ - if ((orig_pkt_isv6 != connp->conn_pkt_isv6) && !IS_TCP_CONN(connp)) { + if ((orig_pkt_isv6 != connp->conn_pkt_isv6) && + !(IPCL_IS_TCP(connp) || IPCL_IS_UDP(connp))) { if (connp->conn_pkt_isv6) ip_setqinfo(RD(q), IPV6_MINOR, B_TRUE); else @@ -2531,7 +2567,6 @@ ip_bind_connected_resume_v6(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) { conn_t *connp = NULL; - tcp_t *tcp; t_scalar_t prim; ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); @@ -2543,24 +2578,24 @@ ip_bind_connected_resume_v6(ipsq_t *ipsq, queue_t *q, mblk_t *mp, prim = ((union T_primitives *)mp->b_rptr)->type; ASSERT(prim == O_T_BIND_REQ || prim == T_BIND_REQ); - tcp = connp->conn_tcp; - if (tcp != NULL) { + if (IPCL_IS_TCP(connp)) { /* Pass sticky_ipp for scope_id and pktinfo */ - mp = ip_bind_v6(q, mp, connp, &tcp->tcp_sticky_ipp); + mp = ip_bind_v6(q, mp, connp, &connp->conn_tcp->tcp_sticky_ipp); } else { /* For UDP and ICMP */ mp = ip_bind_v6(q, mp, connp, NULL); } if (mp != NULL) { - if (tcp != NULL) { + if (IPCL_IS_TCP(connp)) { CONN_INC_REF(connp); - squeue_fill(connp->conn_sqp, mp, - ip_resume_tcp_bind, connp, SQTAG_TCP_RPUTOTHER); - return; + squeue_fill(connp->conn_sqp, mp, ip_resume_tcp_bind, + connp, SQTAG_TCP_RPUTOTHER); + } else if (IPCL_IS_UDP(connp)) { + udp_resume_bind(connp, mp); } else { qreply(q, mp); + CONN_OPER_PENDING_DONE(connp); } - CONN_OPER_PENDING_DONE(connp); } } @@ -2719,7 +2754,7 @@ ip_bind_connected_v6(conn_t *connp, mblk_t *mp, in6_addr_t *v6src, if (ip_multidata_outbound && !ipsec_policy_set && dst_ire != NULL && !(dst_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST)) && (md_ill = ire_to_ill(dst_ire), md_ill != NULL) && - (md_ill->ill_capabilities & ILL_CAPAB_MDT)) { + ILL_MDT_CAPABLE(md_ill)) { md_dst_ire = dst_ire; IRE_REFHOLD(md_dst_ire); } @@ -2936,7 +2971,7 @@ ip_bind_connected_v6(conn_t *connp, mblk_t *mp, in6_addr_t *v6src, */ error = ipcl_conn_insert_v6(connp, protocol, v6src, v6dst, connp->conn_ports, - IS_TCP_CONN(connp) ? connp->conn_tcp->tcp_bound_if : 0); + IPCL_IS_TCP(connp) ? connp->conn_tcp->tcp_bound_if : 0); } if (error == 0) { connp->conn_fully_bound = B_TRUE; @@ -3411,8 +3446,7 @@ ip_fanout_tcp_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ill_t *ill, ill_t *inill, ASSERT((dp->db_struioflag & STRUIO_IP) == 0); /* Initiate IPPf processing, if needed. 
*/ - if (IPP_ENABLED(IPP_LOCAL_IN) && - (flags & (IP6_NO_IPPOLICY|IP6_IN_NOCKSUM))) { + if (IPP_ENABLED(IPP_LOCAL_IN) && (flags & IP6_NO_IPPOLICY)) { ill_index = ill->ill_phyint->phyint_ifindex; ip_process(IPP_LOCAL_IN, &first_mp, ill_index); if (first_mp == NULL) { @@ -3447,14 +3481,14 @@ ip_fanout_tcp_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ill_t *ill, ill_t *inill, } mp->b_datap->db_struioflag |= STRUIO_EAGER; - mp->b_datap->db_cksumstart = (intptr_t)sqp; + DB_CKSUMSTART(mp) = (intptr_t)sqp; /* * db_cksumstuff is unused in the incoming * path; Thus store the ifindex here. It will * be cleared in tcp_conn_create_v6(). */ - mp->b_datap->db_cksumstuff = + DB_CKSUMSTUFF(mp) = (intptr_t)ill->ill_phyint->phyint_ifindex; syn_present = B_TRUE; } @@ -3587,7 +3621,6 @@ ip_fanout_udp_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, uint32_t ports, ill_t *ill, ill_t *inill, uint_t flags, boolean_t mctl_present, zoneid_t zoneid) { - queue_t *rq; uint32_t dstport, srcport; in6_addr_t dst; mblk_t *first_mp; @@ -3637,9 +3670,8 @@ ip_fanout_udp_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, uint32_t ports, /* Found a client */ CONN_INC_REF(connp); mutex_exit(&connfp->connf_lock); - rq = connp->conn_rq; - if (!canputnext(rq)) { + if (CONN_UDP_FLOWCTLD(connp)) { freemsg(first_mp); BUMP_MIB(ill->ill_ip6_mib, udpInOverflows); CONN_DEC_REF(connp); @@ -3691,7 +3723,10 @@ ip_fanout_udp_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, uint32_t ports, } } BUMP_MIB(ill->ill_ip6_mib, ipv6InDelivers); - putnext(rq, mp); + + /* Send it upstream */ + CONN_UDP_RECV(connp, mp); + IP6_STAT(ip6_udp_fannorm); CONN_DEC_REF(connp); if (mctl_present) @@ -3746,7 +3781,6 @@ ip_fanout_udp_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, uint32_t ports, mp1 = mctl_present ? first_mp1->b_cont : first_mp1; CONN_INC_REF(connp); mutex_exit(&connfp->connf_lock); - rq = connp->conn_rq; /* * For link-local always add ifindex so that transport * can set sin6_scope_id. Avoid it for ICMP error @@ -3762,7 +3796,7 @@ ip_fanout_udp_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, uint32_t ports, BUMP_MIB(ill->ill_ip6_mib, ipv6InDiscards); goto next_one; } - if (!canputnext(rq)) { + if (CONN_UDP_FLOWCTLD(connp)) { BUMP_MIB(ill->ill_ip6_mib, udpInOverflows); freemsg(mp1); goto next_one; @@ -3778,7 +3812,9 @@ ip_fanout_udp_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, uint32_t ports, if (mctl_present) freeb(first_mp1); BUMP_MIB(ill->ill_ip6_mib, ipv6InDelivers); - putnext(rq, mp1); + + /* Send it upstream */ + CONN_UDP_RECV(connp, mp1); } next_one: mutex_enter(&connfp->connf_lock); @@ -3791,7 +3827,6 @@ next_one: /* Last one. Send it upstream. 
*/ mutex_exit(&connfp->connf_lock); - rq = connp->conn_rq; /* Initiate IPPF processing */ if (IP6_IN_IPP(flags)) { @@ -3830,7 +3865,7 @@ next_one: first_mp = mp; } } - if (!canputnext(rq)) { + if (CONN_UDP_FLOWCTLD(connp)) { BUMP_MIB(ill->ill_ip6_mib, udpInOverflows); freemsg(mp); } else { @@ -3844,7 +3879,9 @@ next_one: } } BUMP_MIB(ill->ill_ip6_mib, ipv6InDelivers); - putnext(rq, mp); + + /* Send it upstream */ + CONN_UDP_RECV(connp, mp); } IP6_STAT(ip6_udp_fanmb); CONN_DEC_REF(connp); @@ -6447,7 +6484,7 @@ ip_rput_v6(queue_t *q, mblk_t *mp) */ if ((mp->b_datap->db_type != M_PCPROTO) || (dl->dl_primitive == DL_UNITDATA_IND)) { - ip_ioctl_freemsg(mp); + inet_freemsg(mp); return; } } @@ -6835,14 +6872,16 @@ ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h, mblk_t *first_mp1; boolean_t no_forward; ip6_hbh_t *hbhhdr; - boolean_t no_cksum = (flags & IP6_IN_NOCKSUM); boolean_t ll_multicast = (flags & IP6_IN_LLMCAST); conn_t *connp; - int off; ilm_t *ilm; uint32_t ports; uint_t ipif_id = 0; zoneid_t zoneid = GLOBAL_ZONEID; + uint16_t hck_flags, reass_hck_flags; + uint32_t reass_sum; + boolean_t cksum_err; + mblk_t *mp1; EXTRACT_PKT_MP(mp, first_mp, mctl_present); @@ -6899,11 +6938,14 @@ ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h, pkt_len -= diff; } - /* - * XXX When zero-copy support is added, this turning off of - * checksum flag will need to be done more selectively. - */ - mp->b_datap->db_struioun.cksum.flags &= ~HCK_PARTIALCKSUM; + if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum) + hck_flags = DB_CKSUMFLAGS(mp); + else + hck_flags = 0; + + /* Clear checksum flags in case we need to forward */ + DB_CKSUMFLAGS(mp) = 0; + reass_sum = reass_hck_flags = 0; nexthdr = ip6h->ip6_nxt; @@ -7168,7 +7210,6 @@ ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h, /* TBD add site-local check at site boundary? */ } else if (ipv6_send_redirects) { in6_addr_t *v6targ; - mblk_t *mp1; in6_addr_t gw_addr_v6; ire_t *src_ire_v6 = NULL; @@ -7313,7 +7354,6 @@ ipv6forus: case IPPROTO_TCP: { uint16_t *up; uint32_t sum; - dblk_t *dp; int offset; hdr_len = pkt_len - remlen; @@ -7336,6 +7376,7 @@ ipv6forus: freemsg(first_mp); return; } + hck_flags = 0; ip6h = (ip6_t *)mp->b_rptr; whereptr = (uint8_t *)ip6h + hdr_len; } @@ -7368,30 +7409,12 @@ ipv6forus: freemsg(first_mp); return; } + hck_flags = 0; ip6h = (ip6_t *)mp->b_rptr; whereptr = (uint8_t *)ip6h + hdr_len; } } - /* - * If packet is being looped back locally checksums - * aren't used - */ - if (no_cksum) { - if (mp->b_datap->db_type == M_DATA) { - /* - * M_DATA mblk, so init mblk (chain) - * for no struio(). - */ - mblk_t *mp1 = mp; - - do { - mp1->b_datap->db_struioflag = 0; - } while ((mp1 = mp1->b_cont) != NULL); - } - goto tcp_fanout; - } - up = (uint16_t *)&ip6h->ip6_src; /* * TCP checksum calculation. First sum up the @@ -7400,44 +7423,38 @@ ipv6forus: * - Destination IPv6 address * - TCP payload length * - TCP protocol ID - * XXX need zero-copy support here */ sum = htons(IPPROTO_TCP + remlen) + up[0] + up[1] + up[2] + up[3] + up[4] + up[5] + up[6] + up[7] + up[8] + up[9] + up[10] + up[11] + up[12] + up[13] + up[14] + up[15]; + + /* Fold initial sum */ sum = (sum & 0xffff) + (sum >> 16); - dp = mp->b_datap; - if (dp->db_type != M_DATA || dp->db_ref > 1) { - /* - * Not M_DATA mblk or its a dup, so do the - * checksum now. 
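
The IPv6 pseudo-header seed summed a few lines above can be packaged as a helper. A user-land sketch, not the kernel's code (the helper name is made up); the kernel folds protocol and payload length into a single htons() term, which is equivalent once the sum is folded:

#include <stdint.h>
#include <arpa/inet.h>
#include <netinet/in.h>

/*
 * Folded pseudo-header seed for a TCP or UDP checksum over IPv6:
 * the one's-complement sum of source and destination addresses
 * (sixteen 16-bit words in network order) plus the next-header
 * value and the upper-layer payload length.
 */
static uint32_t
pseudo_sum_v6(const struct in6_addr *src, const struct in6_addr *dst,
    uint8_t proto, uint16_t paylen)
{
	const uint16_t *up = (const uint16_t *)src;
	uint32_t sum = htons(proto) + htons(paylen);
	int i;

	for (i = 0; i < 8; i++)
		sum += up[i];
	up = (const uint16_t *)dst;
	for (i = 0; i < 8; i++)
		sum += up[i];
	return ((sum & 0xffff) + (sum >> 16));	/* fold once */
}
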
- */ - sum = IP_CSUM(mp, hdr_len, sum); - if (sum) { - /* checksum failed */ - ip1dbg(("ip_rput_data_v6: TCP checksum" - " failed %x off %d\n", - sum, hdr_len)); - BUMP_MIB(&ip_mib, tcpInErrs); - freemsg(first_mp); - return; - } - } else { - /* - * M_DATA mblk and not a dup - * compute checksum here - */ - off = (int)(whereptr - mp->b_rptr); - if (IP_CSUM(mp, off, sum)) { - BUMP_MIB(&ip_mib, tcpInErrs); - ipcsumdbg("ip_rput_data_v6 " - "swcksumerr\n", mp); - freemsg(first_mp); - return; - } + mp1 = mp->b_cont; + + if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0) + IP6_STAT(ip6_in_sw_cksum); + + IP_CKSUM_RECV(hck_flags, sum, (uchar_t *) + ((uchar_t *)mp->b_rptr + DB_CKSUMSTART(mp)), + (int32_t)(whereptr - (uchar_t *)mp->b_rptr), + mp, mp1, cksum_err); + + if (cksum_err) { + BUMP_MIB(&ip_mib, tcpInErrs); + + if (hck_flags & HCK_FULLCKSUM) + IP6_STAT(ip6_tcp_in_full_hw_cksum_err); + else if (hck_flags & HCK_PARTIALCKSUM) + IP6_STAT(ip6_tcp_in_part_hw_cksum_err); + else + IP6_STAT(ip6_tcp_in_sw_cksum_err); + + freemsg(first_mp); + return; } tcp_fanout: ip_fanout_tcp_v6(q, first_mp, ip6h, ill, inill, @@ -7468,18 +7485,16 @@ tcp_fanout: } sctph = (sctp_hdr_t *)(mp->b_rptr + hdr_len); - if (!no_cksum) { - /* checksum */ - pktsum = sctph->sh_chksum; - sctph->sh_chksum = 0; - calcsum = sctp_cksum(mp, hdr_len); - if (calcsum != pktsum) { - BUMP_MIB(&sctp_mib, sctpChecksumError); - freemsg(mp); - return; - } - sctph->sh_chksum = pktsum; + /* checksum */ + pktsum = sctph->sh_chksum; + sctph->sh_chksum = 0; + calcsum = sctp_cksum(mp, hdr_len); + if (calcsum != pktsum) { + BUMP_MIB(&sctp_mib, sctpChecksumError); + freemsg(mp); + return; } + sctph->sh_chksum = pktsum; ports = *(uint32_t *)(mp->b_rptr + hdr_len); if ((connp = sctp_find_conn(&ip6h->ip6_src, &ip6h->ip6_dst, ports, ipif_id, zoneid)) == NULL) { @@ -7501,8 +7516,6 @@ tcp_fanout: hdr_len = pkt_len - remlen; -#define UDPH_SIZE 8 - if (hada_mp != NULL) { ip0dbg(("udp hada drop\n")); goto hada_drop; @@ -7519,16 +7532,10 @@ tcp_fanout: freemsg(first_mp); return; } + hck_flags = 0; ip6h = (ip6_t *)mp->b_rptr; whereptr = (uint8_t *)ip6h + hdr_len; } -#undef UDPH_SIZE - /* - * If packet is being looped back locally checksums - * aren't used - */ - if (no_cksum) - goto udp_fanout; /* * Before going through the regular checksum @@ -7568,15 +7575,37 @@ tcp_fanout: up[8] + up[9] + up[10] + up[11] + up[12] + up[13] + up[14] + up[15]; + /* Fold initial sum */ sum = (sum & 0xffff) + (sum >> 16); - /* Next sum in the UDP packet */ - sum = IP_CSUM(mp, hdr_len, sum); - if (sum) { - /* UDP checksum failed */ - ip1dbg(("ip_rput_data_v6: UDP checksum " - "failed %x\n", - sum)); + + if (reass_hck_flags != 0) { + hck_flags = reass_hck_flags; + + IP_CKSUM_RECV_REASS(hck_flags, + (int32_t)(whereptr - (uchar_t *)mp->b_rptr), + sum, reass_sum, cksum_err); + } else { + mp1 = mp->b_cont; + + IP_CKSUM_RECV(hck_flags, sum, (uchar_t *) + ((uchar_t *)mp->b_rptr + DB_CKSUMSTART(mp)), + (int32_t)(whereptr - (uchar_t *)mp->b_rptr), + mp, mp1, cksum_err); + } + + if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0) + IP6_STAT(ip6_in_sw_cksum); + + if (cksum_err) { BUMP_MIB(ill->ill_ip6_mib, udpInCksumErrs); + + if (hck_flags & HCK_FULLCKSUM) + IP6_STAT(ip6_udp_in_full_hw_cksum_err); + else if (hck_flags & HCK_PARTIALCKSUM) + IP6_STAT(ip6_udp_in_part_hw_cksum_err); + else + IP6_STAT(ip6_udp_in_sw_cksum_err); + freemsg(first_mp); return; } @@ -7592,13 +7621,6 @@ tcp_fanout: goto hada_drop; } - /* - * If packet is being looped back locally checksums - * aren't used - */ 
-			if (no_cksum)
-				goto icmp_fanout;
-
 			up = (uint16_t *)&ip6h->ip6_src;
 			sum = htons(IPPROTO_ICMPV6 + remlen) +
 			    up[0] + up[1] + up[2] + up[3] +
@@ -7607,7 +7629,7 @@ tcp_fanout:
 			    up[12] + up[13] + up[14] + up[15];
 			sum = (sum & 0xffff) + (sum >> 16);
 			sum = IP_CSUM(mp, hdr_len, sum);
-			if (sum) {
+			if (sum != 0) {
 				/* IPv6 ICMP checksum failed */
 				ip1dbg(("ip_rput_data_v6: ICMPv6 checksum "
 				    "failed %x\n",
@@ -7795,6 +7817,7 @@ tcp_fanout:
 					freemsg(mp);
 					return;
 				}
+				hck_flags = 0;
 				ip6h = (ip6_t *)mp->b_rptr;
 				whereptr = (uint8_t *)ip6h + pkt_len - remlen;
 			}
@@ -7820,8 +7843,12 @@ tcp_fanout:
 				}
 			}
+			/* Restore the flags */
+			DB_CKSUMFLAGS(mp) = hck_flags;
+
 			mp = ip_rput_frag_v6(q, mp, ip6h, fraghdr,
-			    remlen - used, &prev_nexthdr_offset);
+			    remlen - used, &prev_nexthdr_offset,
+			    &reass_sum, &reass_hck_flags);
 			if (mp == NULL) {
 				/* Reassembly is still pending */
 				return;
 			}
@@ -8032,7 +8059,7 @@ udp_fanout:
 		return;
 	}
 
-	if (!canputnext(connp->conn_upq)) {
+	if (CONN_UDP_FLOWCTLD(connp)) {
 		freemsg(first_mp);
 		BUMP_MIB(ill->ill_ip6_mib, udpInOverflows);
 		CONN_DEC_REF(connp);
@@ -8062,7 +8089,9 @@ udp_fanout:
 	IP6_STAT(ip6_udp_fast_path);
 	BUMP_MIB(ill->ill_ip6_mib, ipv6InReceives);
 	BUMP_MIB(ill->ill_ip6_mib, ipv6InDelivers);
-	putnext(connp->conn_upq, mp);
+
+	/* Send it upstream */
+	CONN_UDP_RECV(connp, mp);
 
 	CONN_DEC_REF(connp);
 	freemsg(hada_mp);
@@ -8086,7 +8115,8 @@ hada_drop:
 */
 static mblk_t *
 ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
-    ip6_frag_t *fraghdr, uint_t remlen, uint_t *prev_nexthdr_offset)
+    ip6_frag_t *fraghdr, uint_t remlen, uint_t *prev_nexthdr_offset,
+    uint32_t *cksum_val, uint16_t *cksum_flags)
 {
 	ill_t *ill = (ill_t *)q->q_ptr;
 	uint32_t ident = ntohl(fraghdr->ip6f_ident);
@@ -8107,6 +8137,62 @@ ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
 	mblk_t *tail_mp;
 	mblk_t *t_mp;
 	boolean_t pruned = B_FALSE;
+	uint32_t sum_val;
+	uint16_t sum_flags;
+
+
+	if (cksum_val != NULL)
+		*cksum_val = 0;
+	if (cksum_flags != NULL)
+		*cksum_flags = 0;
+
+	/*
+	 * We utilize hardware computed checksum info only for UDP since
+	 * IP fragmentation is a normal occurrence for the protocol.  In
+	 * addition, checksum offload support for IP fragments carrying
+	 * UDP payload is commonly implemented across network adapters.
+	 */
+	ASSERT(ill != NULL);
+	if (nexthdr == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(ill) &&
+	    (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) {
+		mblk_t *mp1 = mp->b_cont;
+		int32_t len;
+
+		/* Record checksum information from the packet */
+		sum_val = (uint32_t)DB_CKSUM16(mp);
+		sum_flags = DB_CKSUMFLAGS(mp);
+
+		/* fragmented payload offset from beginning of mblk */
+		offset = (uint16_t)((uchar_t *)&fraghdr[1] - mp->b_rptr);
+
+		if ((sum_flags & HCK_PARTIALCKSUM) &&
+		    (mp1 == NULL || mp1->b_cont == NULL) &&
+		    offset >= (uint16_t)DB_CKSUMSTART(mp) &&
+		    ((len = offset - (uint16_t)DB_CKSUMSTART(mp)) & 1) == 0) {
+			uint32_t adj;
+			/*
+			 * Partial checksum has been calculated by hardware
+			 * and attached to the packet; in addition, any
+			 * prepended extraneous data is even byte aligned.
+			 * If any such data exists, we adjust the checksum;
+			 * this would also handle any postpended data.
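
IP_ADJCKSUM_PARTIAL (defined outside this diff) produces the folded sum of those extraneous bytes; the one's-complement subtraction then applied just below reduces to the following stand-alone sketch (the helper name is made up):

#include <stdint.h>

/*
 * Back out 'adj', the folded 16-bit sum of the extraneous bytes the
 * hardware included ahead of the payload, from the partial checksum
 * 'sum' using one's-complement subtraction; this mirrors the
 * two-branch adjustment in the code that follows.
 */
static uint16_t
cksum_adjust(uint32_t sum, uint32_t adj)
{
	if (adj >= sum)
		return ((uint16_t)(~(adj - sum) & 0xFFFF));
	return ((uint16_t)(sum - adj));
}
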
+ */ + IP_ADJCKSUM_PARTIAL(mp->b_rptr + DB_CKSUMSTART(mp), + mp, mp1, len, adj); + + /* One's complement subtract extraneous checksum */ + if (adj >= sum_val) + sum_val = ~(adj - sum_val) & 0xFFFF; + else + sum_val -= adj; + } + } else { + sum_val = 0; + sum_flags = 0; + } + + /* Clear hardware checksumming flag */ + DB_CKSUMFLAGS(mp) = 0; /* * Note: Fragment offset in header is in 8-octet units. @@ -8159,7 +8245,6 @@ ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, * Drop the fragmented as early as possible, if * we don't have resource(s) to re-assemble. */ - if (ip_reass_queue_bytes == 0) { freemsg(mp); return (NULL); @@ -8183,12 +8268,11 @@ ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, * there is anything on the reassembly queue, the timer will * be running. */ - msg_len = mp->b_datap->db_lim - mp->b_datap->db_base; + msg_len = MBLKSIZE(mp); tail_mp = mp; while (tail_mp->b_cont != NULL) { tail_mp = tail_mp->b_cont; - msg_len += tail_mp->b_datap->db_lim - - tail_mp->b_datap->db_base; + msg_len += MBLKSIZE(tail_mp); } /* * If the reassembly list for this ILL will get too big @@ -8287,7 +8371,7 @@ ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ipf->ipf_timestamp = gethrestime_sec(); /* Record ipf generation and account for frag header */ ipf->ipf_gen = ill->ill_ipf_gen++; - ipf->ipf_count = mp1->b_datap->db_lim - mp1->b_datap->db_base; + ipf->ipf_count = MBLKSIZE(mp1); ipf->ipf_protocol = nexthdr; ipf->ipf_nf_hdr_len = 0; ipf->ipf_prev_nexthdr_offset = 0; @@ -8295,6 +8379,16 @@ ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ipf->ipf_ecn = ecn_info; ipf->ipf_num_dups = 0; ipfb->ipfb_frag_pkts++; + ipf->ipf_checksum = 0; + ipf->ipf_checksum_flags = 0; + + /* Store checksum value in fragment header */ + if (sum_flags != 0) { + sum_val = (sum_val & 0xFFFF) + (sum_val >> 16); + sum_val = (sum_val & 0xFFFF) + (sum_val >> 16); + ipf->ipf_checksum = sum_val; + ipf->ipf_checksum_flags = sum_flags; + } /* * We handle reassembly two ways. In the easy case, @@ -8326,6 +8420,10 @@ ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, * on easy reassembly. */ ipf->ipf_end = 0; + + /* Forget checksum offload from now on */ + ipf->ipf_checksum_flags = 0; + /* * ipf_hole_cnt is set by ip_reassemble. * ipf_count is updated by ip_reassemble. @@ -8349,6 +8447,23 @@ ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, } /* + * If the packet's flag has changed (it could be coming up + * from an interface different than the previous, therefore + * possibly different checksum capability), then forget about + * any stored checksum states. Otherwise add the value to + * the existing one stored in the fragment header. + */ + if (sum_flags != 0 && sum_flags == ipf->ipf_checksum_flags) { + sum_val += ipf->ipf_checksum; + sum_val = (sum_val & 0xFFFF) + (sum_val >> 16); + sum_val = (sum_val & 0xFFFF) + (sum_val >> 16); + ipf->ipf_checksum = sum_val; + } else if (ipf->ipf_checksum_flags != 0) { + /* Forget checksum offload from now on */ + ipf->ipf_checksum_flags = 0; + } + + /* * We have a new piece of a datagram which is already being * reassembled. Update the ECN info if all IP fragments * are ECN capable. 
If there is one which is not, clear
@@ -8443,6 +8558,13 @@ ip_rput_frag_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h,
 	nexthdr = ipf->ipf_protocol;
 	*prev_nexthdr_offset = ipf->ipf_prev_nexthdr_offset;
 	ipfp = ipf->ipf_ptphn;
+
+	/* We need to supply these to caller */
+	if ((sum_flags = ipf->ipf_checksum_flags) != 0)
+		sum_val = ipf->ipf_checksum;
+	else
+		sum_val = 0;
+
 	mp1 = ipf->ipf_mp;
 	count = ipf->ipf_count;
 	ipf = ipf->ipf_hash_next;
@@ -8508,6 +8630,12 @@ reass_done:
 	ip6h->ip6_vcf &= htonl(0xFFCFFFFF);
 	ip6h->ip6_vcf |= htonl(ecn_info << 20);
 
+	/* Reassembly is successful; return checksum information if needed */
+	if (cksum_val != NULL)
+		*cksum_val = sum_val;
+	if (cksum_flags != NULL)
+		*cksum_flags = sum_flags;
+
 	return (mp);
 }
 
@@ -9954,7 +10082,7 @@ notv6:
 	if (q->q_next == NULL) {
 		connp = Q_TO_CONN(q);
-		if (IS_TCP_CONN(connp)) {
+		if (IPCL_IS_TCP(connp)) {
 			/* change conn_send for the tcp_v4_connections */
 			connp->conn_send = ip_output;
 		} else if (connp->conn_ulp == IPPROTO_SCTP) {
@@ -10426,12 +10554,52 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src,
 		uint32_t sum;
 		uint_t	ill_index = ((ill_t *)ire->ire_stq->q_ptr)->
 		    ill_phyint->phyint_ifindex;
+		queue_t	*dev_q = ire->ire_stq->q_next;
 
 		/*
 		 * non-NULL send-to queue - packet is to be sent
 		 * out an interface.
 		 */
 
+		/* Driver is flow-controlling? */
+		if (!IP_FLOW_CONTROLLED_ULP(nexthdr) &&
+		    ((dev_q->q_next || dev_q->q_first) && !canput(dev_q))) {
+			/*
+			 * Queue packet if we have a conn to give back
+			 * pressure.  We can't queue packets intended for
+			 * hardware acceleration since we've tossed that
+			 * state already.  If the packet is being fed back
+			 * from ire_send_v6, we don't know the position in
+			 * the queue to enqueue the packet and we discard
+			 * the packet.
+			 */
+			ASSERT(mp == first_mp);
+			if (ip_output_queue && connp != NULL &&
+			    !mctl_present && caller != IRE_SEND) {
+				if (caller == IP_WSRV) {
+					connp->conn_did_putbq = 1;
+					(void) putbq(connp->conn_wq, mp);
+					conn_drain_insert(connp);
+					/*
+					 * caller == IP_WSRV implies we are
+					 * the service thread, and the
+					 * queue is already noenabled.
+					 * The check for canput and
+					 * the putbq is not atomic.
+					 * So we need to check again.
+					 */
+					if (canput(dev_q))
+						connp->conn_did_putbq = 0;
+				} else {
+					(void) putq(connp->conn_wq, mp);
+				}
+				return;
+			}
+			BUMP_MIB(mibptr, ipv6OutDiscards);
+			freemsg(mp);
+			return;
+		}
+
 		/*
 		 * Look for reachability confirmations from the transport.
 		 */
@@ -10490,20 +10658,20 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src,
 			    up[12] + up[13] + up[14] + up[15];
 			sum = (sum & 0xffff) + (sum >> 16);
 			*insp = IP_CSUM(mp, hdr_length, sum);
+			if (*insp == 0)
+				*insp = 0xFFFF;
 		} else if (nexthdr == IPPROTO_TCP) {
 			uint16_t *up;
 
 			/*
 			 * Check for full IPv6 header + enough TCP header
 			 * to get at the checksum field.
-			 * XXX need hardware checksum support.
 			 */
-#define	TCP_CSUM_OFFSET	16
-#define	TCP_CSUM_SIZE	2
 			if ((mp->b_wptr - mp->b_rptr) <
-			    (hdr_length + TCP_CSUM_OFFSET + TCP_CSUM_SIZE)) {
+			    (hdr_length + TCP_CHECKSUM_OFFSET +
+			    TCP_CHECKSUM_SIZE)) {
 				if (!pullupmsg(mp, hdr_length +
-				    TCP_CSUM_OFFSET + TCP_CSUM_SIZE)) {
+				    TCP_CHECKSUM_OFFSET + TCP_CHECKSUM_SIZE)) {
 					ip1dbg(("ip_wput_v6: TCP hdr pullupmsg"
 					    " failed\n"));
 					BUMP_MIB(mibptr, ipv6OutDiscards);
@@ -10519,30 +10687,28 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src,
 			 * into the tcp checksum field, so we don't
 			 * need to explicitly sum it in here.
*/ - if (hdr_length == IPV6_HDR_LEN) { - /* src, dst, tcp consequtive */ - up = (uint16_t *)(((uchar_t *)ip6h) + - IPV6_HDR_LEN + TCP_CSUM_OFFSET); - *up = IP_CSUM(mp, - IPV6_HDR_LEN - 2 * sizeof (in6_addr_t), - htons(IPPROTO_TCP)); - } else { - sum = htons(IPPROTO_TCP) + - up[0] + up[1] + up[2] + up[3] + - up[4] + up[5] + up[6] + up[7] + - up[8] + up[9] + up[10] + up[11] + - up[12] + up[13] + up[14] + up[15]; - /* - * Fold the initial sum. - */ - sum = (sum & 0xffff) + (sum >> 16); - up = (uint16_t *)(((uchar_t *)ip6h) + - hdr_length + TCP_CSUM_OFFSET); - *up = IP_CSUM(mp, hdr_length, sum); - } -#undef TCP_CSUM_OFFSET -#undef TCP_CSUM_SIZE + sum = up[0] + up[1] + up[2] + up[3] + + up[4] + up[5] + up[6] + up[7] + + up[8] + up[9] + up[10] + up[11] + + up[12] + up[13] + up[14] + up[15]; + + /* Fold the initial sum */ + sum = (sum & 0xffff) + (sum >> 16); + + up = (uint16_t *)(((uchar_t *)ip6h) + + hdr_length + TCP_CHECKSUM_OFFSET); + IP_CKSUM_XMIT(ill, ire, mp, ip6h, up, IPPROTO_TCP, + hdr_length, ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN, + ire->ire_max_frag, mctl_present, sum); + + /* Software checksum? */ + if (DB_CKSUMFLAGS(mp) == 0) { + IP6_STAT(ip6_out_sw_cksum); + IP6_STAT_UPDATE(ip6_tcp_out_sw_cksum_bytes, + (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN) - + hdr_length); + } } else if (nexthdr == IPPROTO_UDP) { uint16_t *up; @@ -10550,12 +10716,10 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src, * check for full IPv6 header + enough UDP header * to get at the UDP checksum field */ -#define UDP_CSUM_OFFSET 6 -#define UDP_CSUM_SIZE 2 if ((mp->b_wptr - mp->b_rptr) < (hdr_length + - UDP_CSUM_OFFSET + UDP_CSUM_SIZE)) { + UDP_CHECKSUM_OFFSET + UDP_CHECKSUM_SIZE)) { if (!pullupmsg(mp, hdr_length + - UDP_CSUM_OFFSET + UDP_CSUM_SIZE)) { + UDP_CHECKSUM_OFFSET + UDP_CHECKSUM_SIZE)) { ip1dbg(("ip_wput_v6: UDP hdr pullupmsg" " failed\n")); BUMP_MIB(mibptr, ipv6OutDiscards); @@ -10570,34 +10734,28 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src, * into the udp checksum field, so we don't * need to explicitly sum it in here. */ - if (hdr_length == IPV6_HDR_LEN) { - /* src, dst, udp consequtive */ - up = (uint16_t *)(((uchar_t *)ip6h) + - IPV6_HDR_LEN + UDP_CSUM_OFFSET); - *up = IP_CSUM(mp, - IPV6_HDR_LEN - 2 * sizeof (in6_addr_t), - htons(IPPROTO_UDP)); - } else { - sum = htons(IPPROTO_UDP) + - up[0] + up[1] + up[2] + up[3] + - up[4] + up[5] + up[6] + up[7] + - up[8] + up[9] + up[10] + up[11] + - up[12] + up[13] + up[14] + up[15]; - sum = (sum & 0xffff) + (sum >> 16); - up = (uint16_t *)(((uchar_t *)ip6h) + - hdr_length + UDP_CSUM_OFFSET); - *up = IP_CSUM(mp, hdr_length, sum); - } + sum = up[0] + up[1] + up[2] + up[3] + + up[4] + up[5] + up[6] + up[7] + + up[8] + up[9] + up[10] + up[11] + + up[12] + up[13] + up[14] + up[15]; - /* - * According to RFC 2460, UDP in IPv6 shouldn't - * appear with all zero checksum on the wire and - * should be changed to 0xffff. - */ - if (*up == 0) - *up = 0xffff; -#undef UDP_CSUM_OFFSET -#undef UDP_CSUM_SIZE + /* Fold the initial sum */ + sum = (sum & 0xffff) + (sum >> 16); + + up = (uint16_t *)(((uchar_t *)ip6h) + + hdr_length + UDP_CHECKSUM_OFFSET); + + IP_CKSUM_XMIT(ill, ire, mp, ip6h, up, IPPROTO_UDP, + hdr_length, ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN, + ire->ire_max_frag, mctl_present, sum); + + /* Software checksum? 
*/ + if (DB_CKSUMFLAGS(mp) == 0) { + IP6_STAT(ip6_out_sw_cksum); + IP6_STAT_UPDATE(ip6_udp_out_sw_cksum_bytes, + (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN) - + hdr_length); + } } else if (nexthdr == IPPROTO_ICMPV6) { uint16_t *up; icmp6_t *icmp6; @@ -10627,6 +10785,9 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src, up[12] + up[13] + up[14] + up[15]; sum = (sum & 0xffff) + (sum >> 16); icmp6->icmp6_cksum = IP_CSUM(mp, hdr_length, sum); + if (icmp6->icmp6_cksum == 0) + icmp6->icmp6_cksum = 0xFFFF; + /* Update output mib stats */ icmp_update_out_mib_v6(ill, icmp6); } else if (nexthdr == IPPROTO_SCTP) { @@ -10764,6 +10925,223 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src, } /* + * Outbound IPv6 fragmentation routine using MDT. + */ +static void +ip_wput_frag_mdt_v6(mblk_t *mp, ire_t *ire, size_t max_chunk, + size_t unfragmentable_len, uint8_t nexthdr, uint_t prev_nexthdr_offset) +{ + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + uint_t pkts, wroff, hdr_chunk_len, pbuf_idx; + mblk_t *hdr_mp, *md_mp = NULL; + int i1; + multidata_t *mmd; + unsigned char *hdr_ptr, *pld_ptr; + ip_pdescinfo_t pdi; + uint32_t ident; + size_t len; + uint16_t offset; + queue_t *stq = ire->ire_stq; + ill_t *ill = (ill_t *)stq->q_ptr; + + ASSERT(DB_TYPE(mp) == M_DATA); + ASSERT(MBLKL(mp) > unfragmentable_len); + + /* + * Move read ptr past unfragmentable portion, we don't want this part + * of the data in our fragments. + */ + mp->b_rptr += unfragmentable_len; + + /* Calculate how many packets we will send out */ + i1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgsize(mp); + pkts = (i1 + max_chunk - 1) / max_chunk; + ASSERT(pkts > 1); + + /* Allocate a message block which will hold all the IP Headers. */ + wroff = ip_wroff_extra; + hdr_chunk_len = wroff + unfragmentable_len + sizeof (ip6_frag_t); + + i1 = pkts * hdr_chunk_len; + /* + * Create the header buffer, Multidata and destination address + * and SAP attribute that should be associated with it. + */ + if ((hdr_mp = allocb(i1, BPRI_HI)) == NULL || + ((hdr_mp->b_wptr += i1), + (mmd = mmd_alloc(hdr_mp, &md_mp, KM_NOSLEEP)) == NULL) || + !ip_md_addr_attr(mmd, NULL, ire->ire_nce->nce_res_mp)) { + freemsg(mp); + if (md_mp == NULL) { + freemsg(hdr_mp); + } else { +free_mmd: IP6_STAT(ip6_frag_mdt_discarded); + freemsg(md_mp); + } + IP6_STAT(ip6_frag_mdt_allocfail); + BUMP_MIB(ill->ill_ip6_mib, ipv6OutFragFails); + UPDATE_MIB(ill->ill_ip6_mib, ipv6OutDiscards, pkts); + return; + } + IP6_STAT(ip6_frag_mdt_allocd); + + /* + * Add a payload buffer to the Multidata; this operation must not + * fail, or otherwise our logic in this routine is broken. There + * is no memory allocation done by the routine, so any returned + * failure simply tells us that we've done something wrong. + * + * A failure tells us that either we're adding the same payload + * buffer more than once, or we're trying to add more buffers than + * allowed. None of the above cases should happen, and we panic + * because either there's horrible heap corruption, and/or + * programming mistake. + */ + if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) { + goto pbuf_panic; + } + + hdr_ptr = hdr_mp->b_rptr; + pld_ptr = mp->b_rptr; + + pdi.flags = PDESC_HBUF_REF | PDESC_PBUF_REF; + + ident = htonl(atomic_add_32_nv(&ire->ire_ident, 1)); + + /* + * len is the total length of the fragmentable data in this + * datagram. For each fragment sent, we will decrement len + * by the amount of fragmentable data sent in that fragment + * until len reaches zero. 
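
The bookkeeping described here can be previewed in isolation. A toy sketch with made-up sizes (max_chunk is a multiple of 8, as the caller arranges; this is an illustration, not the kernel loop):

#include <stdint.h>
#include <stdio.h>

/*
 * Fragment accounting as in the loop that follows: consume 'len'
 * fragmentable bytes in chunks of at most 'max_chunk', setting
 * More Fragments while data remains.
 */
int
main(void)
{
	uint32_t len = 3000, max_chunk = 1448, offset = 0;

	while (len != 0) {
		uint32_t mlen = (len < max_chunk) ? len : max_chunk;

		len -= mlen;
		printf("frag at offset %u: %u bytes, MF=%d\n",
		    offset, mlen, len != 0);
		offset += mlen;
	}
	return (0);
}
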
+ */ + len = ntohs(ip6h->ip6_plen) - (unfragmentable_len - IPV6_HDR_LEN); + + offset = 0; + prev_nexthdr_offset += wroff; + + while (len != 0) { + size_t mlen; + ip6_t *fip6h; + ip6_frag_t *fraghdr; + int error; + + ASSERT((hdr_ptr + hdr_chunk_len) <= hdr_mp->b_wptr); + mlen = MIN(len, max_chunk); + len -= mlen; + + fip6h = (ip6_t *)(hdr_ptr + wroff); + ASSERT(OK_32PTR(fip6h)); + bcopy(ip6h, fip6h, unfragmentable_len); + hdr_ptr[prev_nexthdr_offset] = IPPROTO_FRAGMENT; + + fip6h->ip6_plen = htons((uint16_t)(mlen + + unfragmentable_len - IPV6_HDR_LEN + sizeof (ip6_frag_t))); + + fraghdr = (ip6_frag_t *)((unsigned char *)fip6h + + unfragmentable_len); + fraghdr->ip6f_nxt = nexthdr; + fraghdr->ip6f_reserved = 0; + fraghdr->ip6f_offlg = htons(offset) | + ((len != 0) ? IP6F_MORE_FRAG : 0); + fraghdr->ip6f_ident = ident; + + /* + * Record offset and size of header and data of the next packet + * in the multidata message. + */ + PDESC_HDR_ADD(&pdi, hdr_ptr, wroff, + unfragmentable_len + sizeof (ip6_frag_t), 0); + PDESC_PLD_INIT(&pdi); + i1 = MIN(mp->b_wptr - pld_ptr, mlen); + ASSERT(i1 > 0); + PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, pld_ptr, i1); + if (i1 == mlen) { + pld_ptr += mlen; + } else { + i1 = mlen - i1; + mp = mp->b_cont; + ASSERT(mp != NULL); + ASSERT(MBLKL(mp) >= i1); + /* + * Attach the next payload message block to the + * multidata message. + */ + if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) + goto pbuf_panic; + PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, mp->b_rptr, i1); + pld_ptr = mp->b_rptr + i1; + } + + if ((mmd_addpdesc(mmd, (pdescinfo_t *)&pdi, &error, + KM_NOSLEEP)) == NULL) { + /* + * Any failure other than ENOMEM indicates that we + * have passed in invalid pdesc info or parameters + * to mmd_addpdesc, which must not happen. + * + * EINVAL is a result of failure on boundary checks + * against the pdesc info contents. It should not + * happen, and we panic because either there's + * horrible heap corruption, and/or programming + * mistake. + */ + if (error != ENOMEM) { + cmn_err(CE_PANIC, "ip_wput_frag_mdt_v6: " + "pdesc logic error detected for " + "mmd %p pinfo %p (%d)\n", + (void *)mmd, (void *)&pdi, error); + /* NOTREACHED */ + } + IP6_STAT(ip6_frag_mdt_addpdescfail); + /* Free unattached payload message blocks as well */ + md_mp->b_cont = mp->b_cont; + goto free_mmd; + } + + /* Advance fragment offset. */ + offset += mlen; + + /* Advance to location for next header in the buffer. */ + hdr_ptr += hdr_chunk_len; + + /* Did we reach the next payload message block? */ + if (pld_ptr == mp->b_wptr && mp->b_cont != NULL) { + mp = mp->b_cont; + /* + * Attach the next message block with payload + * data to the multidata message. + */ + if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) + goto pbuf_panic; + pld_ptr = mp->b_rptr; + } + } + + ASSERT(hdr_mp->b_wptr == hdr_ptr); + ASSERT(mp->b_wptr == pld_ptr); + + /* Update IP statistics */ + UPDATE_MIB(ill->ill_ip6_mib, ipv6OutFragCreates, pkts); + BUMP_MIB(ill->ill_ip6_mib, ipv6OutFragOKs); + IP6_STAT_UPDATE(ip6_frag_mdt_pkt_out, pkts); + + ire->ire_ob_pkt_count += pkts; + if (ire->ire_ipif != NULL) + atomic_add_32(&ire->ire_ipif->ipif_ob_pkt_count, pkts); + + ire->ire_last_used_time = lbolt; + /* Send it down */ + putnext(stq, md_mp); + return; + +pbuf_panic: + cmn_err(CE_PANIC, "ip_wput_frag_mdt_v6: payload buffer logic " + "error for mmd %p pbuf %p (%d)", (void *)mmd, (void *)mp, + pbuf_idx); + /* NOTREACHED */ +} + +/* * IPv6 fragmentation. Essentially the same as IPv4 fragmentation. 
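
Before the non-MDT v6 routine below sizes its fragments via (min(max_frag, ire->ire_max_frag) - unfragmentable_len - sizeof (ip6_frag_t)) & ~7, the arithmetic can be checked stand-alone; a sketch with illustrative numbers only:

#include <stdio.h>

/*
 * Fragmentable payload per IPv6 fragment: the MTU less the
 * unfragmentable part (IPv6 header plus any extension headers
 * repeated in every fragment) less the 8-byte fragment header,
 * rounded down to a multiple of 8.
 */
int
main(void)
{
	unsigned int max_frag = 1500, unfragmentable_len = 40;
	unsigned int max_chunk = (max_frag - unfragmentable_len - 8) & ~7U;

	printf("max_chunk = %u\n", max_chunk);	/* prints 1448 */
	return (0);
}
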
* We have not optimized this in terms of number of mblks * allocated. For instance, for each fragment sent we always allocate a @@ -10779,7 +11157,7 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src, */ void ip_wput_frag_v6(mblk_t *mp, ire_t *ire, uint_t reachable, conn_t *connp, - boolean_t caller, int max_frag) + int caller, int max_frag) { ip6_t *ip6h = (ip6_t *)mp->b_rptr; ip6_t *fip6h; @@ -10849,6 +11227,19 @@ ip_wput_frag_v6(mblk_t *mp, ire_t *ire, uint_t reachable, conn_t *connp, } unfragmentable_len = (uint_t)(ptr - (uint8_t *)ip6h); + max_chunk = (min(max_frag, ire->ire_max_frag) - unfragmentable_len - + sizeof (ip6_frag_t)) & ~7; + + /* Check if we can use MDT to send out the frags. */ + ASSERT(!IRE_IS_LOCAL(ire)); + if (ip_multidata_outbound && reachable == 0 && + !(ire->ire_flags & RTF_MULTIRT) && ILL_MDT_CAPABLE(ill) && + IP_CAN_FRAG_MDT(mp, unfragmentable_len, max_chunk)) { + ip_wput_frag_mdt_v6(mp, ire, max_chunk, unfragmentable_len, + nexthdr, prev_nexthdr_offset); + return; + } + /* * Allocate an mblk with enough room for the link-layer * header, the unfragmentable part of the datagram, and the @@ -10875,7 +11266,7 @@ ip_wput_frag_v6(mblk_t *mp, ire_t *ire, uint_t reachable, conn_t *connp, fraghdr->ip6f_nxt = nexthdr; fraghdr->ip6f_reserved = 0; - fraghdr->ip6f_offlg = htons(0); + fraghdr->ip6f_offlg = 0; fraghdr->ip6f_ident = htonl(ident); /* @@ -10886,9 +11277,6 @@ ip_wput_frag_v6(mblk_t *mp, ire_t *ire, uint_t reachable, conn_t *connp, */ len = ntohs(ip6h->ip6_plen) - (unfragmentable_len - IPV6_HDR_LEN); - max_chunk = (min(max_frag, ire->ire_max_frag) - unfragmentable_len - - sizeof (ip6_frag_t)) & ~7; - /* * Move read ptr past unfragmentable portion, we don't want this part * of the data in our fragments. @@ -11117,7 +11505,9 @@ ip_xmit_v6(mblk_t *mp, ire_t *ire, uint_t flags, conn_t *connp, } } - if (IP_FLOW_CONTROLLED_ULP(ip6h->ip6_nxt) || canput(stq->q_next)) { + /* Flow-control check has been done in ip_wput_ire_v6 */ + if (IP_FLOW_CONTROLLED_ULP(ip6h->ip6_nxt) || caller == IP_WPUT || + caller == IP_WSRV || canput(stq->q_next)) { uint32_t ill_index; /* @@ -11164,7 +11554,7 @@ ip_xmit_v6(mblk_t *mp, ire_t *ire, uint_t flags, conn_t *connp, ill = ire_to_ill(ire); } IRB_REFRELE(irb); - } else if (connp != NULL && IS_TCP_CONN(connp) && + } else if (connp != NULL && IPCL_IS_TCP(connp) && connp->conn_mdt_ok && !connp->conn_tcp->tcp_mdt && ILL_MDT_USABLE(ill)) { /* @@ -11583,7 +11973,7 @@ ip_xmit_v6(mblk_t *mp, ire_t *ire, uint_t flags, conn_t *connp, (void) putbq(connp->conn_wq, mp); conn_drain_insert(connp); /* - * called_from_wsrv implies we are + * caller == IP_WSRV implies we are * the service thread, and the * queue is already noenabled. * The check for canput and diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c index 937e0d8b0d..fc793de53b 100644 --- a/usr/src/uts/common/inet/ip/ip_if.c +++ b/usr/src/uts/common/inet/ip/ip_if.c @@ -80,6 +80,7 @@ #include <inet/ip_rts.h> #include <inet/ip_ndp.h> #include <inet/ip_if.h> +#include <inet/ip_impl.h> #include <inet/tun.h> #include <inet/sctp_ip.h> @@ -1232,10 +1233,10 @@ ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp) } else { /* * IP-MT XXX In the case of TLI/XTI bind / optmgmt this can't - * be just ip_ioctl_freemsg. we have to restart it + * be just inet_freemsg. we have to restart it * otherwise the thread will be stuck. 
*/ - ip_ioctl_freemsg(mp); + inet_freemsg(mp); } return (B_TRUE); } @@ -1344,10 +1345,10 @@ ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp) } else { /* * IP-MT XXX In the case of TLI/XTI bind / optmgmt - * this can't be just ip_ioctl_freemsg. we have to + * this can't be just inet_freemsg. we have to * restart it otherwise the thread will be stuck. */ - ip_ioctl_freemsg(curr); + inet_freemsg(curr); } } } @@ -1384,7 +1385,7 @@ conn_ioctl_cleanup(conn_t *connp) if (curr != NULL) { mutex_exit(&connp->conn_lock); CONN_DEC_REF(connp); - ip_ioctl_freemsg(curr); + inet_freemsg(curr); return; } /* @@ -2042,7 +2043,7 @@ ill_capability_mdt_reset(ill_t *ill, mblk_t **sc_mp) dl_capability_sub_t *dl_subcap; int size; - if (!(ill->ill_capabilities & ILL_CAPAB_MDT)) + if (!ILL_MDT_CAPABLE(ill)) return; ASSERT(ill->ill_mdt_capab != NULL); @@ -2857,6 +2858,9 @@ ill_capability_poll_capable(ill_t *ill, dl_capab_poll_t *ipoll, bcopy((void *)&poll, (void *)opoll, sizeof (dl_capab_poll_t)); ASSERT(nmp->b_wptr == (nmp->b_rptr + size)); + ip1dbg(("ill_capability_poll_capable: asking interface %s " + "to enable polling\n", ill->ill_name)); + /* nmp points to a DL_CAPABILITY_REQ message to enable polling */ ill_dlpi_send(ill, nmp); } @@ -2944,6 +2948,8 @@ ill_capability_poll_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) ASSERT(ill->ill_poll_capab != NULL); ill->ill_capabilities |= ILL_CAPAB_POLL; } + ip1dbg(("ill_capability_poll_ack: interface %s " + "has enabled polling\n", ill->ill_name)); break; } } @@ -3048,8 +3054,9 @@ ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) return; } -#define CURR_HCKSUM_CAPAB \ - (HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 | HCKSUM_IPHDRCKSUM) +#define CURR_HCKSUM_CAPAB \ + (HCKSUM_INET_PARTIAL | HCKSUM_INET_FULL_V4 | \ + HCKSUM_INET_FULL_V6 | HCKSUM_IPHDRCKSUM) if ((ihck->hcksum_txflags & HCKSUM_ENABLE) && (ihck->hcksum_txflags & CURR_HCKSUM_CAPAB)) { @@ -3126,10 +3133,11 @@ ill_capability_hcksum_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) * hardware checksum acceleration. */ ill_dlpi_send(ill, nmp); - } else + } else { ip1dbg(("ill_capability_hcksum_ack: interface %s has " "advertised %x hardware checksum capability flags\n", ill->ill_name, ihck->hcksum_txflags)); + } } static void @@ -3140,7 +3148,7 @@ ill_capability_hcksum_reset(ill_t *ill, mblk_t **sc_mp) dl_capability_sub_t *dl_subcap; int size; - if (!(ill->ill_capabilities & ILL_CAPAB_HCKSUM)) + if (!ILL_HCKSUM_CAPABLE(ill)) return; ASSERT(ill->ill_hcksum_capab != NULL); @@ -7300,7 +7308,7 @@ ipsq_flush(ill_t *ill) ASSERT(mp_next == NULL); ipsq->ipsq_mptail = prev; } - ip_ioctl_freemsg(mp); + inet_freemsg(mp); } else { prev = mp; } @@ -8838,7 +8846,7 @@ ip_sioctl_arp_common(ill_t *ill, queue_t *q, mblk_t *mp, sin_t *sin, if (mp1 != NULL) freeb(mp1); if (pending_mp != NULL) - ip_ioctl_freemsg(pending_mp); + inet_freemsg(pending_mp); return (ENOMEM); } @@ -8848,7 +8856,7 @@ ip_sioctl_arp_common(ill_t *ill, queue_t *q, mblk_t *mp, sin_t *sin, (caddr_t)&ipaddr); if (mp2 == NULL) { freeb(mp1); - ip_ioctl_freemsg(pending_mp); + inet_freemsg(pending_mp); return (ENOMEM); } /* Put together the chain. 
*/ @@ -9743,7 +9751,7 @@ ip_sioctl_iocack(queue_t *q, mblk_t *mp) pending_mp = ill_pending_mp_get(ill, &connp, ioc_id); if (pending_mp == NULL) { ASSERT(connp == NULL); - ip_ioctl_freemsg(mp); + inet_freemsg(mp); return; } ASSERT(connp != NULL); @@ -9760,7 +9768,7 @@ ip_sioctl_iocack(queue_t *q, mblk_t *mp) */ orig_ioc_mp->b_cont->b_next = pending_mp->b_cont->b_next; orig_ioc_mp->b_cont->b_prev = pending_mp->b_cont->b_prev; - ip_ioctl_freemsg(pending_mp); + inet_freemsg(pending_mp); /* * We're done if there was an error or if this is not an SIOCG{X}ARP @@ -18114,6 +18122,8 @@ ipif_mask_reply(ipif_t *ipif) icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY; bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN); icmph->icmph_checksum = IP_CSUM(mp, sizeof (ipha_t), 0); + if (icmph->icmph_checksum == 0) + icmph->icmph_checksum = 0xffff; put(ipif->ipif_wq, mp); diff --git a/usr/src/uts/common/inet/ip/ip_multi.c b/usr/src/uts/common/inet/ip/ip_multi.c index c19e886d0d..0c42de575d 100644 --- a/usr/src/uts/common/inet/ip/ip_multi.c +++ b/usr/src/uts/common/inet/ip/ip_multi.c @@ -65,6 +65,7 @@ #include <inet/ipsec_impl.h> #include <inet/sctp_ip.h> #include <inet/ip_listutils.h> +#include <inet/udp_impl.h> #include <netinet/igmp.h> @@ -1186,14 +1187,39 @@ void ip_multicast_loopback(queue_t *q, ill_t *ill, mblk_t *mp_orig, int fanout_flags, zoneid_t zoneid) { - mblk_t *mp; - mblk_t *ipsec_mp; + mblk_t *mp; + mblk_t *ipsec_mp; + + if (DB_TYPE(mp_orig) == M_DATA && + ((ipha_t *)mp_orig->b_rptr)->ipha_protocol == IPPROTO_UDP) { + uint_t hdrsz; + + hdrsz = IPH_HDR_LENGTH((ipha_t *)mp_orig->b_rptr) + + sizeof (udpha_t); + ASSERT(MBLKL(mp_orig) >= hdrsz); + + if (((mp = allocb(hdrsz, BPRI_MED)) != NULL) && + (mp_orig = dupmsg(mp_orig)) != NULL) { + bcopy(mp_orig->b_rptr, mp->b_rptr, hdrsz); + mp->b_wptr += hdrsz; + mp->b_cont = mp_orig; + mp_orig->b_rptr += hdrsz; + if (MBLKL(mp_orig) == 0) { + mp->b_cont = mp_orig->b_cont; + mp_orig->b_cont = NULL; + freeb(mp_orig); + } + } else if (mp != NULL) { + freeb(mp); + mp = NULL; + } + } else { + mp = ip_copymsg(mp_orig); + } - /* TODO this could use dup'ed messages except for the IP header. */ - mp = ip_copymsg(mp_orig); if (mp == NULL) return; - if (mp->b_datap->db_type == M_CTL) { + if (DB_TYPE(mp) == M_CTL) { ipsec_mp = mp; mp = mp->b_cont; } else { @@ -2553,7 +2579,7 @@ ip_extract_msfilter(queue_t *q, mblk_t *mp, ipif_t **ipifpp, ipsq_func_t func) zoneid = connp->conn_zoneid; /* don't allow multicast operations on a tcp conn */ - if (IS_TCP_CONN(connp)) + if (IPCL_IS_TCP(connp)) return (ENOPROTOOPT); if (cmd == SIOCSIPMSFILTER || cmd == SIOCGIPMSFILTER) { diff --git a/usr/src/uts/common/inet/ip/ip_ndp.c b/usr/src/uts/common/inet/ip/ip_ndp.c index ee9386e4af..948ccd4bc1 100644 --- a/usr/src/uts/common/inet/ip/ip_ndp.c +++ b/usr/src/uts/common/inet/ip/ip_ndp.c @@ -20,7 +20,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -144,7 +144,6 @@ static nce_t nce_nil; mblk_t *mp; mblk_t *template; nce_t **ncep; - int err = 0; boolean_t dropped = B_FALSE; ASSERT(MUTEX_HELD(&ndp_g_lock)); @@ -280,8 +279,15 @@ static nce_t nce_nil; mutex_exit(&nce->nce_lock); mutex_enter(&ndp_g_lock); } -done: - return (err); + /* + * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then + * we call nce_fastpath as soon as the nce is resolved in ndp_process. 
+ * We call nce_fastpath from nce_update if the link layer address of + * the peer changes. + */ + if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) + nce_fastpath(nce); + return (0); } int @@ -1028,7 +1034,6 @@ ndp_noresolver(ill_t *ill, const in6_addr_t *dst) * Cache entry with a proper resolver cookie was * created. */ - nce_fastpath(nce); NCE_REFRELE(nce); break; case EEXIST: @@ -1108,7 +1113,6 @@ nce_set_multicast(ill_t *ill, const in6_addr_t *dst) ip1dbg(("nce_set_multicast: create failed" "%d\n", err)); return (err); } - nce_fastpath(nce); NCE_REFRELE(nce); return (0); } @@ -2168,8 +2172,7 @@ nce_set_ll(nce_t *nce, uchar_t *ll_addr) ASSERT(ll_addr != NULL); /* Always called before fast_path_probe */ - if (nce->nce_fp_mp != NULL) - return; + ASSERT(nce->nce_fp_mp == NULL); if (ill->ill_sap_length != 0) { /* * Copy the SAP type specified in the @@ -2265,8 +2268,8 @@ nce_update(nce_t *nce, uint16_t new_state, uchar_t *new_ll_addr) if (nce->nce_fp_mp != NULL) { freemsg(nce->nce_fp_mp); nce->nce_fp_mp = NULL; - need_fastpath_update = B_TRUE; } + need_fastpath_update = B_TRUE; } mutex_exit(&nce->nce_lock); if (need_stop_timer) { diff --git a/usr/src/uts/common/inet/ip/ipclassifier.c b/usr/src/uts/common/inet/ip/ipclassifier.c index 2085f212ba..12907ba3b4 100644 --- a/usr/src/uts/common/inet/ip/ipclassifier.c +++ b/usr/src/uts/common/inet/ip/ipclassifier.c @@ -233,6 +233,7 @@ const char ipclassifier_version[] = "@(#)ipclassifier.c 1.6 04/03/31 SMI"; #include <inet/ip_rts.h> #include <inet/optcom.h> #include <inet/ip_ndp.h> +#include <inet/udp_impl.h> #include <inet/sctp_ip.h> #include <sys/ethernet.h> @@ -351,8 +352,7 @@ ipcl_init(void) ipcl_conn_cache = kmem_cache_create("ipcl_conn_cache", sizeof (conn_t), CACHE_ALIGN_SIZE, - NULL, NULL, - NULL, NULL, NULL, 0); + NULL, NULL, NULL, NULL, NULL, 0); ipcl_tcpconn_cache = kmem_cache_create("ipcl_tcpconn_cache", sizeof (itc_t), CACHE_ALIGN_SIZE, @@ -501,17 +501,19 @@ ipcl_conn_create(uint32_t type, int sleep) case IPCL_IPCCONN: connp = kmem_cache_alloc(ipcl_conn_cache, sleep); if (connp == NULL) - return (connp); + return (NULL); bzero(connp, sizeof (conn_t)); - mutex_init(&connp->conn_lock, NULL, - MUTEX_DEFAULT, NULL); + mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); - connp->conn_flags |= IPCL_IPCCONN; + connp->conn_flags = IPCL_IPCCONN; connp->conn_ref = 1; IPCL_DEBUG_LVL(1, ("ipcl_conn_create: connp = %p\n", (void *)connp)); ipcl_globalhash_insert(connp); break; + default: + connp = NULL; + ASSERT(0); } return (connp); @@ -521,7 +523,6 @@ void ipcl_conn_destroy(conn_t *connp) { mblk_t *mp; - tcp_t *tcp = connp->conn_tcp; ASSERT(!MUTEX_HELD(&connp->conn_lock)); ASSERT(connp->conn_ref == 0); @@ -531,6 +532,8 @@ ipcl_conn_destroy(conn_t *connp) cv_destroy(&connp->conn_cv); if (connp->conn_flags & IPCL_TCPCONN) { + tcp_t *tcp = connp->conn_tcp; + mutex_destroy(&connp->conn_lock); ASSERT(connp->conn_tcp != NULL); tcp_free(tcp); @@ -567,6 +570,7 @@ ipcl_conn_destroy(conn_t *connp) } else if (connp->conn_flags & IPCL_SCTPCONN) { sctp_free(connp); } else { + ASSERT(connp->conn_udp == NULL); mutex_destroy(&connp->conn_lock); kmem_cache_free(ipcl_conn_cache, connp); } @@ -1863,6 +1867,57 @@ ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex, return (NULL); } +/* + * ipcl_get_next_conn + * Get the next entry in the conn global list + * and put a reference on the next_conn; + * decrement the reference on the current conn.
+ * + * This is an iterator based walker function that also provides for + * some selection by the caller. It walks through the conn_hash bucket + * searching for the next valid connp in the list, and selects connections + * that are neither closed nor condemned. It also REFHOLDS the conn + * thus ensuring that the conn exists when the caller uses the conn. + */ +conn_t * +ipcl_get_next_conn(connf_t *connfp, conn_t *connp, uint32_t conn_flags) +{ + conn_t *next_connp; + + if (connfp == NULL) + return (NULL); + + mutex_enter(&connfp->connf_lock); + + next_connp = (connp == NULL) ? + connfp->connf_head : connp->conn_g_next; + + while (next_connp != NULL) { + mutex_enter(&next_connp->conn_lock); + if (!(next_connp->conn_flags & conn_flags) || + (next_connp->conn_state_flags & + (CONN_CONDEMNED | CONN_INCIPIENT))) { + /* + * This conn has been condemned or + * is closing, or the flags don't match + */ + mutex_exit(&next_connp->conn_lock); + next_connp = next_connp->conn_g_next; + continue; + } + CONN_INC_REF_LOCKED(next_connp); + mutex_exit(&next_connp->conn_lock); + break; + } + + mutex_exit(&connfp->connf_lock); + + if (connp != NULL) + CONN_DEC_REF(connp); + + return (next_connp); +} + #ifdef CONN_DEBUG /* * Trace of the last NBUF refhold/refrele diff --git a/usr/src/uts/common/inet/ip/tun.c b/usr/src/uts/common/inet/ip/tun.c index f0507908f5..85fd4b1ec9 100644 --- a/usr/src/uts/common/inet/ip/tun.c +++ b/usr/src/uts/common/inet/ip/tun.c @@ -20,7 +20,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -3693,6 +3693,8 @@ tun_icmp_message_v4(queue_t *q, ipha_t *ipha, icmph_t *icmp, mblk_t *mp) *nicmp = *icmp; nicmp->icmph_checksum = 0; nicmp->icmph_checksum = IP_CSUM(send_mp, sizeof (ipha_t), 0); + if (nicmp->icmph_checksum == 0) + nicmp->icmph_checksum = 0xffff; /* let ip know we are an icmp message */ atomic_add_64(&atp->tun_HCInOctets, @@ -3757,6 +3759,8 @@ tun_icmp_message_v6(queue_t *q, ip6_t *ip6h, icmp6_t *icmp6, uint8_t hoplim, up[12] + up[13] + up[14] + up[15]; sum = (sum & 0xffff) + (sum >> 16); nicmp6->icmp6_cksum = IP_CSUM(send_mp, IPV6_HDR_LEN, sum); + if (nicmp6->icmp6_cksum == 0) + nicmp6->icmp6_cksum = 0xffff; /* let ip know we are an icmp message */ atomic_add_64(&atp->tun_HCInOctets, diff --git a/usr/src/uts/common/inet/ip6.h b/usr/src/uts/common/inet/ip6.h index 5190bb4bf7..8283250d2a 100644 --- a/usr/src/uts/common/inet/ip6.h +++ b/usr/src/uts/common/inet/ip6.h @@ -370,8 +370,7 @@ extern boolean_t ip_hdr_length_nexthdr_v6(mblk_t *, ip6_t *, uint16_t *, uint8_t **); extern int ip_hdr_length_v6(mblk_t *, ip6_t *); extern uint32_t ip_massage_options_v6(ip6_t *, ip6_rthdr_t *); -extern void ip_wput_frag_v6(mblk_t *, ire_t *, uint_t, conn_t *, - boolean_t, int); +extern void ip_wput_frag_v6(mblk_t *, ire_t *, uint_t, conn_t *, int, int); extern void ip_wput_ipsec_out_v6(queue_t *, mblk_t *, ip6_t *, ill_t *, ire_t *); extern int ip_total_hdrs_len_v6(ip6_pkt_t *); diff --git a/usr/src/uts/common/inet/ip_impl.h b/usr/src/uts/common/inet/ip_impl.h new file mode 100644 index 0000000000..f55bb7d6ce --- /dev/null +++ b/usr/src/uts/common/inet/ip_impl.h @@ -0,0 +1,493 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _INET_IP_IMPL_H +#define _INET_IP_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * IP implementation private declarations. These interfaces are + * used to build the IP module and are not meant to be accessed + * by any modules except IP itself. They are undocumented and are + * subject to change without notice. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +#define IP_MOD_ID 5701 + +#ifdef _BIG_ENDIAN +#define IP_HDR_CSUM_TTL_ADJUST 256 +#define IP_TCP_CSUM_COMP IPPROTO_TCP +#define IP_UDP_CSUM_COMP IPPROTO_UDP +#else +#define IP_HDR_CSUM_TTL_ADJUST 1 +#define IP_TCP_CSUM_COMP (IPPROTO_TCP << 8) +#define IP_UDP_CSUM_COMP (IPPROTO_UDP << 8) +#endif + +#define TCP_CHECKSUM_OFFSET 16 +#define TCP_CHECKSUM_SIZE 2 + +#define UDP_CHECKSUM_OFFSET 6 +#define UDP_CHECKSUM_SIZE 2 + +#define IPH_TCPH_CHECKSUMP(ipha, hlen) \ + ((uint16_t *)(((uchar_t *)(ipha)) + ((hlen) + TCP_CHECKSUM_OFFSET))) + +#define IPH_UDPH_CHECKSUMP(ipha, hlen) \ + ((uint16_t *)(((uchar_t *)(ipha)) + ((hlen) + UDP_CHECKSUM_OFFSET))) + +#define ILL_HCKSUM_CAPABLE(ill) \ + (((ill)->ill_capabilities & ILL_CAPAB_HCKSUM) != 0) +/* + * Macro that performs software checksum calculation on the IP header. + */ +#define IP_HDR_CKSUM(ipha, sum, v_hlen_tos_len, ttl_protocol) { \ + (sum) += (ttl_protocol) + (ipha)->ipha_ident + \ + ((v_hlen_tos_len) >> 16) + \ + ((v_hlen_tos_len) & 0xFFFF) + \ + (ipha)->ipha_fragment_offset_and_flags; \ + (sum) = (((sum) & 0xFFFF) + ((sum) >> 16)); \ + (sum) = ~((sum) + ((sum) >> 16)); \ + (ipha)->ipha_hdr_checksum = (uint16_t)(sum); \ +} + +#define IS_IP_HDR_HWCKSUM(ipsec, mp, ill) \ + ((!ipsec) && (DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) && \ + ILL_HCKSUM_CAPABLE(ill) && dohwcksum) + +/* + * This macro acts as a wrapper around IP_CKSUM_XMIT_FAST, and it performs + * several checks on the IRE and ILL (among other things) in order to see + * whether or not hardware checksum offload is allowed for the outgoing + * packet. It assumes that the caller has held a reference to the IRE. + */ +#define IP_CKSUM_XMIT(ill, ire, mp, ihp, up, proto, start, end, \ + max_frag, ipsec_len, pseudo) { \ + uint32_t _hck_flags; \ + /* \ + * We offload checksum calculation to hardware when IPsec isn't \ + * present and if fragmentation isn't required. We also check \ + * if M_DATA fastpath is safe to be used on the corresponding \ + * IRE; this check is performed without grabbing ire_lock but \ + * instead by holding a reference to it. This is sufficient \ + * for IRE_CACHE; for IRE_BROADCAST on non-Ethernet links, the \ + * DL_NOTE_FASTPATH_FLUSH indication could come up from the \ + * driver and trigger the IRE (hence fp_mp) deletion. This is \ + * why only IRE_CACHE type is eligible for offload. 
\ * \ * The presence of IP options also forces the network stack to \ * calculate the checksum in software. This is because: \ * \ * Wrap around: certain partial-checksum NICs (eri, ce) limit \ * the width of the "start offset" field to 6 bits. This effectively \ * sets the largest value of the offset to 64 bytes, starting \ * from the MAC header. When the cumulative MAC and IP headers \ * exceed this limit, the offset will wrap around. This causes \ * the checksum to be calculated at the wrong place. \ * \ * IPv4 source routing: none of the full-checksum capable NICs \ * is capable of correctly handling the IPv4 source-routing \ * option for purposes of calculating the pseudo-header; the \ * actual destination is different from the destination in the \ * header, which is that of the next-hop. (This case may not be \ * true for NICs which can parse IPv6 extension headers, but \ * we choose to simplify the implementation by not offloading \ * checksum when they are present.) \ * \ */ \ if ((ill) != NULL && ILL_HCKSUM_CAPABLE(ill) && \ !((ire)->ire_flags & RTF_MULTIRT) && \ (!((ire)->ire_type & (IRE_BROADCAST|IRE_MIPRTUN)) || \ (ill)->ill_type == IFT_ETHER) && \ (ipsec_len) == 0 && \ (((ire)->ire_ipversion == IPV4_VERSION && \ (start) == IP_SIMPLE_HDR_LENGTH && \ (ire)->ire_fp_mp != NULL && \ MBLKHEAD(mp) >= MBLKL((ire)->ire_fp_mp)) || \ ((ire)->ire_ipversion == IPV6_VERSION && \ (start) == IPV6_HDR_LEN && \ (ire)->ire_nce->nce_fp_mp != NULL && \ MBLKHEAD(mp) >= MBLKL((ire)->ire_nce->nce_fp_mp))) && \ (max_frag) >= (uint_t)((end) + (ipsec_len)) && \ dohwcksum) { \ _hck_flags = (ill)->ill_hcksum_capab->ill_hcksum_txflags; \ } else { \ _hck_flags = 0; \ } \ IP_CKSUM_XMIT_FAST((ire)->ire_ipversion, _hck_flags, mp, ihp, \ up, proto, start, end, pseudo); \ } + +/* + * Based on the device capabilities, this macro either marks an outgoing + * packet with hardware checksum offload information or calculates the + * checksum in software. If the latter is performed, the checksum field + * of the dblk is cleared; otherwise it will be non-zero and contain the + * necessary flag(s) for the driver. + */ +#define IP_CKSUM_XMIT_FAST(ipver, hck_flags, mp, ihp, up, proto, start, \ end, pseudo) { \ uint32_t _sum; \ /* \ * Underlying interface supports hardware checksum offload for \ * the payload; leave the payload checksum for the hardware to \ * calculate. N.B: We only need to set up checksum info on the \ * first mblk. \ */ \ DB_CKSUMFLAGS(mp) = 0; \ if (((ipver) == IPV4_VERSION && \ ((hck_flags) & HCKSUM_INET_FULL_V4)) || \ ((ipver) == IPV6_VERSION && \ ((hck_flags) & HCKSUM_INET_FULL_V6))) { \ /* \ * Hardware calculates pseudo-header, header and the \ * payload checksums, so clear the checksum field in \ * the protocol header. \ */ \ *(up) = 0; \ DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM; \ } else if ((hck_flags) & HCKSUM_INET_PARTIAL) { \ /* \ * Partial checksum offload has been enabled. Fill \ * the checksum field in the protocol header with the \ * pseudo-header checksum value. \ */ \ _sum = ((proto) == IPPROTO_UDP) ? \ IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP; \ _sum += *(up) + (pseudo); \ _sum = (_sum & 0xFFFF) + (_sum >> 16); \ *(up) = (_sum & 0xFFFF) + (_sum >> 16); \ /* \ * Offsets are relative to beginning of IP header. \ */ \ DB_CKSUMSTART(mp) = (start); \ DB_CKSUMSTUFF(mp) = ((proto) == IPPROTO_UDP) ?
\ (start) + UDP_CHECKSUM_OFFSET : \ (start) + TCP_CHECKSUM_OFFSET; \ DB_CKSUMEND(mp) = (end); \ DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM; \ } else { \ /* \ * Software checksumming. \ */ \ _sum = ((proto) == IPPROTO_UDP) ? \ IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP; \ _sum += (pseudo); \ _sum = IP_CSUM(mp, start, _sum); \ *(up) = (uint16_t)(_sum ? _sum : ~_sum); \ } \ /* \ * Hardware supports IP header checksum offload; clear the \ * contents of IP header checksum field as expected by the NIC. \ * Do this only if we offloaded either full or partial sum. \ */ \ if ((ipver) == IPV4_VERSION && DB_CKSUMFLAGS(mp) != 0 && \ ((hck_flags) & HCKSUM_IPHDRCKSUM)) { \ DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM; \ ((ipha_t *)(ihp))->ipha_hdr_checksum = 0; \ } \ } + +/* + * Macro to inspect the checksum of a fully-reassembled incoming datagram. + */ +#define IP_CKSUM_RECV_REASS(hck_flags, off, pseudo, sum, err) { \ (err) = B_FALSE; \ if ((hck_flags) & HCK_FULLCKSUM) { \ /* \ * The sum of all fragment checksums should \ * result in -0 (0xFFFF); anything else is invalid. \ */ \ if ((sum) != 0xFFFF) \ (err) = B_TRUE; \ } else if ((hck_flags) & HCK_PARTIALCKSUM) { \ (sum) += (pseudo); \ (sum) = ((sum) & 0xFFFF) + ((sum) >> 16); \ (sum) = ((sum) & 0xFFFF) + ((sum) >> 16); \ if (~(sum) & 0xFFFF) \ (err) = B_TRUE; \ } else if (((sum) = IP_CSUM(mp, off, pseudo)) != 0) { \ (err) = B_TRUE; \ } \ } + +/* + * This macro inspects an incoming packet to see if the checksum value + * contained in it is valid; if the hardware has provided the information, + * the value is verified, otherwise it performs software checksumming. + * The checksum value is returned to the caller. + */ +#define IP_CKSUM_RECV(hck_flags, sum, cksum_start, ulph_off, mp, mp1, err) { \ int32_t _len; \ \ (err) = B_FALSE; \ if ((hck_flags) & HCK_FULLCKSUM) { \ /* \ * Full checksum has been computed by the hardware \ * and has been attached. If the driver wants us to \ * verify the correctness of the attached value, in \ * order to protect against faulty hardware, compare \ * it against -0 (0xFFFF) to see if it's valid. \ */ \ (sum) = DB_CKSUM16(mp); \ if (!((hck_flags) & HCK_FULLCKSUM_OK) && (sum) != 0xFFFF) \ (err) = B_TRUE; \ } else if (((hck_flags) & HCK_PARTIALCKSUM) && \ ((mp1) == NULL || (mp1)->b_cont == NULL) && \ (ulph_off) >= DB_CKSUMSTART(mp) && \ ((_len = (ulph_off) - DB_CKSUMSTART(mp)) & 1) == 0) { \ uint32_t _adj; \ /* \ * Partial checksum has been calculated by hardware \ * and attached to the packet; in addition, any \ * prepended extraneous data is even byte aligned, \ * and there are at most two mblks associated with \ * the packet. If any such data exists, we adjust \ * the checksum; also take care of any postpended data. \ */ \ IP_ADJCKSUM_PARTIAL(cksum_start, mp, mp1, _len, _adj); \ /* \ * One's complement subtract extraneous checksum \ */ \ (sum) += DB_CKSUM16(mp); \ if (_adj >= (sum)) \ (sum) = ~(_adj - (sum)) & 0xFFFF; \ else \ (sum) -= _adj; \ (sum) = ((sum) & 0xFFFF) + ((int)(sum) >> 16); \ (sum) = ((sum) & 0xFFFF) + ((int)(sum) >> 16); \ if (~(sum) & 0xFFFF) \ (err) = B_TRUE; \ } else if (((sum) = IP_CSUM(mp, ulph_off, sum)) != 0) { \ (err) = B_TRUE; \ } \ } + +/* + * Macro to adjust a given checksum value depending on any prepended + * or postpended data on the packet. It expects the start offset to + * begin at an even boundary and that the packet consists of at most + * two mblks.
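+ *
+ * A small worked example (illustrative only; the values are
+ * hypothetical): if the hardware began its partial sum 4 bytes
+ * before the upper-layer header, IP_CKSUM_RECV above computes
+ * _len = (ulph_off) - DB_CKSUMSTART(mp) = 4 and this macro folds
+ * those extraneous bytes back out of the attached sum:
+ *
+ *	adj = IP_BCSUM_PARTIAL(cksum_start, 4, 0);
+ *	sum += DB_CKSUM16(mp);
+ *	sum -= adj;				one's complement subtract
+ *	sum = (sum & 0xFFFF) + (sum >> 16);	fold the carries
+ *
+ * Bytes trailing DB_CKSUMEND(mp) are folded out the same way.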
*/ +#define IP_ADJCKSUM_PARTIAL(cksum_start, mp, mp1, len, adj) { \ + /* \ + * Prepended extraneous data; adjust checksum. \ + */ \ + if ((len) > 0) \ + (adj) = IP_BCSUM_PARTIAL(cksum_start, len, 0); \ + else \ + (adj) = 0; \ + /* \ + * len is now the total length of mblk(s) \ + */ \ + (len) = MBLKL(mp); \ + if ((mp1) == NULL) \ + (mp1) = (mp); \ + else \ + (len) += MBLKL(mp1); \ + /* \ + * Postpended extraneous data; adjust checksum. \ + */ \ + if (((len) = (DB_CKSUMEND(mp) - len)) > 0) { \ + uint32_t _pad; \ + \ + _pad = IP_BCSUM_PARTIAL((mp1)->b_wptr, len, 0); \ + /* \ + * If the postpended extraneous data was odd \ + * byte aligned, swap resulting checksum bytes. \ + */ \ + if ((uintptr_t)(mp1)->b_wptr & 1) \ + (adj) += ((_pad << 8) & 0xFFFF) | (_pad >> 8); \ + else \ + (adj) += _pad; \ + (adj) = ((adj) & 0xFFFF) + ((int)(adj) >> 16); \ + } \ +} + +#define ILL_MDT_CAPABLE(ill) \ + (((ill)->ill_capabilities & ILL_CAPAB_MDT) != 0) + +/* + * ioctl identifier and structure for Multidata Transmit update + * private M_CTL communication from IP to ULP. + */ +#define MDT_IOC_INFO_UPDATE (('M' << 8) + 1020) + +typedef struct ip_mdt_info_s { + uint_t mdt_info_id; /* MDT_IOC_INFO_UPDATE */ + ill_mdt_capab_t mdt_capab; /* ILL MDT capabilities */ +} ip_mdt_info_t; + +/* + * Macro that determines whether or not a given ILL is allowed for MDT. + */ +#define ILL_MDT_USABLE(ill) \ + (ILL_MDT_CAPABLE(ill) && \ + ill->ill_mdt_capab != NULL && \ + ill->ill_mdt_capab->ill_mdt_version == MDT_VERSION_2 && \ + ill->ill_mdt_capab->ill_mdt_on != 0) + +/* + * Macro that determines whether or not a given CONN may be considered + * for fast path prior to proceeding further with Multidata. + */ +#define CONN_IS_MD_FASTPATH(connp) \ + ((connp)->conn_dontroute == 0 && /* SO_DONTROUTE */ \ + (connp)->conn_nofailover_ill == NULL && /* IPIF_NOFAILOVER */ \ + (connp)->conn_xmit_if_ill == NULL && /* IP_XMIT_IF */ \ + (connp)->conn_outgoing_pill == NULL && /* IP{V6}_BOUND_PIF */ \ + (connp)->conn_outgoing_ill == NULL) /* IP{V6}_BOUND_IF */ + +/* Definitions for fragmenting IP packets using MDT. */ + +/* + * Smaller and private version of pdescinfo_t used specifically for IP, + * which allows for only a single payload span per packet. + */ +typedef struct ip_pdescinfo_s PDESCINFO_STRUCT(2) ip_pdescinfo_t; + +/* + * Macro version of ip_can_frag_mdt() which avoids the function call if we + * only examine a single message block. + */ +#define IP_CAN_FRAG_MDT(mp, hdr_len, len) \ + (((mp)->b_cont == NULL) ? \ + (MBLKL(mp) >= ((hdr_len) + ip_wput_frag_mdt_min)) : \ + ip_can_frag_mdt((mp), (hdr_len), (len))) + +/* + * Macro that determines whether or not a given IPC requires + * outbound IPSEC processing. + */ +#define CONN_IPSEC_OUT_ENCAPSULATED(connp) \ + ((connp)->conn_out_enforce_policy || \ + ((connp)->conn_latch != NULL && \ + (connp)->conn_latch->ipl_out_policy != NULL)) + +/* + * These are used by the synchronous streams code in tcp and udp.
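+ *
+ * A minimal usage sketch (hypothetical caller, not part of this
+ * header; STREAM() is the usual queue-to-stdata macro): a read-side
+ * routine that has queued data for a direct sockfs reader wakes the
+ * reader and posts the signal/poll events:
+ *
+ *	struct stdata *stp = STREAM(connp->conn_rq);
+ *
+ *	STR_WAKEUP_SET(stp);		wake a thread blocked in read()
+ *	STR_SENDSIG(stp);		S_INPUT/S_RDNORM plus pollwakeup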
*/ +#define STR_WAKEUP_CLEAR(stp) { \ + mutex_enter(&stp->sd_lock); \ + stp->sd_wakeq &= ~RSLEEP; \ + mutex_exit(&stp->sd_lock); \ +} + +#define STR_WAKEUP_SET(stp) { \ + mutex_enter(&stp->sd_lock); \ + if (stp->sd_flag & RSLEEP) { \ + stp->sd_flag &= ~RSLEEP; \ + cv_broadcast(&_RD(stp->sd_wrq)->q_wait); \ + } else { \ + stp->sd_wakeq |= RSLEEP; \ + } \ + mutex_exit(&stp->sd_lock); \ +} + +#define STR_SENDSIG(stp) { \ + int _events; \ + mutex_enter(&stp->sd_lock); \ + if ((_events = stp->sd_sigflags & (S_INPUT | S_RDNORM)) != 0) \ + strsendsig(stp->sd_siglist, _events, 0, 0); \ + if (stp->sd_rput_opt & SR_POLLIN) { \ + stp->sd_rput_opt &= ~SR_POLLIN; \ + mutex_exit(&stp->sd_lock); \ + pollwakeup(&stp->sd_pollist, POLLIN | POLLRDNORM); \ + } else { \ + mutex_exit(&stp->sd_lock); \ + } \ +} + +#define CONN_UDP_SYNCSTR(connp) \ + (IPCL_IS_UDP(connp) && (connp)->conn_udp->udp_direct_sockfs) + +/* + * Macro that checks whether or not a particular UDP conn is + * flow-controlling on the read-side. If the udp module is directly + * above ip, check to see if the drain queue is full; note here + * that we check this without any lock protection because this + * is a coarse granularity inbound flow-control. If the module + * above ip is not udp, then use canputnext to determine the + * flow-control. + * + * Note that these checks are done after the conn is found in + * the UDP fanout table. A UDP conn in that table may have its + * IPCL_UDP bit cleared from the conn_flags when the application + * pops the udp module without issuing an unbind; in this case + * IP will still receive packets for the conn and deliver them + * upstream via putnext. This is the reason why we have to test + * against IPCL_UDP. + */ +#define CONN_UDP_FLOWCTLD(connp) \ + ((CONN_UDP_SYNCSTR(connp) && \ + (connp)->conn_udp->udp_drain_qfull) || \ + (!CONN_UDP_SYNCSTR(connp) && !canputnext((connp)->conn_rq))) + +/* + * Macro that delivers a given message upstream; if the udp module + * is directly above ip, the message is passed directly into + * the stream-less entry point. Otherwise putnext is used. + */ +#define CONN_UDP_RECV(connp, mp) { \ + if (IPCL_IS_UDP(connp)) \ + udp_conn_recv(connp, mp); \ + else \ + putnext((connp)->conn_rq, mp); \ +} + +#define ILL_POLL_CAPABLE(ill) \ + (((ill)->ill_capabilities & ILL_CAPAB_POLL) != 0) + +/* + * Macro that hands off one or more messages directly to DLD + * when the interface is marked with ILL_CAPAB_POLL.
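+ *
+ * Illustrative use (a sketch of a hypothetical transmit path, not
+ * quoted from this change): with the link-layer header already in
+ * place, the sender can bypass the STREAMS write queue entirely:
+ *
+ *	if (ILL_POLL_CAPABLE(ill))
+ *		IP_POLL_ILL_TX(ill, mp);
+ *	else
+ *		putnext(ill->ill_wq, mp);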
+ */ +#define IP_POLL_ILL_TX(ill, mp) { \ + ill_poll_capab_t *ill_poll = ill->ill_poll_capab; \ + ASSERT(ILL_POLL_CAPABLE(ill)); \ + ASSERT(ill_poll != NULL); \ + ASSERT(ill_poll->ill_tx != NULL); \ + ASSERT(ill_poll->ill_tx_handle != NULL); \ + ill_poll->ill_tx(ill_poll->ill_tx_handle, mp); \ +} + +extern int ip_wput_frag_mdt_min; +extern boolean_t ip_can_frag_mdt(mblk_t *, ssize_t, ssize_t); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _INET_IP_IMPL_H */ diff --git a/usr/src/uts/common/inet/ipclassifier.h b/usr/src/uts/common/inet/ipclassifier.h index 78de4a0b86..a5148c57c0 100644 --- a/usr/src/uts/common/inet/ipclassifier.h +++ b/usr/src/uts/common/inet/ipclassifier.h @@ -37,6 +37,7 @@ extern "C" { #include <inet/ip.h> #include <inet/mi.h> #include <inet/tcp.h> +#include <inet/udp_impl.h> #include <inet/ip6.h> #include <netinet/in.h> /* for IPPROTO_* constants */ #include <sys/sdt.h> @@ -58,17 +59,19 @@ typedef void (*edesc_rpf)(void *, mblk_t *, void *); */ /* Conn Flags */ -#define IPCL_BOUND 0x80000000 /* Conn in bind table */ -#define IPCL_CONNECTED 0x40000000 /* Conn in connected table */ -#define IPCL_TCP4 0x08000000 /* A TCP connection */ -#define IPCL_TCP6 0x04000000 /* A TCP6 connection */ -#define IPCL_EAGER 0x01000000 /* Incoming connection */ -#define IPCL_CL_LISTENER 0x00800000 /* Cluster listener */ -#define IPCL_ACCEPTOR 0x00400000 /* Sockfs priv acceptor */ -#define IPCL_SOCKET 0x00200000 /* Sockfs connection */ -#define IPCL_CHECK_POLICY 0x00100000 /* Needs policy checking */ +#define IPCL_UDPMOD 0x00020000 /* Is UDP module instance */ +#define IPCL_TCPMOD 0x00040000 /* Is TCP module instance */ #define IPCL_FULLY_BOUND 0x00080000 /* Bound to correct squeue */ -#define IPCL_TCPMOD 0x00040000 /* Is tcp module instance */ +#define IPCL_CHECK_POLICY 0x00100000 /* Needs policy checking */ +#define IPCL_SOCKET 0x00200000 /* Sockfs connection */ +#define IPCL_ACCEPTOR 0x00400000 /* Sockfs priv acceptor */ +#define IPCL_CL_LISTENER 0x00800000 /* Cluster listener */ +#define IPCL_EAGER 0x01000000 /* Incoming connection */ +#define IPCL_UDP 0x02000000 /* A UDP connection */ +#define IPCL_TCP6 0x04000000 /* A TCP6 connection */ +#define IPCL_TCP4 0x08000000 /* A TCP connection */ +#define IPCL_CONNECTED 0x40000000 /* Conn in connected table */ +#define IPCL_BOUND 0x80000000 /* Conn in bind table */ /* Flags identifying the type of conn */ #define IPCL_TCPCONN 0x00000001 /* Flag to indicate cache */ @@ -81,8 +84,6 @@ typedef void (*edesc_rpf)(void *, mblk_t *, void *); #define IPCL_REMOVED 0x00000020 #define IPCL_REUSED 0x00000040 -#define IS_TCP_CONN(connp) (((connp)->conn_flags & IPCL_TCP) != 0) - #define IPCL_IS_TCP4(connp) \ (((connp)->conn_flags & IPCL_TCP4)) @@ -108,6 +109,13 @@ typedef void (*edesc_rpf)(void *, mblk_t *, void *); #define IPCL_IS_TCP(connp) \ ((connp)->conn_flags & (IPCL_TCP4|IPCL_TCP6)) +/* + * IPCL_UDP is set on the conn when udp is directly above ip; + * this flag is cleared the moment udp is popped. 
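+ *
+ * For illustration, this mirrors the CONN_UDP_RECV macro in
+ * ip_impl.h: inbound delivery tests the flag to choose between the
+ * stream-less entry point and the regular STREAMS path:
+ *
+ *	if (IPCL_IS_UDP(connp))
+ *		udp_conn_recv(connp, mp);
+ *	else
+ *		putnext((connp)->conn_rq, mp);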
+ */ +#define IPCL_IS_UDP(connp) \ + ((connp)->conn_flags & IPCL_UDP) + #define IPCL_IS_IPTUN(connp) \ ((connp)->conn_ulp == IPPROTO_ENCAP || \ (connp)->conn_ulp == IPPROTO_IPV6) @@ -169,6 +177,8 @@ struct conn_s { pad_to_bit_31 : 2; tcp_t *conn_tcp; /* Pointer to the tcp struct */ + udp_t *conn_udp; /* Pointer to the udp struct */ + squeue_t *conn_sqp; /* Squeue for processing */ edesc_rpf conn_recv; /* Pointer to recv routine */ void *conn_pad1; @@ -483,6 +493,7 @@ extern int ipcl_conn_insert(conn_t *, uint8_t, ipaddr_t, ipaddr_t, uint32_t); extern int ipcl_conn_insert_v6(conn_t *, uint8_t, const in6_addr_t *, const in6_addr_t *, uint32_t, uint_t); +extern conn_t *ipcl_get_next_conn(connf_t *, conn_t *, uint32_t); void ipcl_proto_insert(conn_t *, uint8_t); void ipcl_proto_insert_v6(conn_t *, uint8_t); diff --git a/usr/src/uts/common/inet/ipp_common.h b/usr/src/uts/common/inet/ipp_common.h index fff5a4ba7f..5703f29d48 100644 --- a/usr/src/uts/common/inet/ipp_common.h +++ b/usr/src/uts/common/inet/ipp_common.h @@ -20,7 +20,7 @@ * CDDL HEADER END */ /* - * Copyright 2002, 2003 Sun Microsystems, Inc. All rights reserved. + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -52,7 +52,7 @@ extern uint32_t ipp_action_count; /* Apply IPQoS policies for inbound traffic? */ #define IP6_IN_IPP(flags) (IPP_ENABLED(IPP_LOCAL_IN) && \ - (!((flags) & (IP6_NO_IPPOLICY|IP6_IN_NOCKSUM)))) + (!((flags) & IP6_NO_IPPOLICY))) /* Apply IPQoS policies for oubound traffic? */ #define IP6_OUT_IPP(flags) \ diff --git a/usr/src/uts/common/inet/led.h b/usr/src/uts/common/inet/led.h index 463c8acb70..1e7ba80cff 100644 --- a/usr/src/uts/common/inet/led.h +++ b/usr/src/uts/common/inet/led.h @@ -20,7 +20,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -44,12 +44,12 @@ extern "C" { #include <sys/types.h> /* - * Intel x86 can handle unaligned access. However, the checksum routine + * x86 can handle unaligned access. However, the checksum routine * assumes that the source is 16 bit aligned so we always make sure * that packet headers are 16 bit aligned. */ #define OK_16PTR(p) (!((uintptr_t)(p) & 0x1)) -#if defined(__i386) +#if defined(__x86) #define OK_32PTR(p) OK_16PTR(p) #else #define OK_32PTR(p) (!((uintptr_t)(p) & 0x3)) diff --git a/usr/src/uts/common/inet/optcom.c b/usr/src/uts/common/inet/optcom.c index af4b08b0e5..8e4ce9358a 100644 --- a/usr/src/uts/common/inet/optcom.c +++ b/usr/src/uts/common/inet/optcom.c @@ -20,7 +20,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -82,8 +82,6 @@ static boolean_t opt_length_ok(opdes_t *, struct T_opthdr *); static t_uscalar_t optcom_max_optbuf_len(opdes_t *, uint_t); static boolean_t opt_bloated_maxsize(opdes_t *); -extern optdb_obj_t tcp_opt_obj; - /* Common code for sending back a T_ERROR_ACK. 
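 *
 * An illustrative call (a hypothetical failure path, not taken from
 * this patch): a module that cannot allocate its option buffer
 * replies with
 *
 *	optcom_err_ack(q, mp, TSYSERR, ENOMEM);
 *
 * which sends a T_ERROR_ACK upstream in place of a T_OPTMGMT_ACK.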
*/ void optcom_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error) @@ -220,9 +218,12 @@ svr4_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp) opdes_t *optd; boolean_t pass_to_next = B_FALSE; boolean_t pass_to_ip = B_FALSE; + boolean_t is_tcp; struct T_optmgmt_ack *toa; struct T_optmgmt_req *tor; + is_tcp = (dbobjp == &tcp_opt_obj); + /* * Allocate M_CTL and prepend to the packet for restarting this * option if needed. IP may need to queue and restart the option @@ -550,14 +551,14 @@ no_mem:; opt1->len = opt->len; bcopy(&opt[1], &opt1[1], opt->len); /* - * Pass the option down to IP only if - * TCP hasn't processed it. + * Pass the option down to IP only + * if TCP hasn't processed it. */ - if (dbobjp == &tcp_opt_obj) + if (is_tcp) pass_to_ip = B_TRUE; - } - else + } else { opt1->len = (t_uscalar_t)len; + } opt1 = (struct opthdr *)((uchar_t *)&opt1[1] + _TPI_ALIGN_OPT(opt1->len)); } /* end for loop */ @@ -639,10 +640,10 @@ restart: optcom_err_ack(q, mp, TSYSERR, error); freeb(first_mp); return (0); - } else if (error < 0 && dbobjp == &tcp_opt_obj) { + } else if (error < 0 && is_tcp) { /* - * Pass the option down to IP only if - * TCP hasn't processed it. + * Pass the option down to IP only + * if TCP hasn't processed it. */ pass_to_ip = B_TRUE; } diff --git a/usr/src/uts/common/inet/optcom.h b/usr/src/uts/common/inet/optcom.h index 8f9226de18..84a64c5317 100644 --- a/usr/src/uts/common/inet/optcom.h +++ b/usr/src/uts/common/inet/optcom.h @@ -20,7 +20,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -205,6 +205,18 @@ typedef struct opt_restart_s { #define SETFN_CONN_NEGOTIATE 4 /* semantics for T_CONN_*_REQ */ /* + * Object to represent database of options to search passed to + * {sock,tpi}optcom_req() interface routine to take care of option + * management and associated methods. + */ +extern optdb_obj_t tcp_opt_obj; +extern optdb_obj_t udp_opt_obj; +extern optdb_obj_t ip_opt_obj; + +extern uint_t tcp_max_optsize; +extern uint_t udp_max_optsize; + +/* * Function prototypes */ extern void optcom_err_ack(queue_t *, mblk_t *, t_scalar_t, int); diff --git a/usr/src/uts/common/inet/snmpcom.c b/usr/src/uts/common/inet/snmpcom.c index 852fb167b9..fa417fae88 100644 --- a/usr/src/uts/common/inet/snmpcom.c +++ b/usr/src/uts/common/inet/snmpcom.c @@ -20,7 +20,7 @@ * CDDL HEADER END */ /* - * Copyright 1992,1997-2003 Sun Microsystems, Inc. All rights reserved. + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -51,6 +51,11 @@ #include <inet/optcom.h> #include <inet/snmpcom.h> +#include <inet/ip.h> +#include <inet/ip6.h> +#include <inet/tcp.h> +#include <inet/udp_impl.h> + #define DEFAULT_LENGTH sizeof (long) #define DATA_MBLK_SIZE 1024 #define TOAHDR_SIZE (sizeof (struct T_optmgmt_ack) +\ @@ -90,10 +95,7 @@ static sor_t req_arr[] = { * ctl buffer. */ int -snmp_append_data(mpdata, blob, len) - mblk_t *mpdata; - char *blob; - int len; +snmp_append_data(mblk_t *mpdata, char *blob, int len) { if (!mpdata) @@ -169,12 +171,7 @@ snmp_append_data2(mblk_t *mpdata, mblk_t **last_mpp, char *blob, int len) * for them: getfn() returns 0, setfn() returns 1. 
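 *
 * For example (illustrative; it mirrors the tcp/udp usage visible
 * elsewhere in this patch), a transport wput routine dispatches
 * SNMP requests as
 *
 *	if (snmpcom_req(q, mp, tcp_snmp_set, tcp_snmp_get, credp))
 *		return;		consumed as an SNMP request
 *
 * and continues with ordinary T_OPTMGMT_REQ handling when B_FALSE
 * comes back.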
*/ boolean_t -snmpcom_req(q, mp, setfn, getfn, credp) - queue_t *q; - mblk_t *mp; - pfi_t setfn; - pfi_t getfn; - cred_t *credp; +snmpcom_req(queue_t *q, mblk_t *mp, pfi_t setfn, pfi_t getfn, cred_t *credp) { mblk_t *mpctl; struct opthdr *req; @@ -184,6 +181,7 @@ snmpcom_req(q, mp, setfn, getfn, credp) sor_t *sreq; struct T_optmgmt_req *tor = (struct T_optmgmt_req *)mp->b_rptr; struct T_optmgmt_ack *toa; + boolean_t pass_to_ip = B_FALSE; if (mp->b_cont) { /* don't deal with multiple mblk's */ freemsg(mp->b_cont); @@ -209,6 +207,10 @@ snmpcom_req(q, mp, setfn, getfn, credp) req_start->level <= EXPER_RANGE_END))) return (B_FALSE); + if (setfn == tcp_snmp_set || setfn == udp_snmp_set || + getfn == tcp_snmp_get || getfn == udp_snmp_get) + pass_to_ip = B_TRUE; + switch (tor->MGMT_flags) { case T_NEGOTIATE: @@ -235,8 +237,10 @@ snmpcom_req(q, mp, setfn, getfn, credp) (uchar_t *)&req[1], req->len)) goto bad_req4; } - if (q->q_next) + if (q->q_next != NULL) putnext(q, mp); + else if (pass_to_ip) + ip_output(Q_TO_CONN(q), mp, q, IP_WPUT); else freemsg(mp); return (B_TRUE); @@ -268,9 +272,12 @@ snmpcom_req(q, mp, setfn, getfn, credp) * this is bottom module of stream, send up an EOD ctl msg, * otherwise pass onto the next guy for processing. */ - if (q->q_next) { + if (q->q_next != NULL) { putnext(q, mp); return (B_TRUE); + } else if (pass_to_ip) { + ip_output(Q_TO_CONN(q), mp, q, IP_WPUT); + return (B_TRUE); } if (mp->b_cont) { freemsg(mp->b_cont); diff --git a/usr/src/uts/common/inet/squeue.c b/usr/src/uts/common/inet/squeue.c index ab6aae1a88..48e9409721 100644 --- a/usr/src/uts/common/inet/squeue.c +++ b/usr/src/uts/common/inet/squeue.c @@ -729,7 +729,8 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, sqproc_t proc, void *arg, #endif #if SQUEUE_DEBUG conn_t *connp = (conn_t *)arg; - ASSERT(connp->conn_tcp->tcp_connp == connp); + ASSERT(!IPCL_IS_TCP(connp) || connp->conn_tcp->tcp_connp == connp); + ASSERT(!IPCL_IS_UDP(connp) || connp->conn_udp->udp_connp == connp); #endif ASSERT(proc != NULL); @@ -954,9 +955,10 @@ squeue_enter_nodrain(squeue_t *sqp, mblk_t *mp, sqproc_t proc, void *arg, ASSERT(sqp != NULL); ASSERT(mp != NULL); ASSERT(mp->b_next == NULL); - ASSERT(connp->conn_tcp->tcp_connp == connp); - + ASSERT(!IPCL_IS_TCP(connp) || connp->conn_tcp->tcp_connp == connp); + ASSERT(!IPCL_IS_UDP(connp) || connp->conn_udp->udp_connp == connp); ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock)); + mutex_enter(&sqp->sq_lock); being_processed = (sqp->sq_state & SQS_PROC); @@ -1100,7 +1102,8 @@ squeue_fill(squeue_t *sqp, mblk_t *mp, sqproc_t proc, void * arg, ASSERT(sqp != NULL); ASSERT(mp != NULL); ASSERT(mp->b_next == NULL); - ASSERT(connp->conn_tcp->tcp_connp == connp); + ASSERT(!IPCL_IS_TCP(connp) || connp->conn_tcp->tcp_connp == connp); + ASSERT(!IPCL_IS_UDP(connp) || connp->conn_udp->udp_connp == connp); ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock)); mutex_enter(&sqp->sq_lock); diff --git a/usr/src/uts/common/inet/tcp.h b/usr/src/uts/common/inet/tcp.h index 8a2ac05292..fbd594e6e6 100644 --- a/usr/src/uts/common/inet/tcp.h +++ b/usr/src/uts/common/inet/tcp.h @@ -286,11 +286,8 @@ typedef struct tcp_s { tcp_accept_error : 1, /* Error during TLI accept */ tcp_send_discon_ind : 1, /* TLI accept err, send discon ind */ - tcp_fused : 1, /* loopback tcp in fusion mode */ - tcp_unfusable : 1, /* fusion not allowed on endpoint */ - tcp_fused_sigurg : 1, /* send SIGURG upon draining */ tcp_cork : 1, /* tcp_cork option */ - tcp_pad_to_bit_31 : 15; + tcp_pad_to_bit_31 : 18; uint32_t tcp_if_mtu; /* Outgoing interface MTU. 
*/ @@ -514,10 +511,29 @@ typedef struct tcp_s { #define tcp_ipp_use_min_mtu tcp_sticky_ipp.ipp_use_min_mtu struct tcp_s *tcp_saved_listener; /* saved value of listener */ + uint32_t tcp_in_ack_unsent; /* ACK for unsent data cnt. */ + + /* + * The following fusion-related fields are protected by squeue. + */ struct tcp_s *tcp_loopback_peer; /* peer tcp for loopback */ mblk_t *tcp_fused_sigurg_mp; /* M_PCSIG mblk for SIGURG */ + size_t tcp_fuse_rcv_hiwater; /* fusion receive queue size */ + uint_t tcp_fuse_rcv_unread_hiwater; /* max # of outstanding pkts */ + /* + * The following fusion-related fields and bit fields are to be + * manipulated with squeue protection or with tcp_fuse_lock held. + */ + kmutex_t tcp_fuse_lock; + uint_t tcp_fuse_rcv_unread_cnt; /* # of outstanding pkts */ + uint32_t + tcp_fused : 1, /* loopback tcp in fusion mode */ + tcp_unfusable : 1, /* fusion not allowed on endpoint */ + tcp_fused_sigurg : 1, /* send SIGURG upon draining */ + tcp_direct_sockfs : 1, /* direct calls to sockfs */ - uint32_t tcp_in_ack_unsent; /* ACK for unsent data cnt. */ + tcp_fuse_syncstr_stopped : 1, /* synchronous streams stopped */ + tcp_fuse_to_bit_31 : 27; /* * This variable is accessed without any lock protection @@ -525,6 +541,8 @@ typedef struct tcp_s { * with the rest which require such condition. */ boolean_t tcp_issocket; /* this is a socket tcp */ + + uint32_t tcp_squeue_bytes; } tcp_t; extern void tcp_free(tcp_t *tcp); @@ -537,7 +555,8 @@ extern void tcp_input(void *arg, mblk_t *mp, void *arg2); extern void tcp_rput_data(void *arg, mblk_t *mp, void *arg2); extern void *tcp_get_conn(void *arg); extern void tcp_time_wait_collector(void *arg); - +extern int tcp_snmp_get(queue_t *, mblk_t *); +extern int tcp_snmp_set(queue_t *, int, int, uchar_t *, int len); /* * The TCP Fanout structure. * The hash tables and their linkage (tcp_*_hash_next, tcp_ptp*hn) are @@ -610,18 +629,6 @@ typedef struct tcp_ioc_abort_conn_s { #pragma pack() #endif -/* Named Dispatch Parameter Management Structure */ -typedef struct tcpparam_s { - uint32_t tcp_param_min; - uint32_t tcp_param_max; - uint32_t tcp_param_val; - char *tcp_param_name; -} tcpparam_t; - -extern tcpparam_t tcp_param_arr[]; - -extern boolean_t do_tcp_fusion; - #if (defined(_KERNEL) || defined(_KMEMUSER)) extern void tcp_rput_other(tcp_t *tcp, mblk_t *mp); #endif diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c index 8c651d1443..9b995cd7df 100644 --- a/usr/src/uts/common/inet/tcp/tcp.c +++ b/usr/src/uts/common/inet/tcp/tcp.c @@ -73,6 +73,7 @@ const char tcp_version[] = "%Z%%M% %I% %E% SMI"; #include <inet/common.h> #include <inet/ip.h> +#include <inet/ip_impl.h> #include <inet/ip6.h> #include <inet/ip_ndp.h> #include <inet/mi.h> @@ -82,6 +83,7 @@ const char tcp_version[] = "%Z%%M% %I% %E% SMI"; #include <inet/snmpcom.h> #include <inet/kstatcom.h> #include <inet/tcp.h> +#include <inet/tcp_impl.h> #include <net/pfkeyv2.h> #include <inet/ipsec_info.h> #include <inet/ipdrop.h> @@ -230,8 +232,6 @@ int tcp_squeue_wput = 2; squeue_func_t tcp_squeue_close_proc; squeue_func_t tcp_squeue_wput_proc; -extern vmem_t *ip_minor_arena; - /* * This controls how tiny a write must be before we try to copy it * into the the mblk on the tail of the transmit queue. Not much @@ -278,9 +278,6 @@ int tcp_tx_pull_len = 16; * TCP_MAX_CLEAN_DEATH_TAG is the maximum number of possible clean death tags. 
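 *
 * For illustration (a hypothetical call site; the signature of
 * tcp_clean_death() is assumed, not quoted from this hunk): a
 * teardown path tags its reason so the per-tag counter can later be
 * read from a debugger:
 *
 *	TCP_CLD_STAT(tag);		tcp_clean_death_stat[tag]++
 *	(void) tcp_clean_death(tcp, err, tag);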
*/ -#define TCP_COUNTERS 1 -#define TCP_CLD_COUNTERS 0 - #ifndef TCP_DEBUG_COUNTER #ifdef DEBUG #define TCP_DEBUG_COUNTER 1 @@ -289,6 +286,7 @@ int tcp_tx_pull_len = 16; #endif #endif +#define TCP_CLD_COUNTERS 0 #define TCP_TAG_CLEAN_DEATH 1 #define TCP_MAX_CLEAN_DEATH_TAG 32 @@ -297,20 +295,6 @@ int tcp_tx_pull_len = 16; static int _lint_dummy_; #endif -#if TCP_COUNTERS -#define TCP_STAT(x) (tcp_statistics.x.value.ui64++) -#define TCP_STAT_UPDATE(x, n) (tcp_statistics.x.value.ui64 += (n)) -#define TCP_STAT_SET(x, n) (tcp_statistics.x.value.ui64 = (n)) -#elif defined(lint) -#define TCP_STAT(x) ASSERT(_lint_dummy_ == 0); -#define TCP_STAT_UPDATE(x, n) ASSERT(_lint_dummy_ == 0); -#define TCP_STAT_SET(x, n) ASSERT(_lint_dummy_ == 0); -#else -#define TCP_STAT(x) -#define TCP_STAT_UPDATE(x, n) -#define TCP_STAT_SET(x, n) -#endif - #if TCP_CLD_COUNTERS static uint_t tcp_clean_death_stat[TCP_MAX_CLEAN_DEATH_TAG]; #define TCP_CLD_STAT(x) tcp_clean_death_stat[x]++ @@ -328,96 +312,7 @@ static uint_t tcp_clean_death_stat[TCP_MAX_CLEAN_DEATH_TAG]; #define TCP_DBGSTAT(x) #endif -typedef struct tcp_stat { - kstat_named_t tcp_time_wait; - kstat_named_t tcp_time_wait_syn; - kstat_named_t tcp_time_wait_syn_success; - kstat_named_t tcp_time_wait_syn_fail; - kstat_named_t tcp_reinput_syn; - kstat_named_t tcp_ip_output; - kstat_named_t tcp_detach_non_time_wait; - kstat_named_t tcp_detach_time_wait; - kstat_named_t tcp_time_wait_reap; - kstat_named_t tcp_clean_death_nondetached; - kstat_named_t tcp_reinit_calls; - kstat_named_t tcp_eager_err1; - kstat_named_t tcp_eager_err2; - kstat_named_t tcp_eager_blowoff_calls; - kstat_named_t tcp_eager_blowoff_q; - kstat_named_t tcp_eager_blowoff_q0; - kstat_named_t tcp_not_hard_bound; - kstat_named_t tcp_no_listener; - kstat_named_t tcp_found_eager; - kstat_named_t tcp_wrong_queue; - kstat_named_t tcp_found_eager_binding1; - kstat_named_t tcp_found_eager_bound1; - kstat_named_t tcp_eager_has_listener1; - kstat_named_t tcp_open_alloc; - kstat_named_t tcp_open_detached_alloc; - kstat_named_t tcp_rput_time_wait; - kstat_named_t tcp_listendrop; - kstat_named_t tcp_listendropq0; - kstat_named_t tcp_wrong_rq; - kstat_named_t tcp_rsrv_calls; - kstat_named_t tcp_eagerfree2; - kstat_named_t tcp_eagerfree3; - kstat_named_t tcp_eagerfree4; - kstat_named_t tcp_eagerfree5; - kstat_named_t tcp_timewait_syn_fail; - kstat_named_t tcp_listen_badflags; - kstat_named_t tcp_timeout_calls; - kstat_named_t tcp_timeout_cached_alloc; - kstat_named_t tcp_timeout_cancel_reqs; - kstat_named_t tcp_timeout_canceled; - kstat_named_t tcp_timermp_alloced; - kstat_named_t tcp_timermp_freed; - kstat_named_t tcp_timermp_allocfail; - kstat_named_t tcp_timermp_allocdblfail; - kstat_named_t tcp_push_timer_cnt; - kstat_named_t tcp_ack_timer_cnt; - kstat_named_t tcp_ire_null1; - kstat_named_t tcp_ire_null; - kstat_named_t tcp_ip_send; - kstat_named_t tcp_ip_ire_send; - kstat_named_t tcp_wsrv_called; - kstat_named_t tcp_flwctl_on; - kstat_named_t tcp_timer_fire_early; - kstat_named_t tcp_timer_fire_miss; - kstat_named_t tcp_freelist_cleanup; - kstat_named_t tcp_rput_v6_error; - kstat_named_t tcp_out_sw_cksum; - kstat_named_t tcp_zcopy_on; - kstat_named_t tcp_zcopy_off; - kstat_named_t tcp_zcopy_backoff; - kstat_named_t tcp_zcopy_disable; - kstat_named_t tcp_mdt_pkt_out; - kstat_named_t tcp_mdt_pkt_out_v4; - kstat_named_t tcp_mdt_pkt_out_v6; - kstat_named_t tcp_mdt_discarded; - kstat_named_t tcp_mdt_conn_halted1; - kstat_named_t tcp_mdt_conn_halted2; - kstat_named_t tcp_mdt_conn_halted3; - kstat_named_t 
tcp_mdt_conn_resumed1; - kstat_named_t tcp_mdt_conn_resumed2; - kstat_named_t tcp_mdt_legacy_small; - kstat_named_t tcp_mdt_legacy_all; - kstat_named_t tcp_mdt_legacy_ret; - kstat_named_t tcp_mdt_allocfail; - kstat_named_t tcp_mdt_addpdescfail; - kstat_named_t tcp_mdt_allocd; - kstat_named_t tcp_mdt_linked; - kstat_named_t tcp_fusion_flowctl; - kstat_named_t tcp_fusion_backenabled; - kstat_named_t tcp_fusion_urg; - kstat_named_t tcp_fusion_putnext; - kstat_named_t tcp_fusion_unfusable; - kstat_named_t tcp_fusion_aborted; - kstat_named_t tcp_fusion_unqualified; - kstat_named_t tcp_in_ack_unsent_drop; -} tcp_stat_t; - -#if (TCP_COUNTERS || TCP_DEBUG_COUNTER) -static tcp_stat_t tcp_statistics = { +tcp_stat_t tcp_statistics = { { "tcp_time_wait", KSTAT_DATA_UINT64 }, { "tcp_time_wait_syn", KSTAT_DATA_UINT64 }, { "tcp_time_wait_success", KSTAT_DATA_UINT64 }, @@ -475,6 +370,7 @@ static tcp_stat_t tcp_statistics = { { "tcp_freelist_cleanup", KSTAT_DATA_UINT64 }, { "tcp_rput_v6_error", KSTAT_DATA_UINT64 }, { "tcp_out_sw_cksum", KSTAT_DATA_UINT64 }, + { "tcp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, { "tcp_zcopy_on", KSTAT_DATA_UINT64 }, { "tcp_zcopy_off", KSTAT_DATA_UINT64 }, { "tcp_zcopy_backoff", KSTAT_DATA_UINT64 }, @@ -502,13 +398,14 @@ static tcp_stat_t tcp_statistics = { { "tcp_fusion_unfusable", KSTAT_DATA_UINT64 }, { "tcp_fusion_aborted", KSTAT_DATA_UINT64 }, { "tcp_fusion_unqualified", KSTAT_DATA_UINT64 }, + { "tcp_fusion_rrw_busy", KSTAT_DATA_UINT64 }, + { "tcp_fusion_rrw_msgcnt", KSTAT_DATA_UINT64 }, { "tcp_in_ack_unsent_drop", KSTAT_DATA_UINT64 }, + { "tcp_sock_fallback", KSTAT_DATA_UINT64 }, }; static kstat_t *tcp_kstat; -#endif - /* * Call either ip_output or ip_output_v6. This replaces putnext() calls on the * tcp write side. @@ -519,12 +416,6 @@ static kstat_t *tcp_kstat; connp->conn_send(connp, (mp), (q), IP_WPUT); \ } -/* - * Was this tcp created via socket() interface? - */ -#define TCP_IS_SOCKET(tcp) ((tcp)->tcp_issocket) - - /* Macros for timestamp comparisons */ #define TSTMP_GEQ(a, b) ((int32_t)((a)-(b)) >= 0) #define TSTMP_LT(a, b) ((int32_t)((a)-(b)) < 0) @@ -569,8 +460,6 @@ static ipdropper_t tcp_dropper; */ #define TCP_OLD_URP_INTERPRETATION 1 -#define TCP_IS_DETACHED(tcp) ((tcp)->tcp_detached) - #define TCP_IS_DETACHED_NONEAGER(tcp) \ (TCP_IS_DETACHED(tcp) && \ (!(tcp)->tcp_hard_binding)) @@ -687,22 +576,6 @@ static kmem_cache_t *tcp_timercache; kmem_cache_t *tcp_sack_info_cache; kmem_cache_t *tcp_iphc_cache; -#define TCP_TIMER(tcp, f, tim) tcp_timeout(tcp->tcp_connp, f, tim) -#define TCP_TIMER_CANCEL(tcp, id) tcp_timeout_cancel(tcp->tcp_connp, id) - -/* - * To restart the TCP retransmission timer. - */ -#define TCP_TIMER_RESTART(tcp, intvl) \ -{ \ - if ((tcp)->tcp_timer_tid != 0) { \ - (void) TCP_TIMER_CANCEL((tcp), \ - (tcp)->tcp_timer_tid); \ - } \ - (tcp)->tcp_timer_tid = TCP_TIMER((tcp), tcp_timer, \ - MSEC_TO_TICK(intvl)); \ -} - /* * For scalability, we must not run a timer for every TCP connection * in TIME_WAIT state. 
To see why, consider (for time wait interval of @@ -951,7 +824,6 @@ static void tcp_ip_notify(tcp_t *tcp); static mblk_t *tcp_ire_mp(mblk_t *mp); static void tcp_iss_init(tcp_t *tcp); static void tcp_keepalive_killer(void *arg); -static int tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk); static int tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt); static void tcp_mss_set(tcp_t *tcp, uint32_t size); static int tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, @@ -985,7 +857,6 @@ static void tcp_report_item(mblk_t *mp, tcp_t *tcp, int hashval, tcp_t *thisstream, cred_t *cr); static uint_t tcp_rcv_drain(queue_t *q, tcp_t *tcp); -static void tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len); static void tcp_sack_rxmit(tcp_t *tcp, uint_t *flags); static boolean_t tcp_send_rst_chk(void); static void tcp_ss_rexmit(tcp_t *tcp); @@ -994,9 +865,6 @@ static void tcp_process_options(tcp_t *, tcph_t *); static void tcp_rput_common(tcp_t *tcp, mblk_t *mp); static void tcp_rsrv(queue_t *q); static int tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd); -static int tcp_snmp_get(queue_t *q, mblk_t *mpctl); -static int tcp_snmp_set(queue_t *q, int level, int name, uchar_t *ptr, - int len); static int tcp_snmp_state(tcp_t *tcp); static int tcp_status_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr); @@ -1018,7 +886,6 @@ static void tcp_timer(void *arg); static void tcp_timer_callback(void *); static in_port_t tcp_update_next_port(in_port_t port, boolean_t random); static in_port_t tcp_get_next_priv_port(void); -static void tcp_wput(queue_t *q, mblk_t *mp); static void tcp_wput_sock(queue_t *q, mblk_t *mp); void tcp_wput_accept(queue_t *q, mblk_t *mp); static void tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent); @@ -1044,7 +911,6 @@ static mblk_t *tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, boolean_t sendall, uint32_t *seg_len, boolean_t rexmit); static void tcp_ack_timer(void *arg); static mblk_t *tcp_ack_mp(tcp_t *tcp); -static void tcp_push_timer(void *arg); static void tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, uint32_t ack, int ctl, uint_t ip_hdr_len); static void tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, @@ -1076,9 +942,6 @@ boolean_t tcp_reserved_port_del(in_port_t, in_port_t); boolean_t tcp_reserved_port_check(in_port_t); static tcp_t *tcp_alloc_temp_tcp(in_port_t); static int tcp_reserved_port_list(queue_t *, mblk_t *, caddr_t, cred_t *); -static void tcp_timers_stop(tcp_t *); -static timeout_id_t tcp_timeout(conn_t *, void (*)(void *), clock_t); -static clock_t tcp_timeout_cancel(conn_t *, timeout_id_t); static mblk_t *tcp_mdt_info_mp(mblk_t *); static void tcp_mdt_update(tcp_t *, ill_mdt_capab_t *, boolean_t); static int tcp_mdt_add_attrs(multidata_t *, const mblk_t *, @@ -1098,7 +961,6 @@ static void tcp_kstat_init(void); static void tcp_kstat_fini(void); static int tcp_kstat_update(kstat_t *kp, int rw); void tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp); -conn_t *tcp_get_next_conn(connf_t *, conn_t *); static int tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, tcph_t *tcph, uint_t ipvers, mblk_t *idmp); static int tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha, @@ -1118,14 +980,6 @@ static mblk_t *tcp_zcopy_disable(tcp_t *, mblk_t *); static mblk_t *tcp_zcopy_backoff(tcp_t *, mblk_t *, int); static void tcp_ire_ill_check(tcp_t *, ire_t *, ill_t *, boolean_t); -static void tcp_fuse(tcp_t *, uchar_t *, tcph_t *); -static void tcp_unfuse(tcp_t *); -static boolean_t tcp_fuse_output(tcp_t *, mblk_t *); -static void 
tcp_fuse_output_urg(tcp_t *, mblk_t *); -static boolean_t tcp_fuse_rcv_drain(queue_t *, tcp_t *, mblk_t **); - -extern mblk_t *allocb_tryhard(size_t); - /* * Routines related to the TCP_IOC_ABORT_CONN ioctl command. * @@ -1155,17 +1009,12 @@ static void tcp_ioctl_abort_conn(queue_t *, mblk_t *); static int tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *, boolean_t); - -static void tcp_clrqfull(tcp_t *); -static void tcp_setqfull(tcp_t *); - static struct module_info tcp_rinfo = { -#define TCP_MODULE_ID 5105 - TCP_MODULE_ID, "tcp", 0, INFPSZ, TCP_RECV_HIWATER, TCP_RECV_LOWATER + TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, TCP_RECV_HIWATER, TCP_RECV_LOWATER }; static struct module_info tcp_winfo = { - TCP_MODULE_ID, "tcp", 0, INFPSZ, 127, 16 + TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, 127, 16 }; /* @@ -1173,11 +1022,12 @@ static struct module_info tcp_winfo = { * to pass through. */ struct qinit tcp_mod_rinit = { - (pfi_t)putnext, NULL, tcp_open, tcp_modclose, NULL, &tcp_rinfo + (pfi_t)putnext, NULL, tcp_open, ip_snmpmod_close, NULL, &tcp_rinfo, }; struct qinit tcp_mod_winit = { - (pfi_t)tcp_wput_mod, NULL, tcp_open, tcp_modclose, NULL, &tcp_rinfo + (pfi_t)ip_snmpmod_wput, NULL, tcp_open, ip_snmpmod_close, NULL, + &tcp_rinfo }; /* @@ -1210,11 +1060,18 @@ struct qinit tcp_acceptor_winit = { (pfi_t)tcp_wput_accept, NULL, NULL, NULL, NULL, &tcp_winfo }; +/* + * Entry points for TCP loopback (read side only) + */ +struct qinit tcp_loopback_rinit = { + (pfi_t)0, (pfi_t)tcp_rsrv, tcp_open, tcp_close, (pfi_t)0, + &tcp_rinfo, NULL, tcp_fuse_rrw, tcp_fuse_rinfop, STRUIOT_STANDARD +}; + struct streamtab tcpinfo = { &tcp_rinit, &tcp_winit }; - extern squeue_func_t tcp_squeue_wput_proc; extern squeue_func_t tcp_squeue_timer_proc; @@ -1306,15 +1163,6 @@ uint32_t tcp_reserved_port_array_size = 0; mib2_tcp_t tcp_mib; /* SNMP fixed size info */ kstat_t *tcp_mibkp; /* kstat exporting tcp_mib data */ -/* - * Object to represent database of options to search passed to - * {sock,tpi}optcom_req() interface routine to take care of option - * management and associated methods. 
- * XXX These and other externs should ideally move to a TCP header - */ -extern optdb_obj_t tcp_opt_obj; -extern uint_t tcp_max_optsize; - boolean_t tcp_icmp_source_quench = B_FALSE; /* * Following assumes TPI alignment requirements stay along 32 bit @@ -1454,76 +1302,6 @@ tcpparam_t tcp_param_arr[] = { }; /* END CSTYLED */ - -#define tcp_time_wait_interval tcp_param_arr[0].tcp_param_val -#define tcp_conn_req_max_q tcp_param_arr[1].tcp_param_val -#define tcp_conn_req_max_q0 tcp_param_arr[2].tcp_param_val -#define tcp_conn_req_min tcp_param_arr[3].tcp_param_val -#define tcp_conn_grace_period tcp_param_arr[4].tcp_param_val -#define tcp_cwnd_max_ tcp_param_arr[5].tcp_param_val -#define tcp_dbg tcp_param_arr[6].tcp_param_val -#define tcp_smallest_nonpriv_port tcp_param_arr[7].tcp_param_val -#define tcp_ip_abort_cinterval tcp_param_arr[8].tcp_param_val -#define tcp_ip_abort_linterval tcp_param_arr[9].tcp_param_val -#define tcp_ip_abort_interval tcp_param_arr[10].tcp_param_val -#define tcp_ip_notify_cinterval tcp_param_arr[11].tcp_param_val -#define tcp_ip_notify_interval tcp_param_arr[12].tcp_param_val -#define tcp_ipv4_ttl tcp_param_arr[13].tcp_param_val -#define tcp_keepalive_interval_high tcp_param_arr[14].tcp_param_max -#define tcp_keepalive_interval tcp_param_arr[14].tcp_param_val -#define tcp_keepalive_interval_low tcp_param_arr[14].tcp_param_min -#define tcp_maxpsz_multiplier tcp_param_arr[15].tcp_param_val -#define tcp_mss_def_ipv4 tcp_param_arr[16].tcp_param_val -#define tcp_mss_max_ipv4 tcp_param_arr[17].tcp_param_val -#define tcp_mss_min tcp_param_arr[18].tcp_param_val -#define tcp_naglim_def tcp_param_arr[19].tcp_param_val -#define tcp_rexmit_interval_initial tcp_param_arr[20].tcp_param_val -#define tcp_rexmit_interval_max tcp_param_arr[21].tcp_param_val -#define tcp_rexmit_interval_min tcp_param_arr[22].tcp_param_val -#define tcp_deferred_ack_interval tcp_param_arr[23].tcp_param_val -#define tcp_snd_lowat_fraction tcp_param_arr[24].tcp_param_val -#define tcp_sth_rcv_hiwat tcp_param_arr[25].tcp_param_val -#define tcp_sth_rcv_lowat tcp_param_arr[26].tcp_param_val -#define tcp_dupack_fast_retransmit tcp_param_arr[27].tcp_param_val -#define tcp_ignore_path_mtu tcp_param_arr[28].tcp_param_val -#define tcp_smallest_anon_port tcp_param_arr[29].tcp_param_val -#define tcp_largest_anon_port tcp_param_arr[30].tcp_param_val -#define tcp_xmit_hiwat tcp_param_arr[31].tcp_param_val -#define tcp_xmit_lowat tcp_param_arr[32].tcp_param_val -#define tcp_recv_hiwat tcp_param_arr[33].tcp_param_val -#define tcp_recv_hiwat_minmss tcp_param_arr[34].tcp_param_val -#define tcp_fin_wait_2_flush_interval tcp_param_arr[35].tcp_param_val -#define tcp_co_min tcp_param_arr[36].tcp_param_val -#define tcp_max_buf tcp_param_arr[37].tcp_param_val -#define tcp_strong_iss tcp_param_arr[38].tcp_param_val -#define tcp_rtt_updates tcp_param_arr[39].tcp_param_val -#define tcp_wscale_always tcp_param_arr[40].tcp_param_val -#define tcp_tstamp_always tcp_param_arr[41].tcp_param_val -#define tcp_tstamp_if_wscale tcp_param_arr[42].tcp_param_val -#define tcp_rexmit_interval_extra tcp_param_arr[43].tcp_param_val -#define tcp_deferred_acks_max tcp_param_arr[44].tcp_param_val -#define tcp_slow_start_after_idle tcp_param_arr[45].tcp_param_val -#define tcp_slow_start_initial tcp_param_arr[46].tcp_param_val -#define tcp_co_timer_interval tcp_param_arr[47].tcp_param_val -#define tcp_sack_permitted tcp_param_arr[48].tcp_param_val -#define tcp_trace tcp_param_arr[49].tcp_param_val -#define tcp_compression_enabled 
tcp_param_arr[50].tcp_param_val -#define tcp_ipv6_hoplimit tcp_param_arr[51].tcp_param_val -#define tcp_mss_def_ipv6 tcp_param_arr[52].tcp_param_val -#define tcp_mss_max_ipv6 tcp_param_arr[53].tcp_param_val -#define tcp_rev_src_routes tcp_param_arr[54].tcp_param_val -#define tcp_local_dack_interval tcp_param_arr[55].tcp_param_val -#define tcp_ndd_get_info_interval tcp_param_arr[56].tcp_param_val -#define tcp_local_dacks_max tcp_param_arr[57].tcp_param_val -#define tcp_ecn_permitted tcp_param_arr[58].tcp_param_val -#define tcp_rst_sent_rate_enabled tcp_param_arr[59].tcp_param_val -#define tcp_rst_sent_rate tcp_param_arr[60].tcp_param_val -#define tcp_push_timer_interval tcp_param_arr[61].tcp_param_val -#define tcp_use_smss_as_mss_opt tcp_param_arr[62].tcp_param_val -#define tcp_keepalive_abort_interval_high tcp_param_arr[63].tcp_param_max -#define tcp_keepalive_abort_interval tcp_param_arr[63].tcp_param_val -#define tcp_keepalive_abort_interval_low tcp_param_arr[63].tcp_param_min - /* * tcp_mdt_hdr_{head,tail}_min are the leading and trailing spaces of * each header fragment in the header buffer. Each parameter value has @@ -1720,642 +1498,6 @@ extern uint32_t (*cl_inet_ipident)(uint8_t protocol, sa_family_t addr_family, */ int cl_tcp_walk_list(int (*callback)(cl_tcp_info_t *, void *), void *arg); -#define IPH_TCPH_CHECKSUMP(ipha, hlen) \ - ((uint16_t *)(((uchar_t *)(ipha)) + ((hlen) + 16))) - -#ifdef _BIG_ENDIAN -#define IP_TCP_CSUM_COMP IPPROTO_TCP -#else -#define IP_TCP_CSUM_COMP (IPPROTO_TCP << 8) -#endif - -#define IP_HDR_CKSUM(ipha, sum, v_hlen_tos_len, ttl_protocol) { \ - (sum) += (ttl_protocol) + (ipha)->ipha_ident + \ - ((v_hlen_tos_len) >> 16) + \ - ((v_hlen_tos_len) & 0xFFFF) + \ - (ipha)->ipha_fragment_offset_and_flags; \ - (sum) = (((sum) & 0xFFFF) + ((sum) >> 16)); \ - (sum) = ~((sum) + ((sum) >> 16)); \ - (ipha)->ipha_hdr_checksum = (uint16_t)(sum); \ -} - -/* - * Macros that determine whether or not IP processing is needed for TCP. - */ -#define TCP_IPOPT_POLICY_V4(tcp) \ - ((tcp)->tcp_ipversion == IPV4_VERSION && \ - ((tcp)->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH || \ - CONN_OUTBOUND_POLICY_PRESENT((tcp)->tcp_connp) || \ - CONN_INBOUND_POLICY_PRESENT((tcp)->tcp_connp))) - -#define TCP_IPOPT_POLICY_V6(tcp) \ - ((tcp)->tcp_ipversion == IPV6_VERSION && \ - ((tcp)->tcp_ip_hdr_len != IPV6_HDR_LEN || \ - CONN_OUTBOUND_POLICY_PRESENT_V6((tcp)->tcp_connp) || \ - CONN_INBOUND_POLICY_PRESENT_V6((tcp)->tcp_connp))) - -#define TCP_LOOPBACK_IP(tcp) \ - (TCP_IPOPT_POLICY_V4(tcp) || TCP_IPOPT_POLICY_V6(tcp) || \ - !CONN_IS_MD_FASTPATH((tcp)->tcp_connp)) - -boolean_t do_tcp_fusion = B_TRUE; - -/* - * This routine gets called by the eager tcp upon changing state from - * SYN_RCVD to ESTABLISHED. It fuses a direct path between itself - * and the active connect tcp such that the regular tcp processings - * may be bypassed under allowable circumstances. Because the fusion - * requires both endpoints to be in the same squeue, it does not work - * for simultaneous active connects because there is no easy way to - * switch from one squeue to another once the connection is created. - * This is different from the eager tcp case where we assign it the - * same squeue as the one given to the active connect tcp during open. 
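The removed tcp_fuse() body that follows locates the other half of the loopback connection by searching for the endpoint whose address/port quadruplet is the exact mirror image of its own. A minimal userland model of that reversed-quadruplet match, with conn_ent_t and find_reversed_peer invented for the sketch:

    #include <stdio.h>
    #include <stdint.h>

    /* Simplified connection identity: local/foreign address and port. */
    typedef struct conn_ent {
        uint32_t laddr, faddr;
        uint16_t lport, fport;
    } conn_ent_t;

    /*
     * Return the index of the entry whose quadruplet is the reverse
     * of c's (its local side is c's foreign side and vice versa),
     * or -1 if no such peer exists.
     */
    static int
    find_reversed_peer(const conn_ent_t *tbl, int n, const conn_ent_t *c)
    {
        int i;

        for (i = 0; i < n; i++) {
            if (tbl[i].laddr == c->faddr && tbl[i].lport == c->fport &&
                tbl[i].faddr == c->laddr && tbl[i].fport == c->lport)
                return (i);
        }
        return (-1);
    }

    int
    main(void)
    {
        conn_ent_t tbl[] = {
            { 0x7f000001, 0x7f000001, 8080, 51515 }    /* server side */
        };
        conn_ent_t client = { 0x7f000001, 0x7f000001, 51515, 8080 };

        printf("peer index: %d\n", find_reversed_peer(tbl, 1, &client));
        return (0);
    }

The real lookup is a hash search via ipcl_conn_tcp_lookup_reversed_ipv4/_ipv6, and the code additionally requires the peer to share the eager's squeue before fusion is allowed.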
- */ -static void -tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph) -{ - conn_t *peer_connp, *connp = tcp->tcp_connp; - tcp_t *peer_tcp; - - ASSERT(!tcp->tcp_fused); - ASSERT(tcp->tcp_loopback); - ASSERT(tcp->tcp_loopback_peer == NULL); - /* - * We need to check the listener tcp to make sure it's a socket - * endpoint, but we can't really use tcp_listener since we get - * here after sending up T_CONN_IND and tcp_wput_accept() may be - * called independently, at which point tcp_listener is cleared; - * this is why we use tcp_saved_listener. The listener itself - * is guaranteed to be around until tcp_accept_finish() is called - * on this eager -- this won't happen until we're done since - * we're inside the eager's perimeter now. - */ - ASSERT(tcp->tcp_saved_listener != NULL); - - /* - * Lookup peer endpoint; search for the remote endpoint having - * the reversed address-port quadruplet in ESTABLISHED state, - * which is guaranteed to be unique in the system. Zone check - * is applied accordingly for loopback address, but not for - * local address since we want fusion to happen across Zones. - */ - if (tcp->tcp_ipversion == IPV4_VERSION) { - peer_connp = ipcl_conn_tcp_lookup_reversed_ipv4(connp, - (ipha_t *)iphdr, tcph); - } else { - peer_connp = ipcl_conn_tcp_lookup_reversed_ipv6(connp, - (ip6_t *)iphdr, tcph); - } - - /* - * We can only proceed if peer exists, resides in the same squeue - * as our conn and is not raw-socket. The squeue assignment of - * this eager tcp was done earlier at the time of SYN processing - * in ip_fanout_tcp{_v6}. Note that similar squeues by itself - * doesn't guarantee a safe condition to fuse, hence we perform - * additional tests below. - */ - ASSERT(peer_connp == NULL || peer_connp != connp); - if (peer_connp == NULL || peer_connp->conn_sqp != connp->conn_sqp || - !IPCL_IS_TCP(peer_connp)) { - if (peer_connp != NULL) { - TCP_STAT(tcp_fusion_unqualified); - CONN_DEC_REF(peer_connp); - } - return; - } - peer_tcp = peer_connp->conn_tcp; /* active connect tcp */ - - ASSERT(peer_tcp != NULL && peer_tcp != tcp && !peer_tcp->tcp_fused); - ASSERT(peer_tcp->tcp_loopback && peer_tcp->tcp_loopback_peer == NULL); - ASSERT(peer_connp->conn_sqp == connp->conn_sqp); - - /* - * Fuse the endpoints; we perform further checks against both - * tcp endpoints to ensure that a fusion is allowed to happen. - * In particular we bail out for TPI, non-simple TCP/IP or if - * IPsec/IPQoS policy exists. We could actually do it for the - * XTI/TLI/TPI case but this requires more testing, so for now - * we handle only the socket case. - */ - if (!tcp->tcp_unfusable && !peer_tcp->tcp_unfusable && - TCP_IS_SOCKET(tcp->tcp_saved_listener) && TCP_IS_SOCKET(peer_tcp) && - !TCP_LOOPBACK_IP(tcp) && !TCP_LOOPBACK_IP(peer_tcp) && - !IPP_ENABLED(IPP_LOCAL_OUT|IPP_LOCAL_IN)) { - mblk_t *mp; - struct stroptions *stropt; - queue_t *peer_rq = peer_tcp->tcp_rq; - size_t sth_hiwat; - - ASSERT(!TCP_IS_DETACHED(peer_tcp) && peer_rq != NULL); - - /* - * We need to drain data on both endpoints during unfuse. - * If we need to send up SIGURG at the time of draining, - * we want to be sure that an mblk is readily available. - * This is why we pre-allocate the M_PCSIG mblks for both - * endpoints which will only be used during/after unfuse. 
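The allocation sequence that follows grabs every mblk the later unfuse/drain path may need — two M_PCSIG blocks and one M_SETOPTS block — and unwinds completely on the first failure, so fusion is all-or-nothing. A sketch of the same acquire-all-or-unwind structure, with malloc standing in for allocb and the type names invented:

    #include <stdlib.h>

    /* Illustrative stand-ins for the three pre-allocated mblks. */
    struct fuse_res {
        void *sigurg_a;    /* M_PCSIG for one endpoint */
        void *sigurg_b;    /* M_PCSIG for the peer */
        void *setopts;     /* M_SETOPTS options block */
    };

    /* Acquire all three resources or none; unwind on first failure. */
    static int
    fuse_prealloc(struct fuse_res *r)
    {
        if ((r->sigurg_a = malloc(1)) == NULL)
            return (-1);
        if ((r->sigurg_b = malloc(1)) == NULL) {
            free(r->sigurg_a);
            r->sigurg_a = NULL;
            return (-1);
        }
        if ((r->setopts = malloc(64)) == NULL) {
            free(r->sigurg_a);
            free(r->sigurg_b);
            r->sigurg_a = r->sigurg_b = NULL;
            return (-1);
        }
        return (0);
    }

    int
    main(void)
    {
        struct fuse_res r = { NULL, NULL, NULL };

        return (fuse_prealloc(&r) == 0 ? 0 : 1);
    }

Failing early here is cheap; failing after the endpoints are fused would leave the urgent-data path with no way to raise SIGURG.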
- */ - if ((mp = allocb(1, BPRI_HI)) == NULL) { - CONN_DEC_REF(peer_connp); - return; - } - ASSERT(tcp->tcp_fused_sigurg_mp == NULL); - tcp->tcp_fused_sigurg_mp = mp; - - if ((mp = allocb(1, BPRI_HI)) == NULL) { - freeb(tcp->tcp_fused_sigurg_mp); - tcp->tcp_fused_sigurg_mp = NULL; - CONN_DEC_REF(peer_connp); - return; - } - ASSERT(peer_tcp->tcp_fused_sigurg_mp == NULL); - peer_tcp->tcp_fused_sigurg_mp = mp; - - /* Allocate M_SETOPTS mblk */ - mp = allocb(sizeof (*stropt), BPRI_HI); - if (mp == NULL) { - freeb(tcp->tcp_fused_sigurg_mp); - tcp->tcp_fused_sigurg_mp = NULL; - freeb(peer_tcp->tcp_fused_sigurg_mp); - peer_tcp->tcp_fused_sigurg_mp = NULL; - CONN_DEC_REF(peer_connp); - return; - } - - /* Fuse both endpoints */ - peer_tcp->tcp_loopback_peer = tcp; - tcp->tcp_loopback_peer = peer_tcp; - peer_tcp->tcp_fused = tcp->tcp_fused = B_TRUE; - - /* - * We never use regular tcp paths in fusion and should - * therefore clear tcp_unsent on both endpoints. Having - * them set to non-zero values means asking for trouble - * especially after unfuse, where we may end up sending - * through regular tcp paths which expect xmit_list and - * friends to be correctly setup. - */ - peer_tcp->tcp_unsent = tcp->tcp_unsent = 0; - - tcp_timers_stop(tcp); - tcp_timers_stop(peer_tcp); - - /* - * Set the stream head's write offset value to zero, since we - * won't be needing any room for TCP/IP headers, and tell it - * to not break up the writes. This would reduce the amount - * of work done by kmem. In addition, we set the receive - * buffer to twice that of q_hiwat in order to simulate the - * non-fusion case. Note that we can only do this for the - * active connect tcp since our eager is still detached; - * it will be dealt with later in tcp_accept_finish(). - */ - DB_TYPE(mp) = M_SETOPTS; - mp->b_wptr += sizeof (*stropt); - - sth_hiwat = peer_rq->q_hiwat << 1; - if (sth_hiwat > tcp_max_buf) - sth_hiwat = tcp_max_buf; - - stropt = (struct stroptions *)mp->b_rptr; - stropt->so_flags = SO_MAXBLK | SO_WROFF | SO_HIWAT; - stropt->so_maxblk = tcp_maxpsz_set(peer_tcp, B_FALSE); - stropt->so_wroff = 0; - stropt->so_hiwat = MAX(sth_hiwat, tcp_sth_rcv_hiwat); - - /* Send the options up */ - putnext(peer_rq, mp); - } else { - TCP_STAT(tcp_fusion_unqualified); - } - CONN_DEC_REF(peer_connp); -} - -/* - * Unfuse a previously-fused pair of tcp loopback endpoints. - */ -static void -tcp_unfuse(tcp_t *tcp) -{ - tcp_t *peer_tcp = tcp->tcp_loopback_peer; - - ASSERT(tcp->tcp_fused && peer_tcp != NULL); - ASSERT(peer_tcp->tcp_fused && peer_tcp->tcp_loopback_peer == tcp); - ASSERT(tcp->tcp_connp->conn_sqp == peer_tcp->tcp_connp->conn_sqp); - ASSERT(tcp->tcp_unsent == 0 && peer_tcp->tcp_unsent == 0); - ASSERT(tcp->tcp_fused_sigurg_mp != NULL); - ASSERT(peer_tcp->tcp_fused_sigurg_mp != NULL); - - /* - * Drain any pending data; the detached check is needed because - * we may be called from tcp_fuse_output(). Note that in case of - * a detached tcp, the draining will happen later after the tcp - * is unfused. For non-urgent data, this can be handled by the - * regular tcp_rcv_drain(). If we have urgent data sitting in - * the receive list, we will need to send up a SIGURG signal first - * before draining the data. All of these will be handled by the - * code in tcp_fuse_rcv_drain() when called from tcp_rcv_drain(). 
- */ - if (!TCP_IS_DETACHED(tcp)) { - (void) tcp_fuse_rcv_drain(tcp->tcp_rq, tcp, - &tcp->tcp_fused_sigurg_mp); - } - if (!TCP_IS_DETACHED(peer_tcp)) { - (void) tcp_fuse_rcv_drain(peer_tcp->tcp_rq, peer_tcp, - &peer_tcp->tcp_fused_sigurg_mp); - } - /* Lift up any flow-control conditions */ - if (tcp->tcp_flow_stopped) { - tcp_clrqfull(tcp); - tcp->tcp_flow_stopped = B_FALSE; - TCP_STAT(tcp_fusion_backenabled); - } - if (peer_tcp->tcp_flow_stopped) { - tcp_clrqfull(peer_tcp); - peer_tcp->tcp_flow_stopped = B_FALSE; - TCP_STAT(tcp_fusion_backenabled); - } - - /* Free up M_PCSIG mblk(s) if not needed */ - if (!tcp->tcp_fused_sigurg && tcp->tcp_fused_sigurg_mp != NULL) { - freeb(tcp->tcp_fused_sigurg_mp); - tcp->tcp_fused_sigurg_mp = NULL; - } - if (!peer_tcp->tcp_fused_sigurg && - peer_tcp->tcp_fused_sigurg_mp != NULL) { - freeb(peer_tcp->tcp_fused_sigurg_mp); - peer_tcp->tcp_fused_sigurg_mp = NULL; - } - - /* - * Update th_seq and th_ack in the header template - */ - U32_TO_ABE32(tcp->tcp_snxt, tcp->tcp_tcph->th_seq); - U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack); - U32_TO_ABE32(peer_tcp->tcp_snxt, peer_tcp->tcp_tcph->th_seq); - U32_TO_ABE32(peer_tcp->tcp_rnxt, peer_tcp->tcp_tcph->th_ack); - - /* Unfuse the endpoints */ - peer_tcp->tcp_fused = tcp->tcp_fused = B_FALSE; - peer_tcp->tcp_loopback_peer = tcp->tcp_loopback_peer = NULL; -} - -/* - * Fusion output routine for urgent data. This routine is called by - * tcp_fuse_output() for handling non-M_DATA mblks. - */ -static void -tcp_fuse_output_urg(tcp_t *tcp, mblk_t *mp) -{ - mblk_t *mp1; - struct T_exdata_ind *tei; - tcp_t *peer_tcp = tcp->tcp_loopback_peer; - mblk_t *head, *prev_head = NULL; - - ASSERT(tcp->tcp_fused); - ASSERT(peer_tcp != NULL && peer_tcp->tcp_loopback_peer == tcp); - ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); - ASSERT(mp->b_cont != NULL && DB_TYPE(mp->b_cont) == M_DATA); - ASSERT(MBLKL(mp) >= sizeof (*tei) && MBLKL(mp->b_cont) > 0); - - /* - * Urgent data arrives in the form of T_EXDATA_REQ from above. - * Each occurence denotes a new urgent pointer. For each new - * urgent pointer we signal (SIGURG) the receiving app to indicate - * that it needs to go into urgent mode. This is similar to the - * urgent data handling in the regular tcp. We don't need to keep - * track of where the urgent pointer is, because each T_EXDATA_REQ - * "advances" the urgent pointer for us. - * - * The actual urgent data carried by T_EXDATA_REQ is then prepended - * by a T_EXDATA_IND before being enqueued behind any existing data - * destined for the receiving app. There is only a single urgent - * pointer (out-of-band mark) for a given tcp. If the new urgent - * data arrives before the receiving app reads some existing urgent - * data, the previous marker is lost. This behavior is emulated - * accordingly below, by removing any existing T_EXDATA_IND messages - * and essentially converting old urgent data into non-urgent. - */ - ASSERT(tcp->tcp_valid_bits & TCP_URG_VALID); - /* Let sender get out of urgent mode */ - tcp->tcp_valid_bits &= ~TCP_URG_VALID; - - /* - * Send up SIGURG to the receiving peer; if the peer is detached - * or if we can't allocate the M_PCSIG, indicate that we need to - * signal upon draining to the peer by marking tcp_fused_sigurg. 
- * This flag will only get cleared once SIGURG is delivered and - * is not affected by the tcp_fused flag -- delivery will still - * happen even after an endpoint is unfused, to handle the case - * where the sending endpoint immediately closes/unfuses after - * sending urgent data and the accept is not yet finished. - */ - if (!TCP_IS_DETACHED(peer_tcp) && - ((mp1 = allocb(1, BPRI_HI)) != NULL || - (mp1 = allocb_tryhard(1)) != NULL)) { - peer_tcp->tcp_fused_sigurg = B_FALSE; - /* Send up the signal */ - DB_TYPE(mp1) = M_PCSIG; - *mp1->b_wptr++ = (uchar_t)SIGURG; - putnext(peer_tcp->tcp_rq, mp1); - } else { - peer_tcp->tcp_fused_sigurg = B_TRUE; - } - - /* Reuse T_EXDATA_REQ mblk for T_EXDATA_IND */ - DB_TYPE(mp) = M_PROTO; - tei = (struct T_exdata_ind *)mp->b_rptr; - tei->PRIM_type = T_EXDATA_IND; - tei->MORE_flag = 0; - mp->b_wptr = (uchar_t *)&tei[1]; - - TCP_STAT(tcp_fusion_urg); - BUMP_MIB(&tcp_mib, tcpOutUrg); - - head = peer_tcp->tcp_rcv_list; - while (head != NULL) { - /* - * Remove existing T_EXDATA_IND, keep the data which follows - * it and relink our list. Note that we don't modify the - * tcp_rcv_last_tail since it never points to T_EXDATA_IND. - */ - if (DB_TYPE(head) != M_DATA) { - mp1 = head; - - ASSERT(DB_TYPE(mp1->b_cont) == M_DATA); - head = mp1->b_cont; - mp1->b_cont = NULL; - head->b_next = mp1->b_next; - mp1->b_next = NULL; - if (prev_head != NULL) - prev_head->b_next = head; - if (peer_tcp->tcp_rcv_list == mp1) - peer_tcp->tcp_rcv_list = head; - if (peer_tcp->tcp_rcv_last_head == mp1) - peer_tcp->tcp_rcv_last_head = head; - freeb(mp1); - } - prev_head = head; - head = head->b_next; - } -} - -/* - * Fusion output routine, called by tcp_output() and tcp_wput_proto(). - */ -static boolean_t -tcp_fuse_output(tcp_t *tcp, mblk_t *mp) -{ - tcp_t *peer_tcp = tcp->tcp_loopback_peer; - queue_t *peer_rq; - mblk_t *mp_tail = mp; - uint32_t send_size = 0; - - ASSERT(tcp->tcp_fused); - ASSERT(peer_tcp != NULL && peer_tcp->tcp_loopback_peer == tcp); - ASSERT(tcp->tcp_connp->conn_sqp == peer_tcp->tcp_connp->conn_sqp); - ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_PROTO || - DB_TYPE(mp) == M_PCPROTO); - - peer_rq = peer_tcp->tcp_rq; - - /* If this connection requires IP, unfuse and use regular path */ - if (TCP_LOOPBACK_IP(tcp) || TCP_LOOPBACK_IP(peer_tcp) || - IPP_ENABLED(IPP_LOCAL_OUT|IPP_LOCAL_IN)) { - TCP_STAT(tcp_fusion_aborted); - tcp_unfuse(tcp); - return (B_FALSE); - } - - for (;;) { - if (DB_TYPE(mp_tail) == M_DATA) - send_size += MBLKL(mp_tail); - if (mp_tail->b_cont == NULL) - break; - mp_tail = mp_tail->b_cont; - } - - if (send_size == 0) { - freemsg(mp); - return (B_TRUE); - } - - /* - * Handle urgent data; we either send up SIGURG to the peer now - * or do it later when we drain, in case the peer is detached - * or if we're short of memory for M_PCSIG mblk. - */ - if (DB_TYPE(mp) != M_DATA) - tcp_fuse_output_urg(tcp, mp); - - /* - * Enqueue data into the peer's receive list; we may or may not - * drain the contents depending on the conditions below. - */ - tcp_rcv_enqueue(peer_tcp, mp, send_size); - - /* In case it wrapped around and also to keep it constant */ - peer_tcp->tcp_rwnd += send_size; - - /* - * If peer is detached, exercise flow-control when needed; we will - * get back-enabled either in tcp_accept_finish() or tcp_unfuse(). 
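The flow-control step described here is plain hysteresis: assert the queue-full condition once the peer's receive list climbs past its high-water mark, and lift it only from the drain, accept-finish, or unfuse paths. A self-contained model of that on/off behavior (the HIWAT value and function names are illustrative):

    #include <stdio.h>
    #include <stdint.h>

    #define HIWAT 49152    /* illustrative high-water mark */

    static int flow_stopped;
    static uint32_t rcv_cnt;

    /* Sender side: enqueue, and stop once past the high-water mark. */
    static void
    enqueue_bytes(uint32_t n)
    {
        rcv_cnt += n;
        if (!flow_stopped && rcv_cnt > HIWAT) {
            flow_stopped = 1;    /* models tcp_setqfull() */
            printf("flow stopped at %u bytes\n", (unsigned)rcv_cnt);
        }
    }

    /* Receiver side: drain everything and lift the condition. */
    static void
    drain(void)
    {
        rcv_cnt = 0;
        if (flow_stopped) {
            flow_stopped = 0;    /* models tcp_clrqfull() */
            printf("back-enabled\n");
        }
    }

    int
    main(void)
    {
        enqueue_bytes(32768);
        enqueue_bytes(32768);    /* crosses HIWAT, stops the sender */
        drain();                 /* back-enables, as unfuse does */
        return (0);
    }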
- */ - if (TCP_IS_DETACHED(peer_tcp) && - peer_tcp->tcp_rcv_cnt > peer_rq->q_hiwat) { - tcp_setqfull(tcp); - tcp->tcp_flow_stopped = B_TRUE; - TCP_STAT(tcp_fusion_flowctl); - } - - loopback_packets++; - tcp->tcp_last_sent_len = send_size; - - /* Need to adjust the following SNMP MIB-related variables */ - tcp->tcp_snxt += send_size; - tcp->tcp_suna = tcp->tcp_snxt; - peer_tcp->tcp_rnxt += send_size; - peer_tcp->tcp_rack = peer_tcp->tcp_rnxt; - - BUMP_MIB(&tcp_mib, tcpOutDataSegs); - UPDATE_MIB(&tcp_mib, tcpOutDataBytes, send_size); - - BUMP_MIB(&tcp_mib, tcpInSegs); - BUMP_MIB(&tcp_mib, tcpInDataInorderSegs); - UPDATE_MIB(&tcp_mib, tcpInDataInorderBytes, send_size); - - BUMP_LOCAL(tcp->tcp_obsegs); - BUMP_LOCAL(peer_tcp->tcp_ibsegs); - - if (!TCP_IS_DETACHED(peer_tcp)) { - /* - * If we can't send SIGURG above due to lack of memory, - * schedule push timer and try again. Otherwise drain - * the data if we're not flow-controlled. - */ - if (peer_tcp->tcp_fused_sigurg) { - if (peer_tcp->tcp_push_tid == 0) { - peer_tcp->tcp_push_tid = - TCP_TIMER(peer_tcp, tcp_push_timer, - MSEC_TO_TICK(tcp_push_timer_interval)); - } - } else if (!tcp->tcp_flow_stopped) { - if (!canputnext(peer_rq)) { - tcp_setqfull(tcp); - tcp->tcp_flow_stopped = B_TRUE; - TCP_STAT(tcp_fusion_flowctl); - } else { - ASSERT(peer_tcp->tcp_rcv_list != NULL); - (void) tcp_fuse_rcv_drain(peer_rq, - peer_tcp, NULL); - TCP_STAT(tcp_fusion_putnext); - } - } - } - return (B_TRUE); -} - -/* - * This routine gets called to deliver data upstream on a fused or - * previously fused tcp loopback endpoint; the latter happens only - * when there is a pending SIGURG signal plus urgent data that can't - * be sent upstream in the past. - */ -static boolean_t -tcp_fuse_rcv_drain(queue_t *q, tcp_t *tcp, mblk_t **sigurg_mpp) -{ - mblk_t *mp; -#ifdef DEBUG - uint_t cnt = 0; -#endif - - ASSERT(tcp->tcp_loopback); - ASSERT(tcp->tcp_fused || tcp->tcp_fused_sigurg); - ASSERT(!tcp->tcp_fused || tcp->tcp_loopback_peer != NULL); - ASSERT(sigurg_mpp != NULL || tcp->tcp_fused); - - /* No need for the push timer now, in case it was scheduled */ - if (tcp->tcp_push_tid != 0) { - (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid); - tcp->tcp_push_tid = 0; - } - /* - * If there's urgent data sitting in receive list and we didn't - * get a chance to send up a SIGURG signal, make sure we send - * it first before draining in order to ensure that SIOCATMARK - * works properly. - */ - if (tcp->tcp_fused_sigurg) { - /* - * sigurg_mpp is normally NULL, i.e. when we're still - * fused and didn't get here because of tcp_unfuse(). - * In this case try hard to allocate the M_PCSIG mblk. - */ - if (sigurg_mpp == NULL && - (mp = allocb(1, BPRI_HI)) == NULL && - (mp = allocb_tryhard(1)) == NULL) { - /* Alloc failed; try again next time */ - tcp->tcp_push_tid = TCP_TIMER(tcp, tcp_push_timer, - MSEC_TO_TICK(tcp_push_timer_interval)); - return (B_TRUE); - } else if (sigurg_mpp != NULL) { - /* - * Use the supplied M_PCSIG mblk; it means we're - * either unfused or in the process of unfusing, - * and the drain must happen now. - */ - mp = *sigurg_mpp; - *sigurg_mpp = NULL; - } - ASSERT(mp != NULL); - - tcp->tcp_fused_sigurg = B_FALSE; - /* Send up the signal */ - DB_TYPE(mp) = M_PCSIG; - *mp->b_wptr++ = (uchar_t)SIGURG; - putnext(q, mp); - /* - * Let the regular tcp_rcv_drain() path handle - * draining the data if we're no longer fused. 
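A point the urgent-data logic above relies on: TCP has exactly one out-of-band mark per connection, so each new T_EXDATA_REQ demotes any unread urgent data to ordinary data, and the SIGURG is delivered before the drain so SIOCATMARK remains accurate. A toy model of the new-mark-erases-old-mark rule (msg_t and push_urgent are invented names):

    #include <stdio.h>

    typedef struct msg {
        int is_mark;         /* models a T_EXDATA_IND header */
        struct msg *next;
    } msg_t;

    /*
     * Enqueue a new urgent message: clear the mark on anything
     * already queued (old urgent data becomes plain data), then
     * append the new message carrying the only mark.
     */
    static void
    push_urgent(msg_t **head, msg_t *m)
    {
        msg_t **pp;

        for (pp = head; *pp != NULL; pp = &(*pp)->next)
            (*pp)->is_mark = 0;
        m->is_mark = 1;
        *pp = m;
    }

    int
    main(void)
    {
        msg_t a = { 0, NULL }, b = { 0, NULL };
        msg_t *head = NULL;

        push_urgent(&head, &a);
        push_urgent(&head, &b);    /* a's mark is now gone */
        printf("a marked: %d, b marked: %d\n", a.is_mark, b.is_mark);
        return (0);
    }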
- */ - if (!tcp->tcp_fused) - return (B_FALSE); - } - - /* Drain the data */ - while ((mp = tcp->tcp_rcv_list) != NULL) { - tcp->tcp_rcv_list = mp->b_next; - mp->b_next = NULL; -#ifdef DEBUG - cnt += msgdsize(mp); -#endif - putnext(q, mp); - } - - ASSERT(cnt == tcp->tcp_rcv_cnt); - tcp->tcp_rcv_last_head = NULL; - tcp->tcp_rcv_last_tail = NULL; - tcp->tcp_rcv_cnt = 0; - tcp->tcp_rwnd = q->q_hiwat; - - return (B_TRUE); -} - -/* - * This is the walker function, which is TCP specific. - * It walks through the conn_hash bucket searching for the - * next valid connp/tcp in the list, selecting connp/tcp - * which haven't closed or condemned. It also REFHOLDS the - * reference for the tcp, ensuring that the tcp exists - * when the caller uses the tcp. - * - * tcp_get_next_conn - * get the next entry in the conn global list - * and put a reference on the next_conn. - * decrement the reference on the current conn. - */ -conn_t * -tcp_get_next_conn(connf_t *connfp, conn_t *connp) -{ - conn_t *next_connp; - - if (connfp == NULL) - return (NULL); - - mutex_enter(&connfp->connf_lock); - - next_connp = (connp == NULL) ? - connfp->connf_head : connp->conn_g_next; - - while (next_connp != NULL) { - mutex_enter(&next_connp->conn_lock); - if ((next_connp->conn_state_flags & - (CONN_CONDEMNED | CONN_INCIPIENT)) || - !IPCL_IS_TCP(next_connp)) { - /* - * This conn has been condemned or - * is closing. - */ - mutex_exit(&next_connp->conn_lock); - next_connp = next_connp->conn_g_next; - continue; - } - ASSERT(next_connp->conn_tcp != NULL); - CONN_INC_REF_LOCKED(next_connp); - mutex_exit(&next_connp->conn_lock); - break; - } - - mutex_exit(&connfp->connf_lock); - - if (connp != NULL) { - CONN_DEC_REF(connp); - } - - return (next_connp); -} - /* * Figure out the value of window scale opton. Note that the rwnd is * ASSUMED to be rounded up to the nearest MSS before the calculation. 
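The body of the window-scale calculation is not part of this hunk; for reference, the usual RFC 1323 arithmetic picks the smallest shift, capped at 14, that lets the desired receive window fit the 16-bit window field. A generic standalone version of that calculation (not the tcp.c function itself):

    #include <stdio.h>
    #include <stdint.h>

    #define TCP_MAX_WINSHIFT 14    /* RFC 1323 limit */

    /* Smallest shift that makes rwnd representable in 16 bits. */
    static int
    ws_value(uint32_t rwnd)
    {
        int ws = 0;

        while (ws < TCP_MAX_WINSHIFT && (rwnd >> ws) > 65535)
            ws++;
        return (ws);
    }

    int
    main(void)
    {
        printf("rwnd 49152  -> ws %d\n", ws_value(49152));     /* 0 */
        printf("rwnd 262144 -> ws %d\n", ws_value(262144));    /* 3 */
        return (0);
    }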
@@ -2808,7 +1950,7 @@ tcp_accept(tcp_t *listener, mblk_t *mp) acceptor = tcp_acceptor_hash_lookup(acceptor_id); if (acceptor == NULL) { if (listener->tcp_debug) { - (void) strlog(TCP_MODULE_ID, 0, 1, + (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_accept: did not find acceptor 0x%x\n", acceptor_id); @@ -3737,7 +2879,7 @@ tcp_bind(tcp_t *tcp, mblk_t *mp) ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { if (tcp->tcp_debug) { - (void) strlog(TCP_MODULE_ID, 0, 1, SL_ERROR|SL_TRACE, + (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: bad req, len %u", (uint_t)(mp->b_wptr - mp->b_rptr)); } @@ -3768,7 +2910,7 @@ tcp_bind(tcp_t *tcp, mblk_t *mp) goto do_bind; } if (tcp->tcp_debug) { - (void) strlog(TCP_MODULE_ID, 0, 1, SL_ERROR|SL_TRACE, + (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: bad state, %d", tcp->tcp_state); } tcp_err_ack(tcp, mp, TOUTSTATE, 0); @@ -3805,7 +2947,7 @@ tcp_bind(tcp_t *tcp, mblk_t *mp) sizeof (sin_t)); if (sin == NULL || !OK_32PTR((char *)sin)) { if (tcp->tcp_debug) { - (void) strlog(TCP_MODULE_ID, 0, 1, + (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: bad address parameter, " "offset %d, len %d", @@ -3835,7 +2977,7 @@ tcp_bind(tcp_t *tcp, mblk_t *mp) tbr->ADDR_offset, sizeof (sin6_t)); if (sin6 == NULL || !OK_32PTR((char *)sin6)) { if (tcp->tcp_debug) { - (void) strlog(TCP_MODULE_ID, 0, 1, + (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: bad IPv6 address parameter, " "offset %d, len %d", tbr->ADDR_offset, @@ -3857,7 +2999,7 @@ tcp_bind(tcp_t *tcp, mblk_t *mp) default: if (tcp->tcp_debug) { - (void) strlog(TCP_MODULE_ID, 0, 1, SL_ERROR|SL_TRACE, + (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: bad address length, %d", tbr->ADDR_length); } @@ -3945,7 +3087,7 @@ tcp_bind(tcp_t *tcp, mblk_t *mp) if (secpolicy_net_privaddr(cr, requested_port) != 0) { if (tcp->tcp_debug) { - (void) strlog(TCP_MODULE_ID, 0, 1, + (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: no priv for port %d", requested_port); @@ -3963,7 +3105,7 @@ tcp_bind(tcp_t *tcp, mblk_t *mp) if (allocated_port == 0) { if (bind_to_req_port_only) { if (tcp->tcp_debug) { - (void) strlog(TCP_MODULE_ID, 0, 1, + (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: requested addr busy"); } @@ -3971,7 +3113,7 @@ tcp_bind(tcp_t *tcp, mblk_t *mp) } else { /* If we are out of ports, fail the bind. 
*/ if (tcp->tcp_debug) { - (void) strlog(TCP_MODULE_ID, 0, 1, + (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: out of ports?"); } @@ -4436,7 +3578,7 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) (void) putnextctl1(q, M_FLUSH, FLUSHR); } if (tcp->tcp_debug) { - (void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE|SL_ERROR, + (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, "tcp_clean_death: discon err %d", err); } mp = mi_tpi_discon_ind(NULL, err, 0); @@ -4444,7 +3586,7 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) putnext(q, mp); } else { if (tcp->tcp_debug) { - (void) strlog(TCP_MODULE_ID, 0, 1, + (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_clean_death, sending M_ERROR"); } @@ -4476,7 +3618,6 @@ tcp_stop_lingering(tcp_t *tcp) if (tcp->tcp_state > TCPS_LISTEN) { tcp_acceptor_hash_remove(tcp); if (tcp->tcp_flow_stopped) { - tcp->tcp_flow_stopped = B_FALSE; tcp_clrqfull(tcp); } @@ -4621,23 +3762,6 @@ tcp_close(queue_t *q, int flags) return (0); } -int -tcp_modclose(queue_t *q) -{ - conn_t *connp = Q_TO_CONN(q); - ASSERT((connp->conn_flags & IPCL_TCPMOD) != 0); - - qprocsoff(q); - - if (connp->conn_cred != NULL) { - crfree(connp->conn_cred); - connp->conn_cred = NULL; - } - CONN_DEC_REF(connp); - q->q_ptr = WR(q)->q_ptr = NULL; - return (0); -} - static int tcpclose_accept(queue_t *q) { @@ -4798,7 +3922,6 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2) tcp_acceptor_hash_remove(tcp); if (tcp->tcp_flow_stopped) { - tcp->tcp_flow_stopped = B_FALSE; tcp_clrqfull(tcp); } @@ -4922,7 +4045,7 @@ tcp_close_detached(tcp_t *tcp) /* * Stop all TCP timers, and free the timer mblks if requested. */ -static void +void tcp_timers_stop(tcp_t *tcp) { if (tcp->tcp_timer_tid != 0) { @@ -5285,7 +4408,7 @@ tcp_drop_q0(tcp_t *tcp) return (B_FALSE); if (tcp->tcp_debug) { - (void) strlog(TCP_MODULE_ID, 0, 3, SL_TRACE, + (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE, "tcp_drop_q0: listen half-open queue (max=%d) overflow" " (%d pending) on %s, drop one", tcp_conn_req_max_q0, tcp->tcp_conn_req_cnt_q0, @@ -5371,8 +4494,8 @@ tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, connp->conn_remv6 = ip6h->ip6_src; /* db_cksumstuff is set at ip_fanout_tcp_v6 */ - ifindex = (int)mp->b_datap->db_cksumstuff; - mp->b_datap->db_cksumstuff = 0; + ifindex = (int)DB_CKSUMSTUFF(mp); + DB_CKSUMSTUFF(mp) = 0; sin6 = sin6_null; sin6.sin6_addr = ip6h->ip6_src; @@ -5727,8 +4850,8 @@ tcp_get_ipsec_conn(tcp_t *tcp, squeue_t *sqp, mblk_t **mpp) mp->b_datap->db_struioflag &= ~STRUIO_POLICY; } - new_sqp = (squeue_t *)mp->b_datap->db_cksumstart; - mp->b_datap->db_cksumstart = 0; + new_sqp = (squeue_t *)DB_CKSUMSTART(mp); + DB_CKSUMSTART(mp) = 0; ASSERT(OK_32PTR(mp->b_rptr)); ipvers = IPH_HDR_VERSION(mp->b_rptr); @@ -6012,7 +5135,7 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) TCP_STAT(tcp_listendrop); BUMP_MIB(&tcp_mib, tcpListenDrop); if (tcp->tcp_debug) { - (void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE|SL_ERROR, + (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, "tcp_conn_request: listen backlog (max=%d) " "overflow (%d pending) on %s", tcp->tcp_conn_req_max, tcp->tcp_conn_req_cnt_q, @@ -6037,7 +5160,7 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) mutex_exit(&tcp->tcp_eager_lock); BUMP_MIB(&tcp_mib, tcpListenDropQ0); if (tcp->tcp_debug) { - (void) strlog(TCP_MODULE_ID, 0, 3, SL_TRACE, + (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE, "tcp_conn_request: listen half-open queue " "(max=%d) full (%d pending) on %s", tcp_conn_req_max_q0, @@ -6058,8 +5181,8 @@ tcp_conn_request(void *arg, 
mblk_t *mp, void *arg2) * otherwise an error case if neither of them is set. */ if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) { - new_sqp = (squeue_t *)mp->b_datap->db_cksumstart; - mp->b_datap->db_cksumstart = 0; + new_sqp = (squeue_t *)DB_CKSUMSTART(mp); + DB_CKSUMSTART(mp) = 0; mp->b_datap->db_struioflag &= ~STRUIO_EAGER; econnp = (conn_t *)tcp_get_conn(arg2); if (econnp == NULL) @@ -6420,7 +5543,7 @@ tcp_conn_request_unbound(void *arg, mblk_t *mp, void *arg2) uint32_t conn_flags; if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) { - new_sqp = (squeue_t *)mp->b_datap->db_cksumstart; + new_sqp = (squeue_t *)DB_CKSUMSTART(mp); } else { goto done; } @@ -7174,7 +6297,7 @@ tcp_disconnect(tcp_t *tcp, mblk_t *mp) */ if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_hard_binding) { if (tcp->tcp_debug) { - (void) strlog(TCP_MODULE_ID, 0, 1, SL_ERROR|SL_TRACE, + (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_disconnect: bad state, %d", tcp->tcp_state); } tcp_err_ack(tcp, mp, TOUTSTATE, 0); @@ -7988,10 +7111,6 @@ tcp_reinit(tcp_t *tcp) /* Cancel outstanding timers */ tcp_timers_stop(tcp); - if (tcp->tcp_flow_stopped) { - tcp->tcp_flow_stopped = B_FALSE; - tcp_clrqfull(tcp); - } /* * Reset everything in the state vector, after updating global * MIB data from instance counters. @@ -8006,6 +7125,10 @@ tcp_reinit(tcp_t *tcp) tcp_zcopy_notify(tcp); tcp->tcp_xmit_last = tcp->tcp_xmit_tail = NULL; tcp->tcp_unsent = tcp->tcp_xmit_tail_unsent = 0; + if (tcp->tcp_flow_stopped && + TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) { + tcp_clrqfull(tcp); + } tcp_close_mpp(&tcp->tcp_reass_head); tcp->tcp_reass_tail = NULL; if (tcp->tcp_rcv_list != NULL) { @@ -8193,7 +7316,6 @@ tcp_reinit_values(tcp) tcp->tcp_fin_sent = 0; tcp->tcp_ordrel_done = 0; - ASSERT(tcp->tcp_flow_stopped == 0); tcp->tcp_debug = 0; tcp->tcp_dontroute = 0; tcp->tcp_broadcast = 0; @@ -8390,14 +7512,22 @@ tcp_reinit_values(tcp) ASSERT(tcp->tcp_rthdrlen == 0); PRESERVE(tcp->tcp_drop_opt_ack_cnt); + /* Reset fusion-related fields */ tcp->tcp_fused = B_FALSE; tcp->tcp_unfusable = B_FALSE; tcp->tcp_fused_sigurg = B_FALSE; + tcp->tcp_direct_sockfs = B_FALSE; + tcp->tcp_fuse_syncstr_stopped = B_FALSE; tcp->tcp_loopback_peer = NULL; + tcp->tcp_fuse_rcv_hiwater = 0; + tcp->tcp_fuse_rcv_unread_hiwater = 0; + tcp->tcp_fuse_rcv_unread_cnt = 0; tcp->tcp_in_ack_unsent = 0; tcp->tcp_cork = B_FALSE; + PRESERVE(tcp->tcp_squeue_bytes); + #undef DONTCARE #undef PRESERVE } @@ -8469,10 +7599,16 @@ tcp_init_values(tcp_t *tcp) tcp->tcp_mdt_hdr_head = 0; tcp->tcp_mdt_hdr_tail = 0; + /* Reset fusion-related fields */ tcp->tcp_fused = B_FALSE; tcp->tcp_unfusable = B_FALSE; tcp->tcp_fused_sigurg = B_FALSE; + tcp->tcp_direct_sockfs = B_FALSE; + tcp->tcp_fuse_syncstr_stopped = B_FALSE; tcp->tcp_loopback_peer = NULL; + tcp->tcp_fuse_rcv_hiwater = 0; + tcp->tcp_fuse_rcv_unread_hiwater = 0; + tcp->tcp_fuse_rcv_unread_cnt = 0; /* Initialize the header template */ if (tcp->tcp_ipversion == IPV4_VERSION) { @@ -9505,7 +8641,7 @@ tcp_keepalive_killer(void *arg) MSEC_TO_TICK(firetime)); } -static int +int tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk) { queue_t *q = tcp->tcp_rq; @@ -9515,7 +8651,10 @@ tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk) if (TCP_IS_DETACHED(tcp)) return (mss); - if (tcp->tcp_mdt || tcp->tcp_maxpsz == 0) { + if (tcp->tcp_fused) { + maxpsz = tcp_fuse_maxpsz_set(tcp); + mss = INFPSZ; + } else if (tcp->tcp_mdt || tcp->tcp_maxpsz == 0) { /* * Set the sd_qn_maxpsz according to the socket send buffer * size, and sd_maxblk to INFPSZ (-1). 
This will essentially @@ -9545,9 +8684,6 @@ tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk) if (set_maxblk) (void) mi_set_sth_maxblk(q, mss); - if (tcp->tcp_loopback) - (void) mi_set_sth_copyopt(tcp->tcp_rq, COPYCACHED); - return (mss); } @@ -9868,7 +9004,6 @@ tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) */ connp->conn_flags |= IPCL_SOCKET; tcp->tcp_issocket = 1; - WR(q)->q_qinfo = &tcp_sock_winit; } else { #ifdef _ILP32 @@ -10452,32 +9587,45 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name, if (!checkonly) tcp->tcp_dgram_errind = onoff; break; - case SO_SNDBUF: + case SO_SNDBUF: { + tcp_t *peer_tcp; + if (*i1 > tcp_max_buf) { *outlenp = 0; return (ENOBUFS); } - if (!checkonly) { - tcp->tcp_xmit_hiwater = *i1; - if (tcp_snd_lowat_fraction != 0) - tcp->tcp_xmit_lowater = - tcp->tcp_xmit_hiwater / - tcp_snd_lowat_fraction; - (void) tcp_maxpsz_set(tcp, B_TRUE); - /* - * If we are flow-controlled, recheck the - * condition. There are apps that increase - * SO_SNDBUF size when flow-controlled - * (EWOULDBLOCK), and expect the flow control - * condition to be lifted right away. - */ - if (tcp->tcp_flow_stopped && - tcp->tcp_unsent < tcp->tcp_xmit_hiwater) { - tcp->tcp_flow_stopped = B_FALSE; - tcp_clrqfull(tcp); - } + if (checkonly) + break; + + tcp->tcp_xmit_hiwater = *i1; + if (tcp_snd_lowat_fraction != 0) + tcp->tcp_xmit_lowater = + tcp->tcp_xmit_hiwater / + tcp_snd_lowat_fraction; + (void) tcp_maxpsz_set(tcp, B_TRUE); + /* + * If we are flow-controlled, recheck the condition. + * There are apps that increase SO_SNDBUF size when + * flow-controlled (EWOULDBLOCK), and expect the flow + * control condition to be lifted right away. + * + * For the fused tcp loopback case, in order to avoid + * a race with the peer's tcp_fuse_rrw() we need to + * hold its fuse_lock while accessing tcp_flow_stopped. + */ + peer_tcp = tcp->tcp_loopback_peer; + ASSERT(!tcp->tcp_fused || peer_tcp != NULL); + if (tcp->tcp_fused) + mutex_enter(&peer_tcp->tcp_fuse_lock); + + if (tcp->tcp_flow_stopped && + TCP_UNSENT_BYTES(tcp) < tcp->tcp_xmit_hiwater) { + tcp_clrqfull(tcp); } + if (tcp->tcp_fused) + mutex_exit(&peer_tcp->tcp_fuse_lock); break; + } case SO_RCVBUF: if (*i1 > tcp_max_buf) { *outlenp = 0; @@ -11892,7 +11040,7 @@ tcp_rcv_drain(queue_t *q, tcp_t *tcp) * M_DATA messages are added to the current element. * Other messages are added as new (b_next) elements. 
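The enqueue rule stated above gives the receive list two dimensions: M_DATA joins the current message through b_cont, while anything else starts a new list element chained through b_next. A small model of that append logic, with blk_t standing in for mblk_t and a flag in place of DB_TYPE:

    #include <stdio.h>

    typedef struct blk {
        int data;              /* nonzero models M_DATA */
        struct blk *b_cont;    /* same message, next fragment */
        struct blk *b_next;    /* next message on the list */
    } blk_t;

    static blk_t *rcv_list, *rcv_last_head, *rcv_last_tail;

    /*
     * Append mp: M_DATA joins the current message via b_cont;
     * anything else becomes a new element via b_next.
     */
    static void
    rcv_enqueue(blk_t *mp)
    {
        if (rcv_list == NULL) {
            rcv_list = rcv_last_head = mp;
        } else if (mp->data && rcv_last_tail->data) {
            rcv_last_tail->b_cont = mp;
        } else {
            rcv_last_head->b_next = mp;
            rcv_last_head = mp;
        }
        while (mp->b_cont != NULL)    /* track the new tail */
            mp = mp->b_cont;
        rcv_last_tail = mp;
    }

    int
    main(void)
    {
        blk_t d1 = { 1, NULL, NULL }, d2 = { 1, NULL, NULL };
        blk_t proto = { 0, NULL, NULL };

        rcv_enqueue(&d1);
        rcv_enqueue(&d2);       /* joins d1 via b_cont */
        rcv_enqueue(&proto);    /* new element via b_next */
        printf("d1.b_cont==&d2: %d, d1.b_next==&proto: %d\n",
            d1.b_cont == &d2, d1.b_next == &proto);
        return (0);
    }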
*/ -static void +void tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len) { ASSERT(seg_len == msgdsize(mp)); @@ -12380,7 +11528,7 @@ tcp_check_policy(tcp_t *tcp, mblk_t *first_mp, ipha_t *ipha, ip6_t *ip6h, BUMP_MIB(&ip_mib, ipsecInSucceeded); return (B_TRUE); } - (void) strlog(TCP_MODULE_ID, 0, 0, SL_ERROR|SL_WARN|SL_CONSOLE, + (void) strlog(TCP_MOD_ID, 0, 0, SL_ERROR|SL_WARN|SL_CONSOLE, "tcp inbound policy mismatch: %s, packet dropped\n", reason); BUMP_MIB(&ip_mib, ipsecInFailed); @@ -13469,7 +12617,7 @@ try_again:; */ seg_len -= gap; if (tcp->tcp_debug) { - (void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE, + (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, "tcp_rput: unacceptable, gap %d, rgap %d, " "flags 0x%x, seg_seq %u, seg_ack %u, " "seg_len %d, rnxt %u, snxt %u, %s", @@ -13873,7 +13021,7 @@ ok:; tcp->tcp_urp_mark_mp = mp1; flags |= TH_SEND_URP_MARK; #ifdef DEBUG - (void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE, + (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, "tcp_rput: sent M_PCSIG 2 seq %x urp %x " "last %x, %s", seg_seq, urp, tcp->tcp_urp_last, @@ -14012,7 +13160,7 @@ ok:; mp1->b_wptr = (uchar_t *)&tei[1]; tcp->tcp_urp_mp = mp1; #ifdef DEBUG - (void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE, + (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, "tcp_rput: allocated exdata_ind %s", tcp_display(tcp, NULL, DISP_PORT_ONLY)); @@ -14059,7 +13207,7 @@ ok:; tcp->tcp_urp_mark_mp->b_flag |= MSGMARKNEXT; } #ifdef DEBUG - (void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE, + (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, "tcp_rput: AT MARK, len %d, flags 0x%x, %s", seg_len, flags, tcp_display(tcp, NULL, DISP_PORT_ONLY)); @@ -14067,7 +13215,7 @@ ok:; } else { /* Data left until we hit mark */ #ifdef DEBUG - (void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE, + (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, "tcp_rput: URP %d bytes left, %s", urp - seg_len, tcp_display(tcp, NULL, DISP_PORT_ONLY)); @@ -14990,7 +14138,7 @@ est: /* Ready for a new signal. */ tcp->tcp_urp_last_valid = B_FALSE; #ifdef DEBUG - (void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE, + (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, "tcp_rput: sending exdata_ind %s", tcp_display(tcp, NULL, DISP_PORT_ONLY)); #endif /* DEBUG */ @@ -15026,7 +14174,7 @@ est: tcp->tcp_fused_sigurg); if (flags & TH_MARKNEXT_NEEDED) { #ifdef DEBUG - (void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE, + (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, "tcp_rput: sending MSGMARKNEXT %s", tcp_display(tcp, NULL, DISP_PORT_ONLY)); @@ -15167,7 +14315,7 @@ ack_check: mp1 = tcp->tcp_urp_mark_mp; tcp->tcp_urp_mark_mp = NULL; #ifdef DEBUG - (void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE, + (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, "tcp_rput: sending zero-length %s %s", ((mp1->b_flag & MSGMARKNEXT) ? "MSGMARKNEXT" : "MSGNOTMARKNEXT"), @@ -15853,7 +15001,7 @@ tcp_rput_other(tcp_t *tcp, mblk_t *mp) return; case T_ERROR_ACK: if (tcp->tcp_debug) { - (void) strlog(TCP_MODULE_ID, 0, 1, + (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, "tcp_rput_other: case T_ERROR_ACK, " "ERROR_prim == %d", @@ -15984,11 +15132,20 @@ tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2) ASSERT(tcp->tcp_connp->conn_sqp == peer_tcp->tcp_connp->conn_sqp); + /* + * Normally we would not get backenabled in synchronous + * streams mode, but in case this happens, we need to stop + * synchronous streams temporarily to prevent a race with + * tcp_fuse_rrw() or tcp_fuse_rinfop(). It is safe to access + * tcp_rcv_list here because those entry points will return + * right away when synchronous streams is stopped. 
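The TCP_FUSE_SYNCSTR_STOP/RESUME bracket that follows is a guard pattern: turn the lock-free fast path off so it bails out immediately, touch the shared receive list, then turn it back on. A deliberately single-threaded sketch of that shape — a real kernel pairs the flag with locking and memory ordering that this omits:

    #include <stdio.h>

    static int syncstr_stopped;     /* models tcp_fuse_syncstr_stopped */
    static int rcv_bytes = 1024;    /* models queued tcp_rcv_list data */

    /* Fast path (models tcp_fuse_rrw): back out while stopped. */
    static int
    fast_read(void)
    {
        if (syncstr_stopped)
            return (-1);    /* caller falls back to the STREAMS path */
        return (rcv_bytes);
    }

    /* Slow path: stop the fast path, drain safely, then resume. */
    static void
    backenable_drain(void)
    {
        syncstr_stopped = 1;
        rcv_bytes = 0;          /* safe: fast path is out of the way */
        syncstr_stopped = 0;
    }

    int
    main(void)
    {
        backenable_drain();
        printf("fast_read after drain: %d\n", fast_read());
        return (0);
    }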
+ */ + TCP_FUSE_SYNCSTR_STOP(tcp); if (tcp->tcp_rcv_list != NULL) (void) tcp_rcv_drain(tcp->tcp_rq, tcp); tcp_clrqfull(peer_tcp); - peer_tcp->tcp_flow_stopped = B_FALSE; + TCP_FUSE_SYNCSTR_RESUME(tcp); TCP_STAT(tcp_fusion_backenabled); return; } @@ -16118,6 +15275,30 @@ tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd) uint32_t max_transmittable_rwnd; boolean_t tcp_detached = TCP_IS_DETACHED(tcp); + if (tcp->tcp_fused) { + size_t sth_hiwat; + tcp_t *peer_tcp = tcp->tcp_loopback_peer; + + ASSERT(peer_tcp != NULL); + /* + * Record the stream head's high water mark for + * this endpoint; this is used for flow-control + * purposes in tcp_fuse_output(). + */ + sth_hiwat = tcp_fuse_set_rcv_hiwat(tcp, rwnd); + if (!tcp_detached) + (void) mi_set_sth_hiwat(tcp->tcp_rq, sth_hiwat); + + /* + * In the fusion case, the maxpsz stream head value of + * our peer is set according to its send buffer size + * and our receive buffer size; since the latter may + * have changed we need to update the peer's maxpsz. + */ + (void) tcp_maxpsz_set(peer_tcp, B_TRUE); + return (rwnd); + } + if (tcp_detached) old_max_rwnd = tcp->tcp_rwnd; else @@ -16196,23 +15377,16 @@ tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd) * Set the Stream head high water mark. This doesn't have to be * here, since we are simply using default values, but we would * prefer to choose these values algorithmically, with a likely - * relationship to rwnd. For fused loopback tcp, we double the - * amount of buffer in order to simulate the normal tcp case. + * relationship to rwnd. */ - if (tcp->tcp_fused) { - (void) mi_set_sth_hiwat(tcp->tcp_rq, MAX(rwnd << 1, - tcp_sth_rcv_hiwat)); - } else { - (void) mi_set_sth_hiwat(tcp->tcp_rq, MAX(rwnd, - tcp_sth_rcv_hiwat)); - } + (void) mi_set_sth_hiwat(tcp->tcp_rq, MAX(rwnd, tcp_sth_rcv_hiwat)); return (rwnd); } /* * Return SNMP stuff in buffer in mpdata. 
*/ -static int +int tcp_snmp_get(queue_t *q, mblk_t *mpctl) { mblk_t *mpdata; @@ -16261,7 +15435,8 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl) connp = NULL; - while ((connp = tcp_get_next_conn(connfp, connp))) { + while ((connp = + ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { tcp_t *tcp; if (connp->conn_zoneid != zoneid) @@ -16406,7 +15581,7 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl) /* Return 0 if invalid set request, 1 otherwise, including non-tcp requests */ /* ARGSUSED */ -static int +int tcp_snmp_set(queue_t *q, int level, int name, uchar_t *ptr, int len) { mib2_tcpConnEntry_t *tce = (mib2_tcpConnEntry_t *)ptr; @@ -16627,7 +15802,8 @@ tcp_status_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) connp = NULL; - while ((connp = tcp_get_next_conn(connfp, connp))) { + while ((connp = + ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { tcp = connp->conn_tcp; if (zoneid != GLOBAL_ZONEID && zoneid != connp->conn_zoneid) @@ -16723,7 +15899,8 @@ tcp_listen_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) for (i = 0; i < ipcl_bind_fanout_size; i++) { connfp = &ipcl_bind_fanout[i]; connp = NULL; - while ((connp = tcp_get_next_conn(connfp, connp))) { + while ((connp = + ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { tcp = connp->conn_tcp; if (zoneid != GLOBAL_ZONEID && zoneid != connp->conn_zoneid) @@ -16770,7 +15947,8 @@ tcp_conn_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) for (i = 0; i < ipcl_conn_fanout_size; i++) { connfp = &ipcl_conn_fanout[i]; connp = NULL; - while ((connp = tcp_get_next_conn(connfp, connp))) { + while ((connp = + ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { tcp = connp->conn_tcp; if (zoneid != GLOBAL_ZONEID && zoneid != connp->conn_zoneid) @@ -16927,7 +16105,7 @@ tcp_timer(void *arg) */ if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) { if (tcp->tcp_debug) { - (void) strlog(TCP_MODULE_ID, 0, 1, + (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, "tcp_timer: zero win"); } } else { @@ -17040,7 +16218,7 @@ tcp_timer(void *arg) return; default: if (tcp->tcp_debug) { - (void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE|SL_ERROR, + (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, "tcp_timer: strange state (%d) %s", tcp->tcp_state, tcp_display(tcp, NULL, DISP_PORT_ONLY)); @@ -17372,52 +16550,6 @@ tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2) } /* - * Write side put procedure for TCP module instance. - * TCP as a module is only used for MIB browsers that push TCP over IP or - * ARP. The only supported primitives are T_SVR4_OPTMGMT_REQ and - * T_OPTMGMT_REQ. M_FLUSH messages are only passed downstream; we don't flush - * our queues as we never enqueue messages there. All ioctls are NAKed and - * everything else is freed. - */ -static void -tcp_wput_mod(queue_t *q, mblk_t *mp) -{ - switch (DB_TYPE(mp)) { - case M_PROTO: - case M_PCPROTO: - if ((MBLKL(mp) >= sizeof (t_scalar_t)) && - ((((union T_primitives *)mp->b_rptr)->type == - T_SVR4_OPTMGMT_REQ) || - (((union T_primitives *)mp->b_rptr)->type == - T_OPTMGMT_REQ))) { - /* - * This is the only TPI primitive supported. Its - * handling does not require tcp_t, but it does require - * conn_t to check permissions. 
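The ipcl_get_next_conn() calls in the surrounding hunks (replacing the TCP-private tcp_get_next_conn removed earlier) use the standard walker protocol: take a reference on the next live entry before dropping the reference on the current one, skipping condemned entries, so the walk survives concurrent removals. A simplified single-threaded model with invented node_t/get_next names and the bucket lock omitted:

    #include <stdio.h>

    typedef struct node {
        int refcnt;
        int condemned;    /* models CONN_CONDEMNED */
        struct node *next;
    } node_t;

    /*
     * Return the next live node with a new reference held,
     * releasing the caller's reference on cur.  NULL ends the walk.
     */
    static node_t *
    get_next(node_t *head, node_t *cur)
    {
        node_t *n = (cur == NULL) ? head : cur->next;

        while (n != NULL && n->condemned)
            n = n->next;
        if (n != NULL)
            n->refcnt++;    /* hold next before releasing cur */
        if (cur != NULL)
            cur->refcnt--;
        return (n);
    }

    int
    main(void)
    {
        node_t c = { 0, 0, NULL }, b = { 0, 1, &c }, a = { 0, 0, &b };
        node_t *p = NULL;
        int visited = 0;

        while ((p = get_next(&a, p)) != NULL)
            visited++;
        printf("visited %d of 3 (one condemned)\n", visited);
        return (0);
    }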
- */ - cred_t *cr = DB_CREDDEF(mp, Q_TO_CONN(q)->conn_cred); - if (!snmpcom_req(q, mp, tcp_snmp_set, - tcp_snmp_get, cr)) { - freemsg(mp); - return; - } - } else if ((mp = mi_tpi_err_ack_alloc(mp, TPROTO, ENOTSUP)) - != NULL) - qreply(q, mp); - break; - case M_FLUSH: - putnext(q, mp); - break; - case M_IOCTL: - miocnak(q, mp, 0, ENOTSUP); - break; - default: - freemsg(mp); - break; - } -} - -/* * The TCP fast path write put procedure. * NOTE: the logic of the fast path is duplicated from tcp_wput_data() */ @@ -17441,6 +16573,7 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) int usable; conn_t *connp = (conn_t *)arg; tcp_t *tcp = connp->conn_tcp; + uint32_t msize; /* * Try and ASSERT the minimum possible references on the @@ -17455,8 +16588,15 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) (connp->conn_fanout == NULL && connp->conn_ref >= 3)); /* Bypass tcp protocol for fused tcp loopback */ - if (tcp->tcp_fused && tcp_fuse_output(tcp, mp)) - return; + if (tcp->tcp_fused) { + msize = msgdsize(mp); + mutex_enter(&connp->conn_lock); + tcp->tcp_squeue_bytes -= msize; + mutex_exit(&connp->conn_lock); + + if (tcp_fuse_output(tcp, mp, msize)) + return; + } mss = tcp->tcp_mss; if (tcp->tcp_xmit_zc_clean) @@ -17482,6 +16622,11 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) (len == 0) || (len > mss) || (tcp->tcp_valid_bits != 0)) { + msize = msgdsize(mp); + mutex_enter(&connp->conn_lock); + tcp->tcp_squeue_bytes -= msize; + mutex_exit(&connp->conn_lock); + tcp_wput_data(tcp, mp, B_FALSE); return; } @@ -17489,6 +16634,10 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) ASSERT(tcp->tcp_xmit_tail_unsent == 0); ASSERT(tcp->tcp_fin_sent == 0); + mutex_enter(&connp->conn_lock); + tcp->tcp_squeue_bytes -= len; + mutex_exit(&connp->conn_lock); + /* queue new packet onto retransmission queue */ if (tcp->tcp_xmit_head == NULL) { tcp->tcp_xmit_head = mp; @@ -17536,6 +16685,11 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) goto slow; } + if (tcp->tcp_flow_stopped && + TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) { + tcp_clrqfull(tcp); + } + /* * determine if anything to send (Nagle). * @@ -17789,6 +16943,13 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) mp = NULL; /* + * For a loopback connection with tcp_direct_sockfs on, note that + * we don't have to protect tcp_rcv_list yet because synchronous + * streams has not yet been enabled and tcp_fuse_rrw() cannot + * possibly race with us. + */ + + /* * Set the max window size (tcp_rq->q_hiwat) of the acceptor * properly. This is the first time we know of the acceptor' * queue. So we do it here. @@ -17828,9 +16989,8 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) /* Allocate room for SACK options if needed. */ stropt->so_flags |= SO_WROFF; if (tcp->tcp_fused) { - size_t sth_hiwat; - ASSERT(tcp->tcp_loopback); + ASSERT(tcp->tcp_loopback_peer != NULL); /* * For fused tcp loopback, set the stream head's write * offset value to zero since we won't be needing any room @@ -17839,16 +16999,16 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) * Non-fused tcp loopback case is handled separately below. */ stropt->so_wroff = 0; - /* - * Override q_hiwat and set it to be twice that of the - * previous value; this is to simulate non-fusion case. + * Record the stream head's high water mark for this endpoint; + * this is used for flow-control purposes in tcp_fuse_output(). 
*/ - sth_hiwat = q->q_hiwat << 1; - if (sth_hiwat > tcp_max_buf) - sth_hiwat = tcp_max_buf; - - stropt->so_hiwat = MAX(sth_hiwat, tcp_sth_rcv_hiwat); + stropt->so_hiwat = tcp_fuse_set_rcv_hiwat(tcp, q->q_hiwat); + /* + * Update the peer's transmit parameters according to + * our recently calculated high water mark value. + */ + (void) tcp_maxpsz_set(tcp->tcp_loopback_peer, B_TRUE); } else if (tcp->tcp_snd_sack_ok) { stropt->so_wroff = tcp->tcp_hdr_len + TCPOPT_MAX_SACK_LEN + (tcp->tcp_loopback ? 0 : tcp_wroff_xtra); @@ -17857,15 +17017,6 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) tcp_wroff_xtra); } - /* - * If loopback, set COPYCACHED option to make sure NOT to use - * non-temporal access. - */ - if (tcp->tcp_loopback) { - stropt->so_flags |= SO_COPYOPT; - stropt->so_copyopt = COPYCACHED; - } - /* Send the options up */ putnext(q, stropt_mp); @@ -17909,7 +17060,6 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) ASSERT(peer_tcp->tcp_fused); tcp_clrqfull(peer_tcp); - peer_tcp->tcp_flow_stopped = B_FALSE; TCP_STAT(tcp_fusion_backenabled); } } @@ -17924,11 +17074,9 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) * tcp_clean_death was deferred * for T_ORDREL_IND - do it now */ - (void) tcp_clean_death( - tcp, - tcp->tcp_client_errno, 21); - tcp->tcp_deferred_clean_death = - B_FALSE; + (void) tcp_clean_death(tcp, + tcp->tcp_client_errno, 21); + tcp->tcp_deferred_clean_death = B_FALSE; } } else { /* @@ -17942,8 +17090,14 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) tcp->tcp_hard_binding = B_FALSE; tcp->tcp_hard_bound = B_TRUE; } + tcp->tcp_detached = B_FALSE; + /* We can enable synchronous streams now */ + if (tcp->tcp_fused) { + tcp_fuse_syncstr_enable_pair(tcp); + } + if (tcp->tcp_ka_enabled) { tcp->tcp_ka_last_intrvl = 0; tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer, @@ -18236,7 +17390,7 @@ tcp_wput_accept(queue_t *q, mblk_t *mp) } } -static void +void tcp_wput(queue_t *q, mblk_t *mp) { conn_t *connp = Q_TO_CONN(q); @@ -18245,12 +17399,27 @@ tcp_wput(queue_t *q, mblk_t *mp) t_scalar_t type; uchar_t *rptr; struct iocblk *iocp; + uint32_t msize; ASSERT(connp->conn_ref >= 2); switch (DB_TYPE(mp)) { case M_DATA: - CONN_INC_REF(connp); + tcp = connp->conn_tcp; + ASSERT(tcp != NULL); + + msize = msgdsize(mp); + + mutex_enter(&connp->conn_lock); + CONN_INC_REF_LOCKED(connp); + + tcp->tcp_squeue_bytes += msize; + if (TCP_UNSENT_BYTES(tcp) > tcp->tcp_xmit_hiwater) { + mutex_exit(&connp->conn_lock); + tcp_setqfull(tcp); + } else + mutex_exit(&connp->conn_lock); + (*tcp_squeue_wput_proc)(connp->conn_sqp, mp, tcp_output, connp, SQTAG_TCP_OUTPUT); return; @@ -18265,7 +17434,7 @@ tcp_wput(queue_t *q, mblk_t *mp) type = ((union T_primitives *)rptr)->type; } else { if (tcp->tcp_debug) { - (void) strlog(TCP_MODULE_ID, 0, 1, + (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_wput_proto, dropping one..."); } @@ -18292,7 +17461,7 @@ tcp_wput(queue_t *q, mblk_t *mp) /* * Most ioctls can be processed right away without going via * squeues - process them right here. Those that do require - * squeue (currently TCP_IOC_DEFAULT_Q and SIOCPOPSOCKFS) + * squeue (currently TCP_IOC_DEFAULT_Q and _SIOCSOCKFALLBACK) * are processed by tcp_wput_ioctl(). 
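The tcp_squeue_bytes updates in the M_DATA paths above account for data that has left the stream head but has not yet been processed on the squeue. TCP_UNSENT_BYTES itself is not defined in this diff; the sketch below assumes it is the sum of that in-flight count and tcp_unsent, which is consistent with how the hunks use it:

    #include <stdio.h>
    #include <stdint.h>

    #define XMIT_HIWAT 49152    /* illustrative transmit high-water mark */

    static uint32_t squeue_bytes;    /* queued for the squeue, unprocessed */
    static uint32_t unsent;          /* on the transmit list proper */

    /* Producer (models tcp_wput): count the message in first. */
    static int
    wput_data(uint32_t msize)
    {
        squeue_bytes += msize;
        /* nonzero means assert flow control (tcp_setqfull) */
        return (squeue_bytes + unsent > XMIT_HIWAT);
    }

    /* Squeue side (models tcp_output): move bytes to the xmit list. */
    static void
    output_data(uint32_t msize)
    {
        squeue_bytes -= msize;
        unsent += msize;
    }

    int
    main(void)
    {
        printf("qfull after 64k wput: %d\n", wput_data(65536));
        output_data(65536);
        printf("in flight: %u, unsent: %u\n",
            (unsigned)squeue_bytes, (unsigned)unsent);
        return (0);
    }

Counting the bytes in on the stream-head side and out on the squeue side is what lets flow control see data still in transit between the two contexts.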
*/ iocp = (struct iocblk *)mp->b_rptr; @@ -18372,7 +17541,7 @@ tcp_wput_sock(queue_t *wq, mblk_t *mp) ASSERT(wq->q_qinfo == &tcp_sock_winit); wq->q_qinfo = &tcp_winit; - ASSERT(IS_TCP_CONN(connp)); + ASSERT(IPCL_IS_TCP(connp)); ASSERT(TCP_IS_SOCKET(tcp)); if (DB_TYPE(mp) == M_PCPROTO && @@ -18540,7 +17709,6 @@ tcp_zcopy_notify(tcp_t *tcp) mutex_exit(&stp->sd_lock); } - static void tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp) { @@ -18555,7 +17723,6 @@ tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp) uint32_t hcksum_txflags = 0; mblk_t *ire_fp_mp; uint_t ire_fp_mp_len; - ill_poll_capab_t *ill_poll; ASSERT(DB_TYPE(mp) == M_DATA); @@ -18699,7 +17866,7 @@ tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp) */ } - if ((ill->ill_capabilities & ILL_CAPAB_HCKSUM) && dohwcksum) { + if (ILL_HCKSUM_CAPABLE(ill) && dohwcksum) { ASSERT(ill->ill_hcksum_capab != NULL); hcksum_txflags = ill->ill_hcksum_capab->ill_hcksum_txflags; } @@ -18710,53 +17877,21 @@ tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp) ASSERT(ipha->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION); up = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH); - /* - * Underlying interface supports hardware checksum offload for - * the tcp payload, along with M_DATA fast path; leave the payload - * checksum for the hardware to calculate. - * - * N.B: We only need to set up checksum info on the first mblk. - */ - if (hcksum_txflags & HCKSUM_INET_FULL_V4) { - /* - * Hardware calculates pseudo-header, header and payload - * checksums, so clear checksum field in TCP header. - */ - *up = 0; - mp->b_datap->db_struioun.cksum.flags |= HCK_FULLCKSUM; - } else if (hcksum_txflags & HCKSUM_INET_PARTIAL) { - uint32_t sum; - /* - * Partial checksum offload has been enabled. Fill the - * checksum field in the TCP header with the pseudo-header - * checksum value. - */ - sum = *up + cksum + IP_TCP_CSUM_COMP; - sum = (sum & 0xFFFF) + (sum >> 16); - *up = (sum & 0xFFFF) + (sum >> 16); - mp->b_datap->db_cksumstart = IP_SIMPLE_HDR_LENGTH; - mp->b_datap->db_cksumstuff = IP_SIMPLE_HDR_LENGTH + 16; - mp->b_datap->db_cksumend = ntohs(ipha->ipha_length); - mp->b_datap->db_struioun.cksum.flags |= HCK_PARTIALCKSUM; - } else { - /* software checksumming */ + IP_CKSUM_XMIT_FAST(ire->ire_ipversion, hcksum_txflags, mp, ipha, up, + IPPROTO_TCP, IP_SIMPLE_HDR_LENGTH, ntohs(ipha->ipha_length), cksum); + + /* Software checksum? */ + if (DB_CKSUMFLAGS(mp) == 0) { TCP_STAT(tcp_out_sw_cksum); - *up = IP_CSUM(mp, IP_SIMPLE_HDR_LENGTH, - cksum + IP_TCP_CSUM_COMP); - mp->b_datap->db_struioun.cksum.flags = 0; + TCP_STAT_UPDATE(tcp_out_sw_cksum_bytes, + ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH); } ipha->ipha_fragment_offset_and_flags |= (uint32_t)htons(ire->ire_frag_flag); - /* - * Hardware supports IP header checksum offload; clear contents - * of IP header checksum field. Otherwise we calculate it. 
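The partial-checksum branch in the hunk above seeds the TCP checksum field with the folded pseudo-header sum, leaving only the payload for the hardware to add. The one's-complement folding arithmetic, standalone — byte-order handling is elided and the addresses and length are sample values:

    #include <stdio.h>
    #include <stdint.h>

    /* Fold a 32-bit one's-complement accumulator into 16 bits. */
    static uint16_t
    cksum_fold(uint32_t sum)
    {
        sum = (sum & 0xFFFF) + (sum >> 16);
        sum = (sum & 0xFFFF) + (sum >> 16);
        return ((uint16_t)sum);
    }

    int
    main(void)
    {
        uint32_t src = 0xc0a80001, dst = 0xc0a80002;    /* sample IPs */
        uint32_t sum;

        /* Pseudo-header: addresses, protocol, TCP segment length. */
        sum = (src >> 16) + (src & 0xFFFF) +
            (dst >> 16) + (dst & 0xFFFF) +
            6 /* IPPROTO_TCP */ + 1500 /* TCP length */;

        printf("seeded checksum field: 0x%04x\n", cksum_fold(sum));
        return (0);
    }

The double fold mirrors the kernel code: one pass can leave a carry, so the sum is folded until it fits in 16 bits.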
- */ - if (hcksum_txflags & HCKSUM_IPHDRCKSUM) { - ipha->ipha_hdr_checksum = 0; - mp->b_datap->db_struioun.cksum.flags |= HCK_IPV4_HDRCKSUM; - } else { + /* Calculate IP header checksum if hardware isn't capable */ + if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) { IP_HDR_CKSUM(ipha, cksum, ((uint32_t *)ipha)[0], ((uint16_t *)ipha)[4]); } @@ -18769,13 +17904,13 @@ tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp) ire->ire_last_used_time = lbolt; BUMP_MIB(&ip_mib, ipOutRequests); - if (ill->ill_capabilities & ILL_CAPAB_POLL) { - ill_poll = ill->ill_poll_capab; - ASSERT(ill_poll != NULL); - ASSERT(ill_poll->ill_tx != NULL); - ASSERT(ill_poll->ill_tx_handle != NULL); - - ill_poll->ill_tx(ill_poll->ill_tx_handle, mp); + if (ILL_POLL_CAPABLE(ill)) { + /* + * Send the packet directly to DLD, where it may be queued + * depending on the availability of transmit resources at + * the media layer. + */ + IP_POLL_ILL_TX(ill, mp); } else { putnext(ire->ire_stq, mp); } @@ -18876,7 +18011,7 @@ tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent) DISP_ADDR_AND_PORT)); #else if (tcp->tcp_debug) { - (void) strlog(TCP_MODULE_ID, 0, 1, + (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, "tcp_wput_data: data after ordrel, %s\n", tcp_display(tcp, NULL, @@ -18888,6 +18023,10 @@ tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent) (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) != 0) tcp_zcopy_notify(tcp); freemsg(mp); + if (tcp->tcp_flow_stopped && + TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) { + tcp_clrqfull(tcp); + } return; } @@ -19214,15 +18353,12 @@ done:; TCP_TIMER_RESTART(tcp, tcp->tcp_rto); } /* Note that len is the amount we just sent but with a negative sign */ - len += tcp->tcp_unsent; - tcp->tcp_unsent = len; + tcp->tcp_unsent += len; if (tcp->tcp_flow_stopped) { - if (len <= tcp->tcp_xmit_lowater) { - tcp->tcp_flow_stopped = B_FALSE; + if (TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) { tcp_clrqfull(tcp); } - } else if (len >= tcp->tcp_xmit_hiwater) { - tcp->tcp_flow_stopped = B_TRUE; + } else if (TCP_UNSENT_BYTES(tcp) >= tcp->tcp_xmit_hiwater) { tcp_setqfull(tcp); } } @@ -19361,6 +18497,12 @@ tcp_mdt_add_attrs(multidata_t *mmd, const mblk_t *dlmp, const boolean_t hwcksum, } /* + * Smaller and private version of pdescinfo_t used specifically for TCP, + * which allows for only two payload spans per packet. + */ +typedef struct tcp_pdescinfo_s PDESCINFO_STRUCT(2) tcp_pdescinfo_t; + +/* * tcp_multisend() is called by tcp_wput_data() for Multidata Transmit * scheme, and returns one the following: * @@ -19404,9 +18546,6 @@ tcp_multisend(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, #define IPVER(ip6h) ((((uint32_t *)ip6h)[0] >> 4) & 0x7) #endif -#define TCP_CSUM_OFFSET 16 -#define TCP_CSUM_SIZE 2 - #define PREP_NEW_MULTIDATA() { \ mmd = NULL; \ md_mp = md_hbuf = NULL; \ @@ -19542,8 +18681,7 @@ tcp_multisend(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, ill = ire_to_ill(ire); ASSERT(ill != NULL); - ASSERT((ill->ill_capabilities & ILL_CAPAB_MDT) == 0 || - ill->ill_mdt_capab != NULL); + ASSERT(!ILL_MDT_CAPABLE(ill) || ill->ill_mdt_capab != NULL); if (!tcp->tcp_ire_ill_check_done) { tcp_ire_ill_check(tcp, ire, ill, B_TRUE); @@ -19576,16 +18714,16 @@ tcp_multisend(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, /* does the interface support hardware checksum offload? 
*/ hwcksum_flags = 0; - if ((ill->ill_capabilities & ILL_CAPAB_HCKSUM) && + if (ILL_HCKSUM_CAPABLE(ill) && (ill->ill_hcksum_capab->ill_hcksum_txflags & - (HCKSUM_INET_FULL_V4 | HCKSUM_INET_PARTIAL | HCKSUM_IPHDRCKSUM)) && - dohwcksum) { + (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6 | HCKSUM_INET_PARTIAL | + HCKSUM_IPHDRCKSUM)) && dohwcksum) { if (ill->ill_hcksum_capab->ill_hcksum_txflags & HCKSUM_IPHDRCKSUM) hwcksum_flags = HCK_IPV4_HDRCKSUM; if (ill->ill_hcksum_capab->ill_hcksum_txflags & - HCKSUM_INET_FULL_V4) + (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6)) hwcksum_flags |= HCK_FULLCKSUM; else if (ill->ill_hcksum_capab->ill_hcksum_txflags & HCKSUM_INET_PARTIAL) @@ -19726,10 +18864,16 @@ tcp_multisend(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, * checksum offload; these are currently for IPv4. * For full checksum offload, they are set to zero. */ - if (af == AF_INET && - (hwcksum_flags & HCK_PARTIALCKSUM)) { - start = IP_SIMPLE_HDR_LENGTH; - stuff = IP_SIMPLE_HDR_LENGTH + TCP_CSUM_OFFSET; + if ((hwcksum_flags & HCK_PARTIALCKSUM)) { + if (af == AF_INET) { + start = IP_SIMPLE_HDR_LENGTH; + stuff = IP_SIMPLE_HDR_LENGTH + + TCP_CHECKSUM_OFFSET; + } else { + start = IPV6_HDR_LEN; + stuff = IPV6_HDR_LEN + + TCP_CHECKSUM_OFFSET; + } } else { start = stuff = 0; } @@ -19748,8 +18892,8 @@ tcp_multisend(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, /* fastpath mblk */ (af == AF_INET) ? ire->ire_dlureq_mp : ire->ire_nce->nce_res_mp, - /* hardware checksum enabled (IPv4 only) */ - (af == AF_INET && hwcksum_flags != 0), + /* hardware checksum enabled */ + (hwcksum_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)), /* hardware checksum offsets */ start, stuff, 0, /* hardware checksum flag */ @@ -20224,8 +19368,8 @@ legacy_send_no_md: ASSERT(IPVER(ip6h) == IPV6_VERSION); ASSERT(ip6h->ip6_nxt == IPPROTO_TCP); ASSERT(PDESC_HDRL(pkt_info) >= - (IPV6_HDR_LEN + TCP_CSUM_OFFSET + - TCP_CSUM_SIZE)); + (IPV6_HDR_LEN + TCP_CHECKSUM_OFFSET + + TCP_CHECKSUM_SIZE)); ASSERT(tcp->tcp_ipversion == IPV6_VERSION); if (tcp->tcp_ip_forward_progress) { @@ -20273,29 +19417,45 @@ legacy_send_no_md: /* offset for TCP header checksum */ up = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH); + } else { + up = (uint16_t *)&ip6h->ip6_src; - if (hwcksum_flags & HCK_FULLCKSUM) { - /* - * Hardware calculates pseudo-header, - * header and payload checksums, so - * zero out this field. 
- */ - *up = 0; - } else if (hwcksum_flags & HCK_PARTIALCKSUM) { - uint32_t sum; - - /* pseudo-header checksumming */ - sum = *up + cksum + IP_TCP_CSUM_COMP; - sum = (sum & 0xFFFF) + (sum >> 16); - *up = (sum & 0xFFFF) + (sum >> 16); - } else { - /* software checksumming */ - TCP_STAT(tcp_out_sw_cksum); - *up = IP_MD_CSUM(pkt, - IP_SIMPLE_HDR_LENGTH, - cksum + IP_TCP_CSUM_COMP); - } + /* calculate pseudo-header checksum */ + cksum = up[0] + up[1] + up[2] + up[3] + + up[4] + up[5] + up[6] + up[7] + + up[8] + up[9] + up[10] + up[11] + + up[12] + up[13] + up[14] + up[15]; + + /* Fold the initial sum */ + cksum = (cksum & 0xffff) + (cksum >> 16); + + up = (uint16_t *)(((uchar_t *)ip6h) + + IPV6_HDR_LEN + TCP_CHECKSUM_OFFSET); + } + if (hwcksum_flags & HCK_FULLCKSUM) { + /* clear checksum field for hardware */ + *up = 0; + } else if (hwcksum_flags & HCK_PARTIALCKSUM) { + uint32_t sum; + + /* pseudo-header checksumming */ + sum = *up + cksum + IP_TCP_CSUM_COMP; + sum = (sum & 0xFFFF) + (sum >> 16); + *up = (sum & 0xFFFF) + (sum >> 16); + } else { + /* software checksumming */ + TCP_STAT(tcp_out_sw_cksum); + TCP_STAT_UPDATE(tcp_out_sw_cksum_bytes, + tcp->tcp_hdr_len + tcp->tcp_last_sent_len); + *up = IP_MD_CSUM(pkt, tcp->tcp_ip_hdr_len, + cksum + IP_TCP_CSUM_COMP); + if (*up == 0) + *up = 0xFFFF; + } + + /* IPv4 header checksum */ + if (af == AF_INET) { ipha->ipha_fragment_offset_and_flags |= (uint32_t)htons(ire->ire_frag_flag); @@ -20306,19 +19466,6 @@ legacy_send_no_md: ((uint32_t *)ipha)[0], ((uint16_t *)ipha)[4]); } - } else { - up = (uint16_t *)(((uchar_t *)ip6h) + - IPV6_HDR_LEN + TCP_CSUM_OFFSET); - - /* - * Software checksumming (hardware checksum - * offload for IPv6 will hopefully be - * implemented one day). - */ - TCP_STAT(tcp_out_sw_cksum); - *up = IP_MD_CSUM(pkt, - IPV6_HDR_LEN - 2 * sizeof (in6_addr_t), - htons(IPPROTO_TCP)); } /* advance header offset */ @@ -20373,8 +19520,6 @@ legacy_send_no_md: #undef PREP_NEW_MULTIDATA #undef PREP_NEW_PBUF #undef IPVER -#undef TCP_CSUM_OFFSET -#undef TCP_CSUM_SIZE IRE_REFRELE(ire); return (0); @@ -20999,7 +20144,7 @@ tcp_ire_ill_check(tcp_t *tcp, ire_t *ire, ill_t *ill, boolean_t check_mdt) */ if (ip_multidata_outbound && check_mdt && !(ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) && - ill != NULL && (ill->ill_capabilities & ILL_CAPAB_MDT) && + ill != NULL && ILL_MDT_CAPABLE(ill) && !CONN_IPSEC_OUT_ENCAPSULATED(connp) && !(ire->ire_flags & RTF_MULTIRT) && !IPP_ENABLED(IPP_LOCAL_OUT) && @@ -21112,7 +20257,6 @@ tcp_wput_flush(tcp_t *tcp, mblk_t *mp) * tcp_xmit_lowater, so re-enable flow. */ if (tcp->tcp_flow_stopped) { - tcp->tcp_flow_stopped = B_FALSE; tcp_clrqfull(tcp); } } @@ -21305,26 +20449,47 @@ tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2) } tcp_def_q_set(tcp, mp); return; - case SIOCPOPSOCKFS: + case _SIOCSOCKFALLBACK: /* - * sockfs is being I_POP'ed, reset the flag - * indicating this - */ - tcp->tcp_issocket = B_FALSE; - - /* - * Insert this socket into the acceptor hash. - * We might need it for T_CONN_RES message + * Either sockmod is about to be popped and the socket + * would now be treated as a plain stream, or a module + * is about to be pushed so we could no longer use read- + * side synchronous streams for fused loopback tcp. + * Drain any queued data and disable direct sockfs + * interface from now on. 
*/ + if (!tcp->tcp_issocket) { + DB_TYPE(mp) = M_IOCNAK; + iocp->ioc_error = EINVAL; + } else { #ifdef _ILP32 - tcp->tcp_acceptor_id = (t_uscalar_t)RD(q); + tcp->tcp_acceptor_id = (t_uscalar_t)RD(q); #else - tcp->tcp_acceptor_id = tcp->tcp_connp->conn_dev; + tcp->tcp_acceptor_id = tcp->tcp_connp->conn_dev; #endif - tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp); - mp->b_datap->db_type = M_IOCACK; + /* + * Insert this socket into the acceptor hash. + * We might need it for T_CONN_RES message + */ + tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp); + + if (tcp->tcp_fused) { + /* + * This is a fused loopback tcp; disable + * read-side synchronous streams interface + * and drain any queued data. It is okay + * to do this for non-synchronous streams + * fused tcp as well. + */ + tcp_fuse_disable_pair(tcp, B_FALSE); + } + tcp->tcp_issocket = B_FALSE; + TCP_STAT(tcp_sock_fallback); + + DB_TYPE(mp) = M_IOCACK; + iocp->ioc_error = 0; + } iocp->ioc_count = 0; - iocp->ioc_error = 0; iocp->ioc_rval = 0; qreply(q, mp); return;
@@ -21364,7 +20529,9 @@ tcp_wput_proto(void *arg, mblk_t *mp, void *arg2) if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) { type = ((union T_primitives *)rptr)->type; if (type == T_EXDATA_REQ) { - len = msgdsize(mp->b_cont) - 1; + uint32_t msize = msgdsize(mp->b_cont); + + len = msize - 1; if (len < 0) { freemsg(mp); return; } @@ -21381,7 +20548,7 @@ tcp_wput_proto(void *arg, mblk_t *mp, void *arg2) tcp->tcp_valid_bits |= TCP_URG_VALID; /* Bypass tcp protocol for fused tcp loopback */ - if (tcp->tcp_fused && tcp_fuse_output(tcp, mp)) + if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize)) return; } else if (type != T_DATA_REQ) { goto non_urgent_data; } @@ -21393,7 +20560,7 @@ tcp_wput_proto(void *arg, mblk_t *mp, void *arg2) return; } else { if (tcp->tcp_debug) { - (void) strlog(TCP_MODULE_ID, 0, 1, SL_ERROR|SL_TRACE, + (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_wput_proto, dropping one..."); } freemsg(mp); @@ -21454,7 +20621,7 @@ non_urgent_data: * the other side. Just ignore it. */ if (tcp->tcp_debug) { - (void) strlog(TCP_MODULE_ID, 0, 1, + (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_wput_proto, T_ORDREL_REQ out of " "state %s", @@ -21468,7 +20635,7 @@ non_urgent_data: break; default: if (tcp->tcp_debug) { - (void) strlog(TCP_MODULE_ID, 0, 1, SL_ERROR|SL_TRACE, + (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_wput_proto, bogus TPI msg, type %d", tprim->type); } @@ -21530,7 +20697,7 @@ tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, uint32_t ack, int ctl) /* If a text string is passed in with the request, pass it to strlog. */ if (str != NULL && tcp->tcp_debug) { - (void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE, + (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, "tcp_xmit_ctl: '%s', seq 0x%x, ack 0x%x, ctl 0x%x", str, seq, ack, ctl); } @@ -21737,7 +20904,7 @@ tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, } if (str && q && tcp_dbg) { - (void) strlog(TCP_MODULE_ID, 0, 1, SL_TRACE, + (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, "tcp_xmit_early_reset: '%s', seq 0x%x, ack 0x%x, " "flags 0x%x", str, seq, ack, ctl); @@ -22478,7 +21645,7 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, } /* This function handles the push timeout. */ -static void +void tcp_push_timer(void *arg) { conn_t *connp = (conn_t *)arg; @@ -22488,10 +21655,18 @@ tcp_push_timer(void *arg) ASSERT(tcp->tcp_listener == NULL); + /* + * We need to stop synchronous streams temporarily to prevent a race + * with tcp_fuse_rrw() or tcp_fuse_rinfop(). It is safe to access + * tcp_rcv_list here because those entry points will return right + * away when synchronous streams is stopped. + */ + TCP_FUSE_SYNCSTR_STOP(tcp); tcp->tcp_push_tid = 0; if ((tcp->tcp_rcv_list != NULL) && (tcp_rcv_drain(tcp->tcp_rq, tcp) == TH_ACK_NEEDED)) tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); + TCP_FUSE_SYNCSTR_RESUME(tcp); } /* @@ -24059,15 +23234,14 @@ tcp_ddi_init(void) tcp_iss_key_init((uint8_t *)&tcp_g_t_info_ack, sizeof (tcp_g_t_info_ack)); -#if TCP_COUNTERS || TCP_DEBUG_COUNTER - if ((tcp_kstat = kstat_create("tcp", 0, "tcpstat", + if ((tcp_kstat = kstat_create(TCP_MOD_NAME, 0, "tcpstat", "net", KSTAT_TYPE_NAMED, sizeof (tcp_statistics) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL)) != NULL) { tcp_kstat->ks_data = &tcp_statistics; kstat_install(tcp_kstat); } -#endif + tcp_kstat_init(); } @@ -24181,7 +23355,8 @@ cl_tcp_walk_list(int (*callback)(cl_tcp_info_t *, void *), void *arg) connfp = &ipcl_globalhash_fanout[i]; connp = NULL; - while ((connp = tcp_get_next_conn(connfp, connp))) { + while ((connp = + ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { tcp = connp->conn_tcp; cl_tcpi.cl_tcpi_version = CL_TCPI_V1; @@ -24373,7 +23548,7 @@ tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *acp) */ if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES) logflags |= SL_CONSOLE; - (void) strlog(TCP_MODULE_ID, 0, 1, logflags, + (void) strlog(TCP_MOD_ID, 0, 1, logflags, "TCP_IOC_ABORT_CONN: local = %s:%d, remote = %s:%d, " "start = %d, end = %d\n", lbuf, lport, rbuf, rport, acp->ac_start, acp->ac_end); @@ -24529,7 +23704,7 @@ tcp_ioctl_abort(tcp_ioc_abort_conn_t *acp) */ if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES) logflags |= SL_CONSOLE; - (void) strlog(TCP_MODULE_ID, 0, 1, logflags, "TCP_IOC_ABORT_CONN: " + (void) strlog(TCP_MOD_ID, 0, 1, logflags, "TCP_IOC_ABORT_CONN: " "aborted %d connection%c\n", count, ((count > 1) ? 's' : ' ')); if (err == 0 && count == 0) err = ENOENT; @@ -24846,7 +24021,7 @@ process_ack: } done: if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) { - mp->b_datap->db_cksumstart = 0; + DB_CKSUMSTART(mp) = 0; mp->b_datap->db_struioflag &= ~STRUIO_EAGER; TCP_STAT(tcp_time_wait_syn_fail); } @@ -24965,7 +24140,7 @@ tcp_setsockopt_mp(int level, int cmd, char *opt, int optlen) /* * TCP Timers Implementation. */ -static timeout_id_t +timeout_id_t tcp_timeout(conn_t *connp, void (*f)(void *), clock_t tim) { mblk_t *mp; @@ -25038,7 +24213,7 @@ tcp_timer_handler(void *arg, mblk_t *mp, void *arg2) * it. But since both should execute on the same squeue, this race should not * occur. */ -static clock_t +clock_t tcp_timeout_cancel(conn_t *connp, timeout_id_t id) { mblk_t *mp = (mblk_t *)id; @@ -25165,30 +24340,48 @@ tcp_timer_free(tcp_t *tcp, mblk_t *mp) * End of TCP Timers implementation. */ -static void +/* + * tcp_{set,clr}qfull() functions are used to either set or clear QFULL + * on the specified backing STREAMS q. Note, the caller may make the + * decision to call based on the tcp_t.tcp_flow_stopped value which, + * when checked outside the q's lock, is only an advisory check ...
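To make that advisory-check-then-lock idiom concrete, here is a small user-level sketch (an illustration only, not kernel code; the queue type, flag bit, and pthread mutex are stand-ins for the STREAMS queue, its QFULL flag, and QLOCK):

#include <pthread.h>
#include <stdbool.h>

#define SKETCH_QFULL	0x01		/* stand-in for the QFULL flag */

struct sketch_q {
	int		q_flag;
	pthread_mutex_t	q_lock;		/* stand-in for QLOCK(q) */
};

/*
 * Check the flag without the lock first (advisory), then re-check
 * under the lock before changing it, mirroring tcp_setqfull().
 */
static void
sketch_setqfull(struct sketch_q *q, bool *flow_stopped)
{
	if (!(q->q_flag & SKETCH_QFULL)) {
		pthread_mutex_lock(&q->q_lock);
		if (!(q->q_flag & SKETCH_QFULL)) {
			q->q_flag |= SKETCH_QFULL;
			*flow_stopped = true;
		}
		pthread_mutex_unlock(&q->q_lock);
	}
}

The unlocked test may be stale, which is why the flag is tested again under the lock before it is modified.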
+ */ + +void tcp_setqfull(tcp_t *tcp) { queue_t *q = tcp->tcp_wq; if (!(q->q_flag & QFULL)) { - TCP_STAT(tcp_flwctl_on); mutex_enter(QLOCK(q)); - q->q_flag |= QFULL; - mutex_exit(QLOCK(q)); + if (!(q->q_flag & QFULL)) { + /* still need to set QFULL */ + q->q_flag |= QFULL; + tcp->tcp_flow_stopped = B_TRUE; + mutex_exit(QLOCK(q)); + TCP_STAT(tcp_flwctl_on); + } else { + mutex_exit(QLOCK(q)); + } } } -static void +void tcp_clrqfull(tcp_t *tcp) { queue_t *q = tcp->tcp_wq; if (q->q_flag & QFULL) { mutex_enter(QLOCK(q)); - q->q_flag &= ~QFULL; - mutex_exit(QLOCK(q)); - if (q->q_flag & QWANTW) - qbackenable(q, 0); + if (q->q_flag & QFULL) { + q->q_flag &= ~QFULL; + tcp->tcp_flow_stopped = B_FALSE; + mutex_exit(QLOCK(q)); + if (q->q_flag & QWANTW) + qbackenable(q, 0); + } else { + mutex_exit(QLOCK(q)); + } } } @@ -25254,8 +24447,8 @@ tcp_kstat_init(void) { "connTableSize6", KSTAT_DATA_INT32, 0 } }; - tcp_mibkp = kstat_create("tcp", 0, "tcp", "mib2", KSTAT_TYPE_NAMED, - NUM_OF_FIELDS(tcp_named_kstat_t), 0); + tcp_mibkp = kstat_create(TCP_MOD_NAME, 0, TCP_MOD_NAME, + "mib2", KSTAT_TYPE_NAMED, NUM_OF_FIELDS(tcp_named_kstat_t), 0); if (tcp_mibkp == NULL) return; @@ -25304,7 +24497,8 @@ tcp_kstat_update(kstat_t *kp, int rw) for (i = 0; i < CONN_G_HASH_SIZE; i++) { connfp = &ipcl_globalhash_fanout[i]; connp = NULL; - while ((connp = tcp_get_next_conn(connfp, connp))) { + while ((connp = + ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { tcp = connp->conn_tcp; switch (tcp_snmp_state(tcp)) { case MIB2_TCP_established: @@ -25401,7 +24595,7 @@ tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp) tcph = (tcph_t *)&mp->b_rptr[hdr_len]; if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) { mp->b_datap->db_struioflag |= STRUIO_EAGER; - mp->b_datap->db_cksumstart = (intptr_t)sqp; + DB_CKSUMSTART(mp) = (intptr_t)sqp; } squeue_fill(connp->conn_sqp, mp, connp->conn_recv, connp, diff --git a/usr/src/uts/common/inet/tcp/tcp6ddi.c b/usr/src/uts/common/inet/tcp/tcp6ddi.c index c055414f0a..3ccef00029 100644 --- a/usr/src/uts/common/inet/tcp/tcp6ddi.c +++ b/usr/src/uts/common/inet/tcp/tcp6ddi.c @@ -20,7 +20,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -37,7 +37,13 @@ #define INET_DEVDESC "TCP6 STREAMS driver %I%" #define INET_MODDESC "TCP6 STREAMS module %I%" #define INET_DEVMINOR TCP_MINOR6 -#define INET_DEVMTFLAGS D_MP +/* + * Note that unlike UDP, TCP uses synchronous STREAMS only + * for TCP Fusion (loopback); this is why we don't define + * D_SYNCSTR here. Since TCP as a module is used only for + * SNMP purposes, we define _D_DIRECT for device instance. + */ +#define INET_DEVMTFLAGS (D_MP|_D_DIRECT) #define INET_MODMTFLAGS D_MP #include "../inetddi.c" diff --git a/usr/src/uts/common/inet/tcp/tcp_fusion.c b/usr/src/uts/common/inet/tcp/tcp_fusion.c new file mode 100644 index 0000000000..31d54d6f95 --- /dev/null +++ b/usr/src/uts/common/inet/tcp/tcp_fusion.c @@ -0,0 +1,1087 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/types.h> +#include <sys/stream.h> +#include <sys/strsun.h> +#include <sys/strsubr.h> +#include <sys/debug.h> +#include <sys/cmn_err.h> +#include <sys/tihdr.h> + +#include <inet/common.h> +#include <inet/ip.h> +#include <inet/ip_impl.h> +#include <inet/tcp.h> +#include <inet/tcp_impl.h> +#include <inet/ipsec_impl.h> +#include <inet/ipclassifier.h> +#include <inet/ipp_common.h> + +/* + * This file implements TCP fusion - a protocol-less data path for TCP + * loopback connections. The fusion of two local TCP endpoints occurs + * at connection establishment time. Various conditions (see details + * in tcp_fuse()) need to be met for fusion to be successful. If it + * fails, we fall back to the regular TCP data path; if it succeeds, + * both endpoints proceed to use tcp_fuse_output() as the transmit path. + * tcp_fuse_output() enqueues application data directly onto the peer's + * receive queue; no protocol processing is involved. After enqueueing + * the data, the sender can either push (putnext) data up the receiver's + * read queue; or the sender can simply return and let the receiver + * retrieve the enqueued data via the synchronous streams entry point + * tcp_fuse_rrw(). The latter path is taken if synchronous streams is + * enabled (the default). It is disabled if sockfs no longer resides + * directly on top of tcp module due to a module insertion or removal. + * It also needs to be temporarily disabled when sending urgent data + * because the tcp_fuse_rrw() path bypasses the M_PROTO processing done + * by strsock_proto() hook. + * + * Synchronization is handled by squeue and the mutex tcp_fuse_lock. + * One of the requirements for fusion to succeed is that both endpoints + * need to be using the same squeue. This ensures that neither side + * can disappear while the other side is still sending data. By itself, + * squeue is not sufficient for guaranteeing safety when synchronous + * streams is enabled. The reason is that tcp_fuse_rrw() doesn't enter + * the squeue and its access to tcp_rcv_list and other fusion-related + * fields needs to be synchronized with the sender. tcp_fuse_lock is + * used for this purpose. When there is urgent data, the sender needs + * to push the data up the receiver's streams read queue. In order to + * avoid holding the tcp_fuse_lock across putnext(), the sender sets + * the peer tcp's tcp_fuse_syncstr_stopped bit and releases tcp_fuse_lock + * (see macro TCP_FUSE_SYNCSTR_STOP()). If tcp_fuse_rrw() enters after + * this point, it will see that synchronous streams is temporarily + * stopped and it will immediately return EBUSY without accessing the + * tcp_rcv_list or other fields protected by the tcp_fuse_lock. This + * will result in strget() calling getq_noenab() to dequeue data from + * the stream head instead. After the sender has finished pushing up + * all urgent data, it will clear the tcp_fuse_syncstr_stopped bit using + * TCP_FUSE_SYNCSTR_RESUME and the receiver may then resume using + * tcp_fuse_rrw() to retrieve data from tcp_rcv_list. + * + * The following note applies only to the synchronous streams mode. + * + * Flow control is done by checking the size of the receive buffer and + * the number of data blocks, both set to different limits. This is + * different from regular streams flow control where the cumulative size + * check dominates the block count check -- streams queue high water mark + * typically represents bytes. Each enqueue triggers notifications + * to the receiving process; a build-up of data blocks indicates a + * slow receiver and the sender should be blocked or informed at the + * earliest moment instead of further wasting system resources. In + * effect, this is equivalent to limiting the number of outstanding + * segments in flight. + */ + +/* + * Macros that determine whether or not IP processing is needed for TCP. + */ +#define TCP_IPOPT_POLICY_V4(tcp) \ + ((tcp)->tcp_ipversion == IPV4_VERSION && \ + ((tcp)->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH || \ + CONN_OUTBOUND_POLICY_PRESENT((tcp)->tcp_connp) || \ + CONN_INBOUND_POLICY_PRESENT((tcp)->tcp_connp))) + +#define TCP_IPOPT_POLICY_V6(tcp) \ + ((tcp)->tcp_ipversion == IPV6_VERSION && \ + ((tcp)->tcp_ip_hdr_len != IPV6_HDR_LEN || \ + CONN_OUTBOUND_POLICY_PRESENT_V6((tcp)->tcp_connp) || \ + CONN_INBOUND_POLICY_PRESENT_V6((tcp)->tcp_connp))) + +#define TCP_LOOPBACK_IP(tcp) \ + (TCP_IPOPT_POLICY_V4(tcp) || TCP_IPOPT_POLICY_V6(tcp) || \ + !CONN_IS_MD_FASTPATH((tcp)->tcp_connp)) + +/* + * Setting this to false means we disable fusion altogether and + * loopback connections would go through the protocol paths. + */ +boolean_t do_tcp_fusion = B_TRUE; + +/* + * Enabling this flag allows sockfs to retrieve data directly + * from a fused tcp endpoint using the synchronous streams interface. + */ +boolean_t do_tcp_direct_sockfs = B_TRUE; + +/* + * This is the minimum number of outstanding writes allowed on + * a synchronous streams-enabled receiving endpoint before the + * sender gets flow-controlled. Setting this value to 0 means + * that the data block limit is equivalent to the byte count + * limit, which essentially disables the check. + */ +#define TCP_FUSION_RCV_UNREAD_MIN 8 +uint_t tcp_fusion_rcv_unread_min = TCP_FUSION_RCV_UNREAD_MIN; + +static void tcp_fuse_syncstr_enable(tcp_t *); +static void tcp_fuse_syncstr_disable(tcp_t *); +static void strrput_sig(queue_t *, boolean_t); + +/* + * This routine gets called by the eager tcp upon changing state from + * SYN_RCVD to ESTABLISHED. It fuses a direct path between itself + * and the active connect tcp such that the regular tcp processing + * may be bypassed under allowable circumstances. Because the fusion + * requires both endpoints to be in the same squeue, it does not work + * for simultaneous active connects because there is no easy way to + * switch from one squeue to another once the connection is created. + * This is different from the eager tcp case where we assign it the + * same squeue as the one given to the active connect tcp during open.
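The byte/block dual limit described above reduces to a simple predicate; this sketch uses invented names (the real code compares tcp_rcv_cnt against tcp_fuse_rcv_hiwater and tcp_fuse_rcv_unread_cnt against the unread-block limit in tcp_fuse_output()):

#include <stdbool.h>
#include <stdint.h>

/*
 * Stop the sender when either the enqueued byte count or the count of
 * outstanding unread data blocks reaches its limit; a block limit of 0
 * disables the block-count check, leaving only the byte-count check.
 */
static bool
fuse_rcv_flow_stop(uint32_t rcv_bytes, uint32_t byte_hiwat,
    uint32_t unread_blocks, uint32_t block_hiwat)
{
	if (rcv_bytes >= byte_hiwat)
		return (true);
	return (block_hiwat != 0 && unread_blocks >= block_hiwat);
}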
+ */ +void +tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph) +{ + conn_t *peer_connp, *connp = tcp->tcp_connp; + tcp_t *peer_tcp; + + ASSERT(!tcp->tcp_fused); + ASSERT(tcp->tcp_loopback); + ASSERT(tcp->tcp_loopback_peer == NULL); + /* + * We need to inherit q_hiwat of the listener tcp, but we can't + * really use tcp_listener since we get here after sending up + * T_CONN_IND and tcp_wput_accept() may be called independently, + * at which point tcp_listener is cleared; this is why we use + * tcp_saved_listener. The listener itself is guaranteed to be + * around until tcp_accept_finish() is called on this eager -- + * this won't happen until we're done since we're inside the + * eager's perimeter now. + */ + ASSERT(tcp->tcp_saved_listener != NULL); + + /* + * Lookup peer endpoint; search for the remote endpoint having + * the reversed address-port quadruplet in ESTABLISHED state, + * which is guaranteed to be unique in the system. Zone check + * is applied accordingly for loopback address, but not for + * local address since we want fusion to happen across Zones. + */ + if (tcp->tcp_ipversion == IPV4_VERSION) { + peer_connp = ipcl_conn_tcp_lookup_reversed_ipv4(connp, + (ipha_t *)iphdr, tcph); + } else { + peer_connp = ipcl_conn_tcp_lookup_reversed_ipv6(connp, + (ip6_t *)iphdr, tcph); + } + + /* + * We can only proceed if peer exists, resides in the same squeue + * as our conn and is not raw-socket. The squeue assignment of + * this eager tcp was done earlier at the time of SYN processing + * in ip_fanout_tcp{_v6}. Note that similar squeues by itself + * doesn't guarantee a safe condition to fuse, hence we perform + * additional tests below. + */ + ASSERT(peer_connp == NULL || peer_connp != connp); + if (peer_connp == NULL || peer_connp->conn_sqp != connp->conn_sqp || + !IPCL_IS_TCP(peer_connp)) { + if (peer_connp != NULL) { + TCP_STAT(tcp_fusion_unqualified); + CONN_DEC_REF(peer_connp); + } + return; + } + peer_tcp = peer_connp->conn_tcp; /* active connect tcp */ + + ASSERT(peer_tcp != NULL && peer_tcp != tcp && !peer_tcp->tcp_fused); + ASSERT(peer_tcp->tcp_loopback && peer_tcp->tcp_loopback_peer == NULL); + ASSERT(peer_connp->conn_sqp == connp->conn_sqp); + + /* + * Fuse the endpoints; we perform further checks against both + * tcp endpoints to ensure that a fusion is allowed to happen. + * In particular we bail out for non-simple TCP/IP or if IPsec/ + * IPQoS policy exists. + */ + if (!tcp->tcp_unfusable && !peer_tcp->tcp_unfusable && + !TCP_LOOPBACK_IP(tcp) && !TCP_LOOPBACK_IP(peer_tcp) && + !IPP_ENABLED(IPP_LOCAL_OUT|IPP_LOCAL_IN)) { + mblk_t *mp; + struct stroptions *stropt; + queue_t *peer_rq = peer_tcp->tcp_rq; + + ASSERT(!TCP_IS_DETACHED(peer_tcp) && peer_rq != NULL); + ASSERT(tcp->tcp_fused_sigurg_mp == NULL); + ASSERT(peer_tcp->tcp_fused_sigurg_mp == NULL); + + /* + * We need to drain data on both endpoints during unfuse. + * If we need to send up SIGURG at the time of draining, + * we want to be sure that an mblk is readily available. + * This is why we pre-allocate the M_PCSIG mblks for both + * endpoints which will only be used during/after unfuse. 
+ */ + if ((mp = allocb(1, BPRI_HI)) == NULL) + goto failed; + + tcp->tcp_fused_sigurg_mp = mp; + + if ((mp = allocb(1, BPRI_HI)) == NULL) + goto failed; + + peer_tcp->tcp_fused_sigurg_mp = mp; + + /* Allocate M_SETOPTS mblk */ + if ((mp = allocb(sizeof (*stropt), BPRI_HI)) == NULL) + goto failed; + + /* Fuse both endpoints */ + peer_tcp->tcp_loopback_peer = tcp; + tcp->tcp_loopback_peer = peer_tcp; + peer_tcp->tcp_fused = tcp->tcp_fused = B_TRUE; + + /* + * We never use regular tcp paths in fusion and should + * therefore clear tcp_unsent on both endpoints. Having + * them set to non-zero values means asking for trouble + * especially after unfuse, where we may end up sending + * through regular tcp paths which expect xmit_list and + * friends to be correctly set up. + */ + peer_tcp->tcp_unsent = tcp->tcp_unsent = 0; + + tcp_timers_stop(tcp); + tcp_timers_stop(peer_tcp); + + /* + * At this point we are a detached eager tcp and therefore + * don't have a queue assigned to us until accept happens. + * In the meantime the peer endpoint may immediately send + * us data as soon as fusion is finished, and we need to be + * able to flow control it in case it sends down huge amounts + * of data while we're still detached. To prevent that we + * inherit the listener's q_hiwat value; this is temporary + * since we'll repeat the process in tcp_accept_finish(). + */ + (void) tcp_fuse_set_rcv_hiwat(tcp, + tcp->tcp_saved_listener->tcp_rq->q_hiwat); + + /* + * Set the stream head's write offset value to zero since we + * won't be needing any room for TCP/IP headers; tell it not + * to break up the writes (this would reduce the amount of + * work done by kmem); and configure our receive buffer. + * Note that we can only do this for the active connect tcp + * since our eager is still detached; it will be dealt with + * later in tcp_accept_finish(). + */ + DB_TYPE(mp) = M_SETOPTS; + mp->b_wptr += sizeof (*stropt); + + stropt = (struct stroptions *)mp->b_rptr; + stropt->so_flags = SO_MAXBLK | SO_WROFF | SO_HIWAT; + stropt->so_maxblk = tcp_maxpsz_set(peer_tcp, B_FALSE); + stropt->so_wroff = 0; + + /* + * Record the stream head's high water mark for the + * peer endpoint; this is used for flow-control + * purposes in tcp_fuse_output(). + */ + stropt->so_hiwat = tcp_fuse_set_rcv_hiwat(peer_tcp, + peer_rq->q_hiwat); + + /* Send the options up */ + putnext(peer_rq, mp); + } else { + TCP_STAT(tcp_fusion_unqualified); + } + CONN_DEC_REF(peer_connp); + return; + +failed: + if (tcp->tcp_fused_sigurg_mp != NULL) { + freeb(tcp->tcp_fused_sigurg_mp); + tcp->tcp_fused_sigurg_mp = NULL; + } + if (peer_tcp->tcp_fused_sigurg_mp != NULL) { + freeb(peer_tcp->tcp_fused_sigurg_mp); + peer_tcp->tcp_fused_sigurg_mp = NULL; + } + CONN_DEC_REF(peer_connp); +} + +/* + * Unfuse a previously-fused pair of tcp loopback endpoints. + */ +void +tcp_unfuse(tcp_t *tcp) +{ + tcp_t *peer_tcp = tcp->tcp_loopback_peer; + + ASSERT(tcp->tcp_fused && peer_tcp != NULL); + ASSERT(peer_tcp->tcp_fused && peer_tcp->tcp_loopback_peer == tcp); + ASSERT(tcp->tcp_connp->conn_sqp == peer_tcp->tcp_connp->conn_sqp); + ASSERT(tcp->tcp_unsent == 0 && peer_tcp->tcp_unsent == 0); + ASSERT(tcp->tcp_fused_sigurg_mp != NULL); + ASSERT(peer_tcp->tcp_fused_sigurg_mp != NULL); + + /* + * We disable synchronous streams, drain any queued data and + * clear tcp_direct_sockfs. The synchronous streams entry + * points will become no-ops after this point. + */ + tcp_fuse_disable_pair(tcp, B_TRUE); + + /* + * Update th_seq and th_ack in the header template + */ + U32_TO_ABE32(tcp->tcp_snxt, tcp->tcp_tcph->th_seq); + U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack); + U32_TO_ABE32(peer_tcp->tcp_snxt, peer_tcp->tcp_tcph->th_seq); + U32_TO_ABE32(peer_tcp->tcp_rnxt, peer_tcp->tcp_tcph->th_ack); + + /* Unfuse the endpoints */ + peer_tcp->tcp_fused = tcp->tcp_fused = B_FALSE; + peer_tcp->tcp_loopback_peer = tcp->tcp_loopback_peer = NULL; +} + +/* + * Fusion output routine for urgent data. This routine is called by + * tcp_fuse_output() for handling non-M_DATA mblks. + */ +void +tcp_fuse_output_urg(tcp_t *tcp, mblk_t *mp) +{ + mblk_t *mp1; + struct T_exdata_ind *tei; + tcp_t *peer_tcp = tcp->tcp_loopback_peer; + mblk_t *head, *prev_head = NULL; + + ASSERT(tcp->tcp_fused); + ASSERT(peer_tcp != NULL && peer_tcp->tcp_loopback_peer == tcp); + ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); + ASSERT(mp->b_cont != NULL && DB_TYPE(mp->b_cont) == M_DATA); + ASSERT(MBLKL(mp) >= sizeof (*tei) && MBLKL(mp->b_cont) > 0); + + /* + * Urgent data arrives in the form of T_EXDATA_REQ from above. + * Each occurrence denotes a new urgent pointer. For each new + * urgent pointer we signal (SIGURG) the receiving app to indicate + * that it needs to go into urgent mode. This is similar to the + * urgent data handling in the regular tcp. We don't need to keep + * track of where the urgent pointer is, because each T_EXDATA_REQ + * "advances" the urgent pointer for us. + * + * The actual urgent data carried by T_EXDATA_REQ is then prepended + * by a T_EXDATA_IND before being enqueued behind any existing data + * destined for the receiving app. There is only a single urgent + * pointer (out-of-band mark) for a given tcp. If new urgent + * data arrives before the receiving app reads some existing urgent + * data, the previous marker is lost. This behavior is emulated + * accordingly below, by removing any existing T_EXDATA_IND messages + * and essentially converting old urgent data into non-urgent. + */ + ASSERT(tcp->tcp_valid_bits & TCP_URG_VALID); + /* Let sender get out of urgent mode */ + tcp->tcp_valid_bits &= ~TCP_URG_VALID; + + /* + * This flag indicates that a signal needs to be sent up. + * This flag will only get cleared once SIGURG is delivered and + * is not affected by the tcp_fused flag -- delivery will still + * happen even after an endpoint is unfused, to handle the case + * where the sending endpoint immediately closes/unfuses after + * sending urgent data and the accept is not yet finished. + */ + peer_tcp->tcp_fused_sigurg = B_TRUE; + + /* Reuse T_EXDATA_REQ mblk for T_EXDATA_IND */ + DB_TYPE(mp) = M_PROTO; + tei = (struct T_exdata_ind *)mp->b_rptr; + tei->PRIM_type = T_EXDATA_IND; + tei->MORE_flag = 0; + mp->b_wptr = (uchar_t *)&tei[1]; + + TCP_STAT(tcp_fusion_urg); + BUMP_MIB(&tcp_mib, tcpOutUrg); + + head = peer_tcp->tcp_rcv_list; + while (head != NULL) { + /* + * Remove existing T_EXDATA_IND, keep the data which follows + * it and relink our list. Note that we don't modify the + * tcp_rcv_last_tail since it never points to T_EXDATA_IND.
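The relinking in the loop below can be pictured with an ordinary singly-linked list. In this stand-alone sketch (a hypothetical struct msg, with next and cont playing the roles of b_next and b_cont), a non-data header node is replaced by the data block it carries:

#include <stdlib.h>

struct msg {
	struct msg *next;	/* like b_next: next message in the list */
	struct msg *cont;	/* like b_cont: attached data block */
};

/* Splice out a header node, promoting its data continuation. */
static struct msg *
strip_header(struct msg *hdr, struct msg **listp)
{
	struct msg *data = hdr->cont;

	data->next = hdr->next;		/* inherit position in the list */
	hdr->cont = NULL;
	hdr->next = NULL;
	if (*listp == hdr)		/* fix up the list head if needed */
		*listp = data;
	free(hdr);			/* freeb(mp1) in the original */
	return (data);
}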
+ */ + if (DB_TYPE(head) != M_DATA) { + mp1 = head; + + ASSERT(DB_TYPE(mp1->b_cont) == M_DATA); + head = mp1->b_cont; + mp1->b_cont = NULL; + head->b_next = mp1->b_next; + mp1->b_next = NULL; + if (prev_head != NULL) + prev_head->b_next = head; + if (peer_tcp->tcp_rcv_list == mp1) + peer_tcp->tcp_rcv_list = head; + if (peer_tcp->tcp_rcv_last_head == mp1) + peer_tcp->tcp_rcv_last_head = head; + freeb(mp1); + } + prev_head = head; + head = head->b_next; + } +} + +/* + * Fusion output routine, called by tcp_output() and tcp_wput_proto(). + */ +boolean_t +tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size) +{ + tcp_t *peer_tcp = tcp->tcp_loopback_peer; + queue_t *peer_rq; + uint_t max_unread; + boolean_t flow_stopped; + boolean_t urgent = (DB_TYPE(mp) != M_DATA); + + ASSERT(tcp->tcp_fused); + ASSERT(peer_tcp != NULL && peer_tcp->tcp_loopback_peer == tcp); + ASSERT(tcp->tcp_connp->conn_sqp == peer_tcp->tcp_connp->conn_sqp); + ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_PROTO || + DB_TYPE(mp) == M_PCPROTO); + + peer_rq = peer_tcp->tcp_rq; + max_unread = peer_tcp->tcp_fuse_rcv_unread_hiwater; + + /* If this connection requires IP, unfuse and use regular path */ + if (TCP_LOOPBACK_IP(tcp) || TCP_LOOPBACK_IP(peer_tcp) || + IPP_ENABLED(IPP_LOCAL_OUT|IPP_LOCAL_IN)) { + TCP_STAT(tcp_fusion_aborted); + tcp_unfuse(tcp); + return (B_FALSE); + } + + if (send_size == 0) { + freemsg(mp); + return (B_TRUE); + } + + /* + * Handle urgent data; we either send up SIGURG to the peer now + * or do it later when we drain, in case the peer is detached + * or if we're short of memory for M_PCSIG mblk. + */ + if (urgent) { + /* + * We stop synchronous streams when we have urgent data + * queued to prevent tcp_fuse_rrw() from pulling it. If + * for some reason the urgent data can't be delivered + * below, synchronous streams will remain stopped until + * someone drains the tcp_rcv_list. + */ + TCP_FUSE_SYNCSTR_STOP(peer_tcp); + tcp_fuse_output_urg(tcp, mp); + } + + mutex_enter(&peer_tcp->tcp_fuse_lock); + /* + * Wake up and signal the peer; it is okay to do this before + * enqueueing because we are holding the lock. One of the + * advantages of synchronous streams is the ability for us to + * find out when the application performs a read on the socket, + * by way of the tcp_fuse_rrw() entry point being called. All + * data that gets enqueued onto the receiver is treated as if + * it has arrived at the receiving endpoint, thus generating + * SIGPOLL/SIGIO for an asynchronous socket just as in the strrput() + * case. However, we only wake up the application when necessary, + * i.e. during the first enqueue. When tcp_fuse_rrw() is called + * it will send everything upstream. + */ + if (peer_tcp->tcp_direct_sockfs && !urgent && + !TCP_IS_DETACHED(peer_tcp)) { + if (peer_tcp->tcp_rcv_list == NULL) + STR_WAKEUP_SET(STREAM(peer_tcp->tcp_rq)); + /* Update poll events and send SIGPOLL/SIGIO if necessary */ + STR_SENDSIG(STREAM(peer_tcp->tcp_rq)); + } + + /* + * Enqueue data into the peer's receive list; we may or may not + * drain the contents depending on the conditions below. + */ + tcp_rcv_enqueue(peer_tcp, mp, send_size); + + /* In case it wrapped around and also to keep it constant */ + peer_tcp->tcp_rwnd += send_size; + + /* + * Exercise flow-control when needed; we will get back-enabled + * in either tcp_accept_finish(), tcp_unfuse(), or tcp_fuse_rrw(). + * If tcp_direct_sockfs is on or if the peer endpoint is detached, + * we emulate streams flow control by checking the peer's queue + * size and high water mark; otherwise we simply use canputnext() + * to decide if we need to stop our flow. + * + * The outstanding unread data block check does not apply for a + * detached receiver; this is to avoid unnecessary blocking of the + * sender while the accept is currently in progress and is quite + * similar to the regular tcp. + */ + if (TCP_IS_DETACHED(peer_tcp) || max_unread == 0) + max_unread = UINT_MAX; + + flow_stopped = tcp->tcp_flow_stopped; + if (!flow_stopped && + (((peer_tcp->tcp_direct_sockfs || TCP_IS_DETACHED(peer_tcp)) && + (peer_tcp->tcp_rcv_cnt >= peer_tcp->tcp_fuse_rcv_hiwater || + ++peer_tcp->tcp_fuse_rcv_unread_cnt >= max_unread)) || + (!peer_tcp->tcp_direct_sockfs && + !TCP_IS_DETACHED(peer_tcp) && !canputnext(peer_tcp->tcp_rq)))) { + tcp_setqfull(tcp); + flow_stopped = B_TRUE; + TCP_STAT(tcp_fusion_flowctl); + DTRACE_PROBE4(tcp__fuse__output__flowctl, tcp_t *, tcp, + uint_t, send_size, uint_t, peer_tcp->tcp_rcv_cnt, + uint_t, peer_tcp->tcp_fuse_rcv_unread_cnt); + } else if (flow_stopped && + TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) { + tcp_clrqfull(tcp); + } + + loopback_packets++; + tcp->tcp_last_sent_len = send_size; + + /* Need to adjust the following SNMP MIB-related variables */ + tcp->tcp_snxt += send_size; + tcp->tcp_suna = tcp->tcp_snxt; + peer_tcp->tcp_rnxt += send_size; + peer_tcp->tcp_rack = peer_tcp->tcp_rnxt; + + BUMP_MIB(&tcp_mib, tcpOutDataSegs); + UPDATE_MIB(&tcp_mib, tcpOutDataBytes, send_size); + + BUMP_MIB(&tcp_mib, tcpInSegs); + BUMP_MIB(&tcp_mib, tcpInDataInorderSegs); + UPDATE_MIB(&tcp_mib, tcpInDataInorderBytes, send_size); + + BUMP_LOCAL(tcp->tcp_obsegs); + BUMP_LOCAL(peer_tcp->tcp_ibsegs); + + mutex_exit(&peer_tcp->tcp_fuse_lock); + + DTRACE_PROBE2(tcp__fuse__output, tcp_t *, tcp, uint_t, send_size); + + if (!TCP_IS_DETACHED(peer_tcp)) { + /* + * Drain the peer's receive queue if it has urgent data or if + * we're not flow-controlled. There is no need for draining + * normal data when tcp_direct_sockfs is on because the peer + * will pull the data via tcp_fuse_rrw(). + */ + if (urgent || (!flow_stopped && !peer_tcp->tcp_direct_sockfs)) { + ASSERT(peer_tcp->tcp_rcv_list != NULL); + (void) tcp_fuse_rcv_drain(peer_rq, peer_tcp, NULL); + /* + * If synchronous streams was stopped above due + * to the presence of urgent data, re-enable it. + */ + if (urgent) + TCP_FUSE_SYNCSTR_RESUME(peer_tcp); + } + } + return (B_TRUE); +} + +/* + * This routine gets called to deliver data upstream on a fused or + * previously fused tcp loopback endpoint; the latter happens only + * when there is a pending SIGURG signal plus urgent data that could + * not be sent upstream earlier. + */ +boolean_t +tcp_fuse_rcv_drain(queue_t *q, tcp_t *tcp, mblk_t **sigurg_mpp) +{ + mblk_t *mp; +#ifdef DEBUG + uint_t cnt = 0; +#endif + + ASSERT(tcp->tcp_loopback); + ASSERT(tcp->tcp_fused || tcp->tcp_fused_sigurg); + ASSERT(!tcp->tcp_fused || tcp->tcp_loopback_peer != NULL); + ASSERT(sigurg_mpp != NULL || tcp->tcp_fused); + + /* No need for the push timer now, in case it was scheduled */ + if (tcp->tcp_push_tid != 0) { + (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid); + tcp->tcp_push_tid = 0; + } + /* + * If there's urgent data sitting in the receive list and we didn't + * get a chance to send up a SIGURG signal, make sure we send + * it first before draining in order to ensure that SIOCATMARK + * works properly.
+ */ + if (tcp->tcp_fused_sigurg) { + /* + * sigurg_mpp is normally NULL, i.e. when we're still + * fused and didn't get here because of tcp_unfuse(). + * In this case try hard to allocate the M_PCSIG mblk. + */ + if (sigurg_mpp == NULL && + (mp = allocb(1, BPRI_HI)) == NULL && + (mp = allocb_tryhard(1)) == NULL) { + /* Alloc failed; try again next time */ + tcp->tcp_push_tid = TCP_TIMER(tcp, tcp_push_timer, + MSEC_TO_TICK(tcp_push_timer_interval)); + return (B_TRUE); + } else if (sigurg_mpp != NULL) { + /* + * Use the supplied M_PCSIG mblk; it means we're + * either unfused or in the process of unfusing, + * and the drain must happen now. + */ + mp = *sigurg_mpp; + *sigurg_mpp = NULL; + } + ASSERT(mp != NULL); + + tcp->tcp_fused_sigurg = B_FALSE; + /* Send up the signal */ + DB_TYPE(mp) = M_PCSIG; + *mp->b_wptr++ = (uchar_t)SIGURG; + putnext(q, mp); + /* + * Let the regular tcp_rcv_drain() path handle + * draining the data if we're no longer fused. + */ + if (!tcp->tcp_fused) + return (B_FALSE); + } + + /* + * In the synchronous streams case, we generate SIGPOLL/SIGIO for + * each M_DATA that gets enqueued onto the receiver. At this point + * we are about to drain any queued data via putnext(). In order + * to avoid extraneous signal generation from strrput(), we set + * STRGETINPROG flag at the stream head prior to the draining and + * restore it afterwards. This masks out signal generation only + * for M_DATA messages and does not affect urgent data. + */ + if (tcp->tcp_direct_sockfs) + strrput_sig(q, B_FALSE); + + /* Drain the data */ + while ((mp = tcp->tcp_rcv_list) != NULL) { + tcp->tcp_rcv_list = mp->b_next; + mp->b_next = NULL; +#ifdef DEBUG + cnt += msgdsize(mp); +#endif + putnext(q, mp); + TCP_STAT(tcp_fusion_putnext); + } + + if (tcp->tcp_direct_sockfs) + strrput_sig(q, B_TRUE); + + ASSERT(cnt == tcp->tcp_rcv_cnt); + tcp->tcp_rcv_last_head = NULL; + tcp->tcp_rcv_last_tail = NULL; + tcp->tcp_rcv_cnt = 0; + tcp->tcp_fuse_rcv_unread_cnt = 0; + tcp->tcp_rwnd = q->q_hiwat; + + return (B_TRUE); +} + +/* + * Synchronous stream entry point for sockfs to retrieve + * data directly from tcp_rcv_list. + */ +int +tcp_fuse_rrw(queue_t *q, struiod_t *dp) +{ + tcp_t *tcp = Q_TO_CONN(q)->conn_tcp; + mblk_t *mp; + + mutex_enter(&tcp->tcp_fuse_lock); + /* + * If someone had turned off tcp_direct_sockfs or if synchronous + * streams is temporarily disabled, we return EBUSY. This causes + * strget() to dequeue data from the stream head instead. + */ + if (!tcp->tcp_direct_sockfs || tcp->tcp_fuse_syncstr_stopped) { + mutex_exit(&tcp->tcp_fuse_lock); + TCP_STAT(tcp_fusion_rrw_busy); + return (EBUSY); + } + + if ((mp = tcp->tcp_rcv_list) != NULL) { + tcp_t *peer_tcp = tcp->tcp_loopback_peer; + + DTRACE_PROBE3(tcp__fuse__rrw, tcp_t *, tcp, + uint32_t, tcp->tcp_rcv_cnt, ssize_t, dp->d_uio.uio_resid); + + tcp->tcp_rcv_list = NULL; + TCP_STAT(tcp_fusion_rrw_msgcnt); + + /* + * At this point nothing should be left in tcp_rcv_list. + * The only possible case where we would have a chain of + * b_next-linked messages is urgent data, but we wouldn't + * be here if that's true since urgent data is delivered + * via putnext() and synchronous streams is stopped until + * tcp_fuse_rcv_drain() is finished. 
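The gatekeeping at the top of tcp_fuse_rrw() can be summarized by this sketch (the struct and names are invented for illustration; EBUSY tells the stream head to fall back to the ordinary getq path):

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>

struct fuse_sketch {
	pthread_mutex_t	lock;		/* stands in for tcp_fuse_lock */
	bool		direct_sockfs;	/* like tcp_direct_sockfs */
	bool		stopped;	/* like tcp_fuse_syncstr_stopped */
	void		*rcv_list;	/* like tcp_rcv_list */
};

/*
 * Take the entire receive list under the lock, or return EBUSY so
 * the caller dequeues from the stream head instead.
 */
static int
sketch_rrw(struct fuse_sketch *f, void **mpp)
{
	pthread_mutex_lock(&f->lock);
	if (!f->direct_sockfs || f->stopped) {
		pthread_mutex_unlock(&f->lock);
		return (EBUSY);
	}
	*mpp = f->rcv_list;
	f->rcv_list = NULL;
	pthread_mutex_unlock(&f->lock);
	return (0);
}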
+ */ + ASSERT(DB_TYPE(mp) == M_DATA && mp->b_next == NULL); + + tcp->tcp_rcv_last_head = NULL; + tcp->tcp_rcv_last_tail = NULL; + tcp->tcp_rcv_cnt = 0; + tcp->tcp_fuse_rcv_unread_cnt = 0; + + if (peer_tcp->tcp_flow_stopped) { + tcp_clrqfull(peer_tcp); + TCP_STAT(tcp_fusion_backenabled); + } + } + + /* + * Either we just dequeued everything or we get here from sockfs + * and have nothing to return; in this case clear RSLEEP. + */ + ASSERT(tcp->tcp_rcv_last_head == NULL); + ASSERT(tcp->tcp_rcv_last_tail == NULL); + ASSERT(tcp->tcp_rcv_cnt == 0); + ASSERT(tcp->tcp_fuse_rcv_unread_cnt == 0); + STR_WAKEUP_CLEAR(STREAM(q)); + + mutex_exit(&tcp->tcp_fuse_lock); + dp->d_mp = mp; + return (0); +} + +/* + * Synchronous stream entry point used by certain ioctls to retrieve + * information about or peek into the tcp_rcv_list. + */ +int +tcp_fuse_rinfop(queue_t *q, infod_t *dp) +{ + tcp_t *tcp = Q_TO_CONN(q)->conn_tcp; + mblk_t *mp; + uint_t cmd = dp->d_cmd; + int res = 0; + int error = 0; + struct stdata *stp = STREAM(q); + + mutex_enter(&tcp->tcp_fuse_lock); + /* If shutdown on read has happened, return nothing */ + mutex_enter(&stp->sd_lock); + if (stp->sd_flag & STREOF) { + mutex_exit(&stp->sd_lock); + goto done; + } + mutex_exit(&stp->sd_lock); + + /* + * It is OK not to return an answer if tcp_rcv_list is + * currently not accessible. + */ + if (!tcp->tcp_direct_sockfs || tcp->tcp_fuse_syncstr_stopped || + (mp = tcp->tcp_rcv_list) == NULL) + goto done; + + if (cmd & INFOD_COUNT) { + /* + * We have at least one message and + * could return only one at a time. + */ + dp->d_count++; + res |= INFOD_COUNT; + } + if (cmd & INFOD_BYTES) { + /* + * Return size of all data messages. + */ + dp->d_bytes += tcp->tcp_rcv_cnt; + res |= INFOD_BYTES; + } + if (cmd & INFOD_FIRSTBYTES) { + /* + * Return size of first data message. + */ + dp->d_bytes = msgdsize(mp); + res |= INFOD_FIRSTBYTES; + dp->d_cmd &= ~INFOD_FIRSTBYTES; + } + if (cmd & INFOD_COPYOUT) { + mblk_t *mp1; + int n; + + if (DB_TYPE(mp) == M_DATA) { + mp1 = mp; + } else { + mp1 = mp->b_cont; + ASSERT(mp1 != NULL); + } + + /* + * Return data contents of first message. + */ + ASSERT(DB_TYPE(mp1) == M_DATA); + while (mp1 != NULL && dp->d_uiop->uio_resid > 0) { + n = MIN(dp->d_uiop->uio_resid, MBLKL(mp1)); + if (n != 0 && (error = uiomove((char *)mp1->b_rptr, n, + UIO_READ, dp->d_uiop)) != 0) { + goto done; + } + mp1 = mp1->b_cont; + } + res |= INFOD_COPYOUT; + dp->d_cmd &= ~INFOD_COPYOUT; + } +done: + mutex_exit(&tcp->tcp_fuse_lock); + + dp->d_res |= res; + + return (error); +} + +/* + * Enable synchronous streams on a fused tcp loopback endpoint. + */ +static void +tcp_fuse_syncstr_enable(tcp_t *tcp) +{ + queue_t *rq = tcp->tcp_rq; + struct stdata *stp = STREAM(rq); + + /* We can only enable synchronous streams for sockfs mode */ + tcp->tcp_direct_sockfs = tcp->tcp_issocket && do_tcp_direct_sockfs; + + if (!tcp->tcp_direct_sockfs) + return; + + mutex_enter(&stp->sd_lock); + mutex_enter(QLOCK(rq)); + + /* + * We replace our q_qinfo with one that has the qi_rwp entry point. + * Clear SR_SIGALLDATA because we generate the equivalent signal(s) + * for every enqueued data in tcp_fuse_output(). + */ + rq->q_qinfo = &tcp_loopback_rinit; + rq->q_struiot = tcp_loopback_rinit.qi_struiot; + stp->sd_struiordq = rq; + stp->sd_rput_opt &= ~SR_SIGALLDATA; + + mutex_exit(QLOCK(rq)); + mutex_exit(&stp->sd_lock); +} + +/* + * Disable synchronous streams on a fused tcp loopback endpoint. 
+ */ +static void +tcp_fuse_syncstr_disable(tcp_t *tcp) +{ + queue_t *rq = tcp->tcp_rq; + struct stdata *stp = STREAM(rq); + + if (!tcp->tcp_direct_sockfs) + return; + + mutex_enter(&stp->sd_lock); + mutex_enter(QLOCK(rq)); + + /* + * Reset q_qinfo to point to the default tcp entry points. + * Also restore SR_SIGALLDATA so that strrput() can generate + * the signals again for future M_DATA messages. + */ + rq->q_qinfo = &tcp_rinit; + rq->q_struiot = tcp_rinit.qi_struiot; + stp->sd_struiordq = NULL; + stp->sd_rput_opt |= SR_SIGALLDATA; + tcp->tcp_direct_sockfs = B_FALSE; + + mutex_exit(QLOCK(rq)); + mutex_exit(&stp->sd_lock); +} + +/* + * Enable synchronous streams on a pair of fused tcp endpoints. + */ +void +tcp_fuse_syncstr_enable_pair(tcp_t *tcp) +{ + tcp_t *peer_tcp = tcp->tcp_loopback_peer; + + ASSERT(tcp->tcp_fused); + ASSERT(peer_tcp != NULL); + + tcp_fuse_syncstr_enable(tcp); + tcp_fuse_syncstr_enable(peer_tcp); +} + +/* + * Allow or disallow signals to be generated by strrput(). + */ +static void +strrput_sig(queue_t *q, boolean_t on) +{ + struct stdata *stp = STREAM(q); + + mutex_enter(&stp->sd_lock); + if (on) + stp->sd_flag &= ~STRGETINPROG; + else + stp->sd_flag |= STRGETINPROG; + mutex_exit(&stp->sd_lock); +} + +/* + * Disable synchronous streams on a pair of fused tcp endpoints and drain + * any queued data; called either during unfuse or upon transitioning from + * a socket to a stream endpoint due to _SIOCSOCKFALLBACK. + */ +void +tcp_fuse_disable_pair(tcp_t *tcp, boolean_t unfusing) +{ + tcp_t *peer_tcp = tcp->tcp_loopback_peer; + + ASSERT(tcp->tcp_fused); + ASSERT(peer_tcp != NULL); + + /* + * We need to prevent tcp_fuse_rrw() from entering before + * we can disable synchronous streams. + */ + TCP_FUSE_SYNCSTR_STOP(tcp); + TCP_FUSE_SYNCSTR_STOP(peer_tcp); + + /* + * Drain any pending data; the detached check is needed because + * we may be called as a result of a tcp_unfuse() triggered by + * tcp_fuse_output(). Note that in case of a detached tcp, the + * draining will happen later after the tcp is unfused. For non- + * urgent data, this can be handled by the regular tcp_rcv_drain(). + * If we have urgent data sitting in the receive list, we will + * need to send up a SIGURG signal first before draining the data. + * All of these will be handled by the code in tcp_fuse_rcv_drain() + * when called from tcp_rcv_drain(). + */ + if (!TCP_IS_DETACHED(tcp)) { + (void) tcp_fuse_rcv_drain(tcp->tcp_rq, tcp, + (unfusing ? &tcp->tcp_fused_sigurg_mp : NULL)); + } + if (!TCP_IS_DETACHED(peer_tcp)) { + (void) tcp_fuse_rcv_drain(peer_tcp->tcp_rq, peer_tcp, + (unfusing ? &peer_tcp->tcp_fused_sigurg_mp : NULL)); + } + + /* Lift up any flow-control conditions */ + if (tcp->tcp_flow_stopped) { + tcp_clrqfull(tcp); + TCP_STAT(tcp_fusion_backenabled); + } + if (peer_tcp->tcp_flow_stopped) { + tcp_clrqfull(peer_tcp); + TCP_STAT(tcp_fusion_backenabled); + } + + /* Disable synchronous streams */ + tcp_fuse_syncstr_disable(tcp); + tcp_fuse_syncstr_disable(peer_tcp); +} + +/* + * Calculate the size of receive buffer for a fused tcp endpoint. + */ +size_t +tcp_fuse_set_rcv_hiwat(tcp_t *tcp, size_t rwnd) +{ + ASSERT(tcp->tcp_fused); + + /* Ensure that value is within the maximum upper bound */ + if (rwnd > tcp_max_buf) + rwnd = tcp_max_buf; + + /* Obey the absolute minimum tcp receive high water mark */ + if (rwnd < tcp_sth_rcv_hiwat) + rwnd = tcp_sth_rcv_hiwat; + + /* + * Round up to system page size in case SO_RCVBUF is modified + * after SO_SNDBUF; the latter is also similarly rounded up. 
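That clamp-and-round computation can be written out as a self-contained function (the 4 KB page size and the parameter names here are assumptions for illustration; the kernel uses PAGESIZE and the tcp_max_buf/tcp_sth_rcv_hiwat tunables):

#include <stddef.h>

#define SKETCH_PAGESIZE	4096u	/* assumed page size */

/* Round x up to a power-of-two boundary, like P2ROUNDUP_TYPED. */
#define SKETCH_P2ROUNDUP(x, a)	(((x) + (a) - 1) & ~((size_t)(a) - 1))

static size_t
sketch_rcv_hiwat(size_t rwnd, size_t max_buf, size_t min_hiwat)
{
	if (rwnd > max_buf)		/* obey the upper bound */
		rwnd = max_buf;
	if (rwnd < min_hiwat)		/* and the absolute minimum */
		rwnd = min_hiwat;
	return (SKETCH_P2ROUNDUP(rwnd, SKETCH_PAGESIZE));
}

For example, a 49640-byte request within bounds rounds up to 53248 (thirteen 4 KB pages).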
+ */ + rwnd = P2ROUNDUP_TYPED(rwnd, PAGESIZE, size_t); + tcp->tcp_fuse_rcv_hiwater = rwnd; + return (rwnd); +} + +/* + * Calculate the maximum outstanding unread data blocks for a fused tcp endpoint. + */ +int +tcp_fuse_maxpsz_set(tcp_t *tcp) +{ + tcp_t *peer_tcp = tcp->tcp_loopback_peer; + uint_t sndbuf = tcp->tcp_xmit_hiwater; + uint_t maxpsz = sndbuf; + + ASSERT(tcp->tcp_fused); + ASSERT(peer_tcp != NULL); + ASSERT(peer_tcp->tcp_fuse_rcv_hiwater != 0); + /* + * In the fused loopback case, we want the stream head to split + * up larger writes into smaller chunks for a more accurate flow- + * control accounting. Our maxpsz is half of the sender's send + * buffer or the receiver's receive buffer, whichever is smaller. + * We round up the buffer to system page size due to the lack of + * a TCP MSS concept in Fusion. + */ + if (maxpsz > peer_tcp->tcp_fuse_rcv_hiwater) + maxpsz = peer_tcp->tcp_fuse_rcv_hiwater; + maxpsz = P2ROUNDUP_TYPED(maxpsz, PAGESIZE, uint_t) >> 1; + + /* + * Calculate the peer's limit for the number of outstanding unread + * data blocks. This is the number of data blocks that are allowed + * to reside in the receiver's queue before the sender gets flow + * controlled. It is used only in the synchronous streams mode as + * a way to throttle the sender when it performs consecutive writes + * faster than they can be read. The value is derived from SO_SNDBUF in + * order to give the sender some control; we divide it by a large + * value (16KB) to produce a fairly low initial limit. + */ + if (tcp_fusion_rcv_unread_min == 0) { + /* A value of 0 means that we disable the check */ + peer_tcp->tcp_fuse_rcv_unread_hiwater = 0; + } else { + peer_tcp->tcp_fuse_rcv_unread_hiwater = + MAX(sndbuf >> 14, tcp_fusion_rcv_unread_min); + } + return (maxpsz); +} diff --git a/usr/src/uts/common/inet/tcp/tcpddi.c b/usr/src/uts/common/inet/tcp/tcpddi.c index d6d21f16b5..391fc3e65d 100644 --- a/usr/src/uts/common/inet/tcp/tcpddi.c +++ b/usr/src/uts/common/inet/tcp/tcpddi.c @@ -20,7 +20,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -38,7 +38,13 @@ #define INET_DEVDESC "TCP STREAMS driver %I%" #define INET_MODDESC "TCP STREAMS module %I%" #define INET_DEVMINOR TCP_MINOR -#define INET_DEVMTFLAGS D_MP +/* + * Note that unlike UDP, TCP uses synchronous STREAMS only + * for TCP Fusion (loopback); this is why we don't define + * D_SYNCSTR here. Since TCP as a module is used only for + * SNMP purposes, we define _D_DIRECT for device instance. + */ +#define INET_DEVMTFLAGS (D_MP|_D_DIRECT) #define INET_MODMTFLAGS D_MP #include "../inetddi.c" diff --git a/usr/src/uts/common/inet/tcp_impl.h b/usr/src/uts/common/inet/tcp_impl.h new file mode 100644 index 0000000000..93c08cb144 --- /dev/null +++ b/usr/src/uts/common/inet/tcp_impl.h @@ -0,0 +1,332 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _INET_TCP_IMPL_H +#define _INET_TCP_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * TCP implementation private declarations. These interfaces are + * used to build the IP module and are not meant to be accessed + * by any modules except IP itself. They are undocumented and are + * subject to change without notice. + */ + +#ifdef __cplusplus extern "C" { +#endif + +#ifdef _KERNEL + +#include <inet/tcp.h> + +#define TCP_MOD_ID 5105 + +/* + * Was this tcp created via socket() interface? + */ +#define TCP_IS_SOCKET(tcp) ((tcp)->tcp_issocket) + +/* + * Is this tcp not attached to any upper client? + */ +#define TCP_IS_DETACHED(tcp) ((tcp)->tcp_detached) + +#define TCP_TIMER(tcp, f, tim) \ + tcp_timeout(tcp->tcp_connp, f, tim) +#define TCP_TIMER_CANCEL(tcp, id) \ + tcp_timeout_cancel(tcp->tcp_connp, id) + +/* + * To restart the TCP retransmission timer. + */ +#define TCP_TIMER_RESTART(tcp, intvl) { \ + if ((tcp)->tcp_timer_tid != 0) \ + (void) TCP_TIMER_CANCEL((tcp), (tcp)->tcp_timer_tid); \ + (tcp)->tcp_timer_tid = TCP_TIMER((tcp), tcp_timer, \ + MSEC_TO_TICK(intvl)); \ +} + +/* + * This stops synchronous streams for a fused tcp endpoint + * and prevents tcp_fuse_rrw() from pulling data from it. + */ +#define TCP_FUSE_SYNCSTR_STOP(tcp) { \ + if ((tcp)->tcp_direct_sockfs) { \ + mutex_enter(&(tcp)->tcp_fuse_lock); \ + (tcp)->tcp_fuse_syncstr_stopped = B_TRUE; \ + mutex_exit(&(tcp)->tcp_fuse_lock); \ + } \ +} + +/* + * This resumes synchronous streams for this fused tcp endpoint + * and allows tcp_fuse_rrw() to pull data from it again. + */ +#define TCP_FUSE_SYNCSTR_RESUME(tcp) { \ + if ((tcp)->tcp_direct_sockfs) { \ + mutex_enter(&(tcp)->tcp_fuse_lock); \ + (tcp)->tcp_fuse_syncstr_stopped = B_FALSE; \ + mutex_exit(&(tcp)->tcp_fuse_lock); \ + } \ +} + +/* + * Write-side flow-control is implemented via the per-instance STREAMS + * write-side Q by explicitly setting QFULL to stop the flow of mblk_t(s) + * and clearing QFULL and calling qbackenable() to restart the flow based + * on the number of TCP unsent bytes (i.e. those not on the wire waiting + * for a remote ACK). + * + * This is different from a standard STREAMS kmod, where, when using the + * STREAMS Q, the framework would automatically flow-control based on the + * defined hiwat/lowat values as mblk_t's are enqueued/dequeued. + * + * As of FireEngine TCP write-side flow-control needs to take into account + * not only the unsent tcp_xmit list bytes but also any squeue_t enqueued bytes + * (i.e. from tcp_wput() -> tcp_output()). + * + * This is accomplished by adding a new tcp_t field, tcp_squeue_bytes, to + * count the number of bytes enqueued by tcp_wput() and the number of bytes + * dequeued and processed by tcp_output(). + * + * So, the total number of bytes unsent is (squeue_bytes + unsent) with all + * flow-control uses of unsent replaced with the macro TCP_UNSENT_BYTES.
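As a rough illustration of that combined accounting (the struct and names below are stand-ins, not the kernel's):

#include <stdbool.h>
#include <stddef.h>

struct unsent_acct {
	size_t squeue_bytes;	/* enqueued by tcp_wput(), not yet processed */
	size_t unsent;		/* on the xmit list, not yet on the wire */
};

/* Equivalent of the TCP_UNSENT_BYTES() macro defined just below. */
#define SKETCH_UNSENT_BYTES(a)	((a)->squeue_bytes + (a)->unsent)

/* Flow decisions compare the combined total against a watermark. */
static bool
sketch_backenable(const struct unsent_acct *a, size_t xmit_lowater)
{
	return (SKETCH_UNSENT_BYTES(a) <= xmit_lowater);
}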
+ */ +extern void tcp_clrqfull(tcp_t *); +extern void tcp_setqfull(tcp_t *); + +#define TCP_UNSENT_BYTES(tcp) \ + ((tcp)->tcp_squeue_bytes + (tcp)->tcp_unsent) + +/* Named Dispatch Parameter Management Structure */ +typedef struct tcpparam_s { + uint32_t tcp_param_min; + uint32_t tcp_param_max; + uint32_t tcp_param_val; + char *tcp_param_name; +} tcpparam_t; + +extern tcpparam_t tcp_param_arr[]; + +#define tcp_time_wait_interval tcp_param_arr[0].tcp_param_val +#define tcp_conn_req_max_q tcp_param_arr[1].tcp_param_val +#define tcp_conn_req_max_q0 tcp_param_arr[2].tcp_param_val +#define tcp_conn_req_min tcp_param_arr[3].tcp_param_val +#define tcp_conn_grace_period tcp_param_arr[4].tcp_param_val +#define tcp_cwnd_max_ tcp_param_arr[5].tcp_param_val +#define tcp_dbg tcp_param_arr[6].tcp_param_val +#define tcp_smallest_nonpriv_port tcp_param_arr[7].tcp_param_val +#define tcp_ip_abort_cinterval tcp_param_arr[8].tcp_param_val +#define tcp_ip_abort_linterval tcp_param_arr[9].tcp_param_val +#define tcp_ip_abort_interval tcp_param_arr[10].tcp_param_val +#define tcp_ip_notify_cinterval tcp_param_arr[11].tcp_param_val +#define tcp_ip_notify_interval tcp_param_arr[12].tcp_param_val +#define tcp_ipv4_ttl tcp_param_arr[13].tcp_param_val +#define tcp_keepalive_interval_high tcp_param_arr[14].tcp_param_max +#define tcp_keepalive_interval tcp_param_arr[14].tcp_param_val +#define tcp_keepalive_interval_low tcp_param_arr[14].tcp_param_min +#define tcp_maxpsz_multiplier tcp_param_arr[15].tcp_param_val +#define tcp_mss_def_ipv4 tcp_param_arr[16].tcp_param_val +#define tcp_mss_max_ipv4 tcp_param_arr[17].tcp_param_val +#define tcp_mss_min tcp_param_arr[18].tcp_param_val +#define tcp_naglim_def tcp_param_arr[19].tcp_param_val +#define tcp_rexmit_interval_initial tcp_param_arr[20].tcp_param_val +#define tcp_rexmit_interval_max tcp_param_arr[21].tcp_param_val +#define tcp_rexmit_interval_min tcp_param_arr[22].tcp_param_val +#define tcp_deferred_ack_interval tcp_param_arr[23].tcp_param_val +#define tcp_snd_lowat_fraction tcp_param_arr[24].tcp_param_val +#define tcp_sth_rcv_hiwat tcp_param_arr[25].tcp_param_val +#define tcp_sth_rcv_lowat tcp_param_arr[26].tcp_param_val +#define tcp_dupack_fast_retransmit tcp_param_arr[27].tcp_param_val +#define tcp_ignore_path_mtu tcp_param_arr[28].tcp_param_val +#define tcp_smallest_anon_port tcp_param_arr[29].tcp_param_val +#define tcp_largest_anon_port tcp_param_arr[30].tcp_param_val +#define tcp_xmit_hiwat tcp_param_arr[31].tcp_param_val +#define tcp_xmit_lowat tcp_param_arr[32].tcp_param_val +#define tcp_recv_hiwat tcp_param_arr[33].tcp_param_val +#define tcp_recv_hiwat_minmss tcp_param_arr[34].tcp_param_val +#define tcp_fin_wait_2_flush_interval tcp_param_arr[35].tcp_param_val +#define tcp_co_min tcp_param_arr[36].tcp_param_val +#define tcp_max_buf tcp_param_arr[37].tcp_param_val +#define tcp_strong_iss tcp_param_arr[38].tcp_param_val +#define tcp_rtt_updates tcp_param_arr[39].tcp_param_val +#define tcp_wscale_always tcp_param_arr[40].tcp_param_val +#define tcp_tstamp_always tcp_param_arr[41].tcp_param_val +#define tcp_tstamp_if_wscale tcp_param_arr[42].tcp_param_val +#define tcp_rexmit_interval_extra tcp_param_arr[43].tcp_param_val +#define tcp_deferred_acks_max tcp_param_arr[44].tcp_param_val +#define tcp_slow_start_after_idle tcp_param_arr[45].tcp_param_val +#define tcp_slow_start_initial tcp_param_arr[46].tcp_param_val +#define tcp_co_timer_interval tcp_param_arr[47].tcp_param_val +#define tcp_sack_permitted tcp_param_arr[48].tcp_param_val +#define tcp_trace 
tcp_param_arr[49].tcp_param_val +#define tcp_compression_enabled tcp_param_arr[50].tcp_param_val +#define tcp_ipv6_hoplimit tcp_param_arr[51].tcp_param_val +#define tcp_mss_def_ipv6 tcp_param_arr[52].tcp_param_val +#define tcp_mss_max_ipv6 tcp_param_arr[53].tcp_param_val +#define tcp_rev_src_routes tcp_param_arr[54].tcp_param_val +#define tcp_local_dack_interval tcp_param_arr[55].tcp_param_val +#define tcp_ndd_get_info_interval tcp_param_arr[56].tcp_param_val +#define tcp_local_dacks_max tcp_param_arr[57].tcp_param_val +#define tcp_ecn_permitted tcp_param_arr[58].tcp_param_val +#define tcp_rst_sent_rate_enabled tcp_param_arr[59].tcp_param_val +#define tcp_rst_sent_rate tcp_param_arr[60].tcp_param_val +#define tcp_push_timer_interval tcp_param_arr[61].tcp_param_val +#define tcp_use_smss_as_mss_opt tcp_param_arr[62].tcp_param_val +#define tcp_keepalive_abort_interval_high tcp_param_arr[63].tcp_param_max +#define tcp_keepalive_abort_interval tcp_param_arr[63].tcp_param_val +#define tcp_keepalive_abort_interval_low tcp_param_arr[63].tcp_param_min + +/* Kstats */ +typedef struct tcp_stat { + kstat_named_t tcp_time_wait; + kstat_named_t tcp_time_wait_syn; + kstat_named_t tcp_time_wait_syn_success; + kstat_named_t tcp_time_wait_syn_fail; + kstat_named_t tcp_reinput_syn; + kstat_named_t tcp_ip_output; + kstat_named_t tcp_detach_non_time_wait; + kstat_named_t tcp_detach_time_wait; + kstat_named_t tcp_time_wait_reap; + kstat_named_t tcp_clean_death_nondetached; + kstat_named_t tcp_reinit_calls; + kstat_named_t tcp_eager_err1; + kstat_named_t tcp_eager_err2; + kstat_named_t tcp_eager_blowoff_calls; + kstat_named_t tcp_eager_blowoff_q; + kstat_named_t tcp_eager_blowoff_q0; + kstat_named_t tcp_not_hard_bound; + kstat_named_t tcp_no_listener; + kstat_named_t tcp_found_eager; + kstat_named_t tcp_wrong_queue; + kstat_named_t tcp_found_eager_binding1; + kstat_named_t tcp_found_eager_bound1; + kstat_named_t tcp_eager_has_listener1; + kstat_named_t tcp_open_alloc; + kstat_named_t tcp_open_detached_alloc; + kstat_named_t tcp_rput_time_wait; + kstat_named_t tcp_listendrop; + kstat_named_t tcp_listendropq0; + kstat_named_t tcp_wrong_rq; + kstat_named_t tcp_rsrv_calls; + kstat_named_t tcp_eagerfree2; + kstat_named_t tcp_eagerfree3; + kstat_named_t tcp_eagerfree4; + kstat_named_t tcp_eagerfree5; + kstat_named_t tcp_timewait_syn_fail; + kstat_named_t tcp_listen_badflags; + kstat_named_t tcp_timeout_calls; + kstat_named_t tcp_timeout_cached_alloc; + kstat_named_t tcp_timeout_cancel_reqs; + kstat_named_t tcp_timeout_canceled; + kstat_named_t tcp_timermp_alloced; + kstat_named_t tcp_timermp_freed; + kstat_named_t tcp_timermp_allocfail; + kstat_named_t tcp_timermp_allocdblfail; + kstat_named_t tcp_push_timer_cnt; + kstat_named_t tcp_ack_timer_cnt; + kstat_named_t tcp_ire_null1; + kstat_named_t tcp_ire_null; + kstat_named_t tcp_ip_send; + kstat_named_t tcp_ip_ire_send; + kstat_named_t tcp_wsrv_called; + kstat_named_t tcp_flwctl_on; + kstat_named_t tcp_timer_fire_early; + kstat_named_t tcp_timer_fire_miss; + kstat_named_t tcp_freelist_cleanup; + kstat_named_t tcp_rput_v6_error; + kstat_named_t tcp_out_sw_cksum; + kstat_named_t tcp_out_sw_cksum_bytes; + kstat_named_t tcp_zcopy_on; + kstat_named_t tcp_zcopy_off; + kstat_named_t tcp_zcopy_backoff; + kstat_named_t tcp_zcopy_disable; + kstat_named_t tcp_mdt_pkt_out; + kstat_named_t tcp_mdt_pkt_out_v4; + kstat_named_t tcp_mdt_pkt_out_v6; + kstat_named_t tcp_mdt_discarded; + kstat_named_t tcp_mdt_conn_halted1; + kstat_named_t tcp_mdt_conn_halted2; + kstat_named_t 
tcp_mdt_conn_halted3; + kstat_named_t tcp_mdt_conn_resumed1; + kstat_named_t tcp_mdt_conn_resumed2; + kstat_named_t tcp_mdt_legacy_small; + kstat_named_t tcp_mdt_legacy_all; + kstat_named_t tcp_mdt_legacy_ret; + kstat_named_t tcp_mdt_allocfail; + kstat_named_t tcp_mdt_addpdescfail; + kstat_named_t tcp_mdt_allocd; + kstat_named_t tcp_mdt_linked; + kstat_named_t tcp_fusion_flowctl; + kstat_named_t tcp_fusion_backenabled; + kstat_named_t tcp_fusion_urg; + kstat_named_t tcp_fusion_putnext; + kstat_named_t tcp_fusion_unfusable; + kstat_named_t tcp_fusion_aborted; + kstat_named_t tcp_fusion_unqualified; + kstat_named_t tcp_fusion_rrw_busy; + kstat_named_t tcp_fusion_rrw_msgcnt; + kstat_named_t tcp_in_ack_unsent_drop; + kstat_named_t tcp_sock_fallback; +} tcp_stat_t; + +extern tcp_stat_t tcp_statistics; + +#define TCP_STAT(x) (tcp_statistics.x.value.ui64++) +#define TCP_STAT_UPDATE(x, n) (tcp_statistics.x.value.ui64 += (n)) +#define TCP_STAT_SET(x, n) (tcp_statistics.x.value.ui64 = (n)) + +extern struct qinit tcp_loopback_rinit, tcp_rinit; +extern boolean_t do_tcp_fusion; + +extern int tcp_maxpsz_set(tcp_t *, boolean_t); +extern void tcp_timers_stop(tcp_t *); +extern void tcp_rcv_enqueue(tcp_t *, mblk_t *, uint_t); +extern void tcp_push_timer(void *); +extern timeout_id_t tcp_timeout(conn_t *, void (*)(void *), clock_t); +extern clock_t tcp_timeout_cancel(conn_t *, timeout_id_t); + +extern void tcp_fuse(tcp_t *, uchar_t *, tcph_t *); +extern void tcp_unfuse(tcp_t *); +extern boolean_t tcp_fuse_output(tcp_t *, mblk_t *, uint32_t); +extern void tcp_fuse_output_urg(tcp_t *, mblk_t *); +extern boolean_t tcp_fuse_rcv_drain(queue_t *, tcp_t *, mblk_t **); +extern void tcp_fuse_syncstr_enable_pair(tcp_t *); +extern void tcp_fuse_disable_pair(tcp_t *, boolean_t); +extern int tcp_fuse_rrw(queue_t *, struiod_t *); +extern int tcp_fuse_rinfop(queue_t *, infod_t *); +extern size_t tcp_fuse_set_rcv_hiwat(tcp_t *, size_t); +extern int tcp_fuse_maxpsz_set(tcp_t *); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _INET_TCP_IMPL_H */ diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c index 5bed5bf992..d804018911 100644 --- a/usr/src/uts/common/inet/udp/udp.c +++ b/usr/src/uts/common/inet/udp/udp.c @@ -31,6 +31,8 @@ const char udp_version[] = "%Z%%M% %I% %E% SMI"; #include <sys/types.h> #include <sys/stream.h> +#include <sys/dlpi.h> +#include <sys/pattr.h> #include <sys/stropts.h> #include <sys/strlog.h> #include <sys/strsun.h> @@ -50,6 +52,7 @@ const char udp_version[] = "%Z%%M% %I% %E% SMI"; #include <sys/zone.h> #include <sys/socket.h> +#include <sys/sockio.h> #include <sys/vtrace.h> #include <sys/debug.h> #include <sys/isa_defs.h> @@ -59,11 +62,15 @@ const char udp_version[] = "%Z%%M% %I% %E% SMI"; #include <netinet/icmp6.h> #include <netinet/udp.h> #include <net/if.h> +#include <net/route.h> #include <inet/common.h> #include <inet/ip.h> +#include <inet/ip_impl.h> #include <inet/ip6.h> #include <inet/ip_ire.h> +#include <inet/ip_if.h> +#include <inet/ip_multi.h> #include <inet/mi.h> #include <inet/mib2.h> #include <inet/nd.h> @@ -71,9 +78,12 @@ const char udp_version[] = "%Z%%M% %I% %E% SMI"; #include <inet/snmpcom.h> #include <inet/kstatcom.h> #include <inet/udp_impl.h> +#include <inet/ipclassifier.h> +#include <inet/ipsec_impl.h> +#include <inet/ipp_common.h> /* - * The ipsec_info.h header file is here since it has the defination for the + * The ipsec_info.h header file is here since it has the definition for the * M_CTL message types used by IP to convey 
information to the ULP. The * ipsec_info.h needs the pfkeyv2.h, hence the latter's presence. */ @@ -81,40 +91,138 @@ const char udp_version[] = "%Z%%M% %I% %E% SMI"; #include <inet/ipsec_info.h> /* - * Object to represent database of options to search passed to - * {sock,tpi}optcom_req() interface routine to take care of option - * management and associated methods. - * XXX. These and other externs should really move to a udp header file. - */ -extern optdb_obj_t udp_opt_obj; -extern uint_t udp_max_optsize; - - -/* * Synchronization notes: * - * UDP uses a combination of the queue-pair STREAMS perimeter, a global - * lock and a set of bind hash locks to protect its data structures. + * UDP uses a combination of its internal perimeter, a global lock and + * a set of bind hash locks to protect its data structures. Please see + * the note above udp_mode_assertions for details about the internal + * perimeter. * - * The queue-pair perimeter is not acquired exclusively in the put - * procedures thus when udp_rput or udp_wput needs exclusive access to - * the udp_t instance structure it will use qwriter(..., PERIM_INNER) to - * asynchronously acquire exclusive access to the udp_t instance. - * - * When UDP global data needs to be modified the udp_g_lock mutex is acquired. - * Currently, udp_g_head and udp_g_epriv_ports[] are protected by it. - * - * When an UDP endpoint is bound to a local port, it is inserted into + * When a UDP endpoint is bound to a local port, it is inserted into * a bind hash list. The list consists of an array of udp_fanout_t buckets. * The size of the array is controlled by the udp_bind_fanout_size variable. * This variable can be changed in /etc/system if the default value is - * not large enough. Each bind hash bucket is protected by a per bucket lock. - * It protects the udp_bind_hash and udp_ptpbhn fields in the udp_t + * not large enough. Each bind hash bucket is protected by a per bucket + * lock. It protects the udp_bind_hash and udp_ptpbhn fields in the udp_t * structure. A UDP endpoint is removed from the bind hash list only * when it is being unbound or being closed. The per bucket lock also - * protects an UDP endpoint's state changes. + * protects a UDP endpoint's state changes. + * + * Plumbing notes: + * + * Both udp and ip are merged, but the streams plumbing is kept unchanged + * in that udp is always pushed atop /dev/ip. This is done to preserve + * backwards compatibility for certain applications which rely on such + * plumbing geometry to do things such as issuing I_POP on the stream + * in order to obtain direct access to /dev/ip, etc. + * + * All UDP processing happens in the /dev/ip instance; the udp module + * instance does not possess any state about the endpoint, and merely + * acts as a dummy module whose presence keeps the streams plumbing + * appearance unchanged. At open time /dev/ip allocates a conn_t that + * happens to embed a udp_t. This stays dormant until the time udp is + * pushed, which indicates to /dev/ip that it must convert itself from + * an IP to a UDP endpoint. + * + * We only allow for the following plumbing cases: + * + * Normal: + * /dev/ip is first opened and later udp is pushed directly on top. + * This is the default action that happens when a udp socket or + * /dev/udp is opened. The conn_t created by the /dev/ip instance is + * now shared and is marked with IPCL_UDP. + * + * SNMP-only: + * udp is pushed on top of a module other than /dev/ip. When this + * happens it will support only SNMP semantics.
A new conn_t is + allocated and marked with IPCL_UDPMOD. + * + * The above cases imply that we don't support any intermediate module to + * reside in between /dev/ip and udp -- in fact, we never supported such + * a scenario in the past as the inter-layer communication semantics have + * always been private. Also note that the normal case allows for SNMP + * requests to be processed in addition to the rest of UDP operations. + * + * The normal case plumbing is depicted by the following diagram: + * + * +---------------+---------------+ + * | | | udp + * | udp_wq | udp_rq | + * | | UDP_RD | + * | | | + * +---------------+---------------+ + * | ^ + * v | + * +---------------+---------------+ + * | | | /dev/ip + * | ip_wq | ip_rq | conn_t + * | UDP_WR | | + * | | | + * +---------------+---------------+ + * + * Messages arriving at udp_wq from above will end up in ip_wq before + * they are processed, i.e. udp write entry points will advance udp_wq + * and use its q_next value as ip_wq in order to use the conn_t that + * is stored in its q_ptr. Likewise, messages generated by ip to the + * module above udp will appear as if they originated from udp_rq, + * i.e. putnext() calls to the module above udp are done using the + * udp_rq instead of ip_rq in order to avoid udp_rput() which does + * nothing more than calling putnext(). + * + * The above implies the following rules of thumb: + * + * 1. udp_t is obtained from conn_t, which is created by the /dev/ip + * instance and is stored in q_ptr of both ip_wq and ip_rq. There + * is no direct reference to conn_t from either udp_wq or udp_rq. + * + * 2. Write-side entry points of udp can obtain the conn_t via the + * Q_TO_CONN() macro, using the queue value obtained from UDP_WR(). + * + * 3. While in /dev/ip context, putnext() to the module above udp can + * be done by supplying the queue value obtained from UDP_RD(). + * */ +static queue_t *UDP_WR(queue_t *); +static queue_t *UDP_RD(queue_t *); + +udp_stat_t udp_statistics = { + { "udp_ip_send", KSTAT_DATA_UINT64 }, + { "udp_ip_ire_send", KSTAT_DATA_UINT64 }, + { "udp_ire_null", KSTAT_DATA_UINT64 }, + { "udp_drain", KSTAT_DATA_UINT64 }, + { "udp_sock_fallback", KSTAT_DATA_UINT64 }, + { "udp_rrw_busy", KSTAT_DATA_UINT64 }, + { "udp_rrw_msgcnt", KSTAT_DATA_UINT64 }, + { "udp_out_sw_cksum", KSTAT_DATA_UINT64 }, + { "udp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, + { "udp_out_opt", KSTAT_DATA_UINT64 }, + { "udp_out_err_notconn", KSTAT_DATA_UINT64 }, + { "udp_out_err_output", KSTAT_DATA_UINT64 }, + { "udp_out_err_tudr", KSTAT_DATA_UINT64 }, + { "udp_in_pktinfo", KSTAT_DATA_UINT64 }, + { "udp_in_recvdstaddr", KSTAT_DATA_UINT64 }, + { "udp_in_recvopts", KSTAT_DATA_UINT64 }, + { "udp_in_recvif", KSTAT_DATA_UINT64 }, + { "udp_in_recvslla", KSTAT_DATA_UINT64 }, + { "udp_in_recvucred", KSTAT_DATA_UINT64 }, + { "udp_in_recvttl", KSTAT_DATA_UINT64 }, + { "udp_in_recvhopopts", KSTAT_DATA_UINT64 }, + { "udp_in_recvhoplimit", KSTAT_DATA_UINT64 }, + { "udp_in_recvdstopts", KSTAT_DATA_UINT64 }, + { "udp_in_recvrtdstopts", KSTAT_DATA_UINT64 }, + { "udp_in_recvrthdr", KSTAT_DATA_UINT64 }, + { "udp_in_recvpktinfo", KSTAT_DATA_UINT64 }, + { "udp_in_recvtclass", KSTAT_DATA_UINT64 }, +#ifdef DEBUG + { "udp_data_conn", KSTAT_DATA_UINT64 }, + { "udp_data_notconn", KSTAT_DATA_UINT64 }, +#endif +}; + +static kstat_t *udp_ksp; +struct kmem_cache *udp_cache; + /* * Bind hash list size and hash function. It has to be a power of 2 for * hashing.
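The rules of thumb above reduce to a few lines of boilerplate in practice. The sketch below is illustrative only (udp_wput_sketch is a made-up name; the real entry points appear later in this patch): it recovers the shared conn_t from a dummy-module queue and replies upstream from udp_rq.

/*
 * Sketch of rules 1-3: uq is a queue of the dummy udp module, so its
 * q_ptr is NULL and the conn_t lives one q_next below, in /dev/ip.
 */
static void
udp_wput_sketch(queue_t *uq, mblk_t *mp)
{
	queue_t	*ip_wq = UDP_WR(uq);		/* step down to /dev/ip */
	conn_t	*connp = Q_TO_CONN(ip_wq);	/* rules 1 and 2 */
	udp_t	*udp = connp->conn_udp;

	/* ... act on the endpoint state held in udp and connp ... */

	/* rule 3: e.g. send a reply upstream from udp_rq, bypassing udp_rput() */
	putnext(UDP_RD(ip_wq), mp);
}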
@@ -151,14 +259,6 @@ static clock_t udp_last_ndd_get_info_time; "later.\n" #define NDD_OUT_OF_BUF_MSG "<< Out of buffer >>\n" -/* Named Dispatch Parameter Management Structure */ -typedef struct udpparam_s { - uint32_t udp_param_min; - uint32_t udp_param_max; - uint32_t udp_param_value; - char *udp_param_name; -} udpparam_t; - static void udp_addr_req(queue_t *q, mblk_t *mp); static void udp_bind(queue_t *q, mblk_t *mp); static void udp_bind_hash_insert(udp_fanout_t *uf, udp_t *udp); @@ -188,15 +288,6 @@ static int udp_open(queue_t *q, dev_t *devp, int flag, int sflag, static int udp_unitdata_opt_process(queue_t *q, mblk_t *mp, int *errorp, void *thisdg_attrs); static boolean_t udp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name); -int udp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, - uchar_t *ptr); -int udp_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, - uchar_t *ptr); -int udp_opt_set(queue_t *q, uint_t optset_context, - int level, int name, - uint_t inlen, uchar_t *invalp, - uint_t *outlenp, uchar_t *outvalp, - void *thisdg_attrs, cred_t *cr, mblk_t *mblk); static int udp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr); static boolean_t udp_param_register(udpparam_t *udppa, int cnt); static int udp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, @@ -205,62 +296,91 @@ static int udp_pkt_set(uchar_t *invalp, uint_t inlen, boolean_t sticky, uchar_t **optbufp, uint_t *optlenp); static void udp_report_item(mblk_t *mp, udp_t *udp); static void udp_rput(queue_t *q, mblk_t *mp); +static void udp_rput_other(queue_t *, mblk_t *); +static int udp_rinfop(queue_t *q, infod_t *dp); +static int udp_rrw(queue_t *q, struiod_t *dp); static void udp_rput_bind_ack(queue_t *q, mblk_t *mp); -static void udp_rput_other(queue_t *q, mblk_t *mp); -static int udp_snmp_get(queue_t *q, mblk_t *mpctl); -static int udp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name, - uchar_t *ptr, int len); static int udp_status_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr); -static void udp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err); +static void udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp, ipha_t *ipha); +static void udp_ud_err(queue_t *q, mblk_t *mp, uchar_t *destaddr, + t_scalar_t destlen, t_scalar_t err); static void udp_unbind(queue_t *q, mblk_t *mp); static in_port_t udp_update_next_port(in_port_t port, boolean_t random); static void udp_wput(queue_t *q, mblk_t *mp); -static void udp_wput_ipv6(queue_t *q, mblk_t *mp, sin6_t *sin6, - t_scalar_t tudr_optlen); +static mblk_t *udp_output_v4(conn_t *, mblk_t *mp, ipaddr_t v4dst, + uint16_t port, uint_t srcid, int *error); +static mblk_t *udp_output_v6(conn_t *connp, mblk_t *mp, sin6_t *sin6, + t_scalar_t tudr_optlen, int *error); static void udp_wput_other(queue_t *q, mblk_t *mp); static void udp_wput_iocdata(queue_t *q, mblk_t *mp); +static void udp_output(conn_t *connp, mblk_t *mp, struct sockaddr *addr, + socklen_t addrlen); +static size_t udp_set_rcv_hiwat(udp_t *udp, size_t size); static void udp_kstat_init(void); static void udp_kstat_fini(void); static int udp_kstat_update(kstat_t *kp, int rw); +static void udp_input_wrapper(void *arg, mblk_t *mp, void *arg2); +static void udp_rput_other_wrapper(void *arg, mblk_t *mp, void *arg2); +static void udp_wput_other_wrapper(void *arg, mblk_t *mp, void *arg2); +static void udp_resume_bind_cb(void *arg, mblk_t *mp, void *arg2); + +static void udp_rcv_enqueue(queue_t *q, udp_t *udp, mblk_t *mp, + uint_t pkt_len); +static void udp_rcv_drain(queue_t *q, udp_t 
*udp, boolean_t closing); +static void udp_enter(conn_t *, mblk_t *, sqproc_t, uint8_t); +static void udp_exit(conn_t *); +static void udp_become_writer(conn_t *, mblk_t *, sqproc_t, uint8_t); +#ifdef DEBUG +static void udp_mode_assertions(udp_t *, int); +#endif /* DEBUG */ major_t UDP6_MAJ; -#define UDP6 "udp6" +#define UDP6 "udp6" + +#define UDP_RECV_HIWATER (56 * 1024) +#define UDP_RECV_LOWATER 128 +#define UDP_XMIT_HIWATER (56 * 1024) +#define UDP_XMIT_LOWATER 1024 -#define UDP_MAXPACKET_IPV4 \ - (IP_MAXPACKET - UDPH_SIZE - IP_SIMPLE_HDR_LENGTH) -#define UDP_MAXPACKET_IPV6 \ - (IP_MAXPACKET - UDPH_SIZE - IPV6_HDR_LEN) +static struct module_info udp_info = { + UDP_MOD_ID, UDP_MOD_NAME, 1, INFPSZ, UDP_RECV_HIWATER, UDP_RECV_LOWATER +}; + +static struct qinit udp_rinit = { + (pfi_t)udp_rput, NULL, udp_open, udp_close, NULL, + &udp_info, NULL, udp_rrw, udp_rinfop, STRUIOT_STANDARD +}; -static struct module_info info = { - 5607, "udp", 1, INFPSZ, 512, 128 +static struct qinit udp_winit = { + (pfi_t)udp_wput, NULL, NULL, NULL, NULL, + &udp_info, NULL, NULL, NULL, STRUIOT_NONE }; -static struct qinit rinit = { - (pfi_t)udp_rput, NULL, udp_open, udp_close, NULL, &info +/* Support for just SNMP if UDP is not pushed directly over device IP */ +struct qinit udp_snmp_rinit = { + (pfi_t)putnext, NULL, udp_open, ip_snmpmod_close, NULL, + &udp_info, NULL, NULL, NULL, STRUIOT_NONE }; -static struct qinit winit = { - (pfi_t)udp_wput, NULL, NULL, NULL, NULL, &info +struct qinit udp_snmp_winit = { + (pfi_t)ip_snmpmod_wput, NULL, udp_open, ip_snmpmod_close, NULL, + &udp_info, NULL, NULL, NULL, STRUIOT_NONE }; struct streamtab udpinfo = { - &rinit, &winit + &udp_rinit, &udp_winit }; static sin_t sin_null; /* Zero address for quick clears */ static sin6_t sin6_null; /* Zero address for quick clears */ -/* Protected by udp_g_lock */ -static void *udp_g_head; /* Head for list of open udp streams. */ -kmutex_t udp_g_lock; /* Protects the above variable */ - /* Hint not protected by any lock */ static in_port_t udp_g_next_port_to_try; /* - * Extra privileged ports. In host byte order. Protected by udp_g_lock. + * Extra privileged ports. In host byte order. */ #define UDP_NUM_EPRIV_PORTS 64 static int udp_g_num_epriv_ports = UDP_NUM_EPRIV_PORTS; @@ -273,6 +393,7 @@ static IDP udp_g_nd; /* Points to table of UDP ND variables. */ static mib2_udp_t udp_mib; /* SNMP fixed size info */ static kstat_t *udp_mibkp; /* kstat exporting udp_mib data */ +#define UDP_MAXPACKET_IPV4 (IP_MAXPACKET - UDPH_SIZE - IP_SIMPLE_HDR_LENGTH) /* Default structure copied into T_INFO_ACK messages */ static struct T_info_ack udp_g_t_info_ack_ipv4 = { @@ -289,6 +410,8 @@ static struct T_info_ack udp_g_t_info_ack_ipv4 = { (XPG4_1|SENDZERO) /* PROVIDER_flag */ }; +#define UDP_MAXPACKET_IPV6 (IP_MAXPACKET - UDPH_SIZE - IPV6_HDR_LEN) + static struct T_info_ack udp_g_t_info_ack_ipv6 = { T_INFO_ACK, UDP_MAXPACKET_IPV6, /* TSDU_size. Excl. headers */ @@ -311,33 +434,23 @@ static struct T_info_ack udp_g_t_info_ack_ipv6 = { * in udp_open. * All of these are alterable, within the min/max values given, at run time. 
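Each udp_param_arr row pairs a tunable's name with a min/max/value triple (the udpparam_t definition is removed from this file below and presumably now lives in a shared header such as udp_impl.h). An ndd-style set therefore reduces to a range check before the store; a minimal sketch of that pattern, not the actual udp_param_set() declared above:

/* Sketch only: the clamp-and-store discipline implied by udpparam_t */
static int
udp_param_store(udpparam_t *udppa, uint32_t new_value)
{
	if (new_value < udppa->udp_param_min ||
	    new_value > udppa->udp_param_max)
		return (EINVAL);	/* outside the row's legal window */
	udppa->udp_param_value = new_value;
	return (0);
}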
*/ -static udpparam_t udp_param_arr[] = { - /* min max value name */ - { 0L, 256, 32, "udp_wroff_extra" }, - { 1L, 255, 255, "udp_ipv4_ttl" }, - { 0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS, "udp_ipv6_hoplimit"}, - { 1024, (32 * 1024), 1024, "udp_smallest_nonpriv_port" }, - { 0, 1, 1, "udp_do_checksum" }, - { 1024, UDP_MAX_PORT, (32 * 1024), "udp_smallest_anon_port" }, - { 1024, UDP_MAX_PORT, UDP_MAX_PORT, "udp_largest_anon_port" }, - { 4096, 1024*1024, 56*1024, "udp_xmit_hiwat"}, - { 0, 1024*1024, 1024, "udp_xmit_lowat"}, - { 4096, 1024*1024, 56*1024, "udp_recv_hiwat"}, - { 65536, 1024*1024*1024, 2*1024*1024, "udp_max_buf"}, - { 100, 60000, 1000, "udp_ndd_get_info_interval"}, +/* BEGIN CSTYLED */ +udpparam_t udp_param_arr[] = { + /*min max value name */ + { 0L, 256, 32, "udp_wroff_extra" }, + { 1L, 255, 255, "udp_ipv4_ttl" }, + { 0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS, "udp_ipv6_hoplimit"}, + { 1024, (32 * 1024), 1024, "udp_smallest_nonpriv_port" }, + { 0, 1, 1, "udp_do_checksum" }, + { 1024, UDP_MAX_PORT, (32 * 1024), "udp_smallest_anon_port" }, + { 1024, UDP_MAX_PORT, UDP_MAX_PORT, "udp_largest_anon_port" }, + { UDP_XMIT_LOWATER, (1<<30), UDP_XMIT_HIWATER, "udp_xmit_hiwat"}, + { 0, (1<<30), UDP_XMIT_LOWATER, "udp_xmit_lowat"}, + { UDP_RECV_LOWATER, (1<<30), UDP_RECV_HIWATER, "udp_recv_hiwat"}, + { 65536, (1<<30), 2*1024*1024, "udp_max_buf"}, + { 100, 60000, 1000, "udp_ndd_get_info_interval"}, }; -#define udp_wroff_extra udp_param_arr[0].udp_param_value -#define udp_ipv4_ttl udp_param_arr[1].udp_param_value -#define udp_ipv6_hoplimit udp_param_arr[2].udp_param_value -#define udp_smallest_nonpriv_port udp_param_arr[3].udp_param_value -#define udp_do_checksum udp_param_arr[4].udp_param_value -#define udp_smallest_anon_port udp_param_arr[5].udp_param_value -#define udp_largest_anon_port udp_param_arr[6].udp_param_value -#define udp_xmit_hiwat udp_param_arr[7].udp_param_value -#define udp_xmit_lowat udp_param_arr[8].udp_param_value -#define udp_recv_hiwat udp_param_arr[9].udp_param_value -#define udp_max_buf udp_param_arr[10].udp_param_value -#define udp_ndd_get_info_interval udp_param_arr[11].udp_param_value +/* END CSTYLED */ /* * The smallest anonymous port in the privileged port range which UDP @@ -354,9 +467,434 @@ uint32_t udp_random_anon_port = 1; */ void (*cl_inet_bind)(uchar_t protocol, sa_family_t addr_family, - uint8_t *laddrp, in_port_t lport) = NULL; + uint8_t *laddrp, in_port_t lport) = NULL; void (*cl_inet_unbind)(uint8_t protocol, sa_family_t addr_family, - uint8_t *laddrp, in_port_t lport) = NULL; + uint8_t *laddrp, in_port_t lport) = NULL; + +typedef union T_primitives *t_primp_t; + +#define UDP_ENQUEUE_MP(udp, mp, proc, tag) { \ + ASSERT((mp)->b_prev == NULL && (mp)->b_queue == NULL); \ + ASSERT(MUTEX_HELD(&(udp)->udp_connp->conn_lock)); \ + (mp)->b_queue = (queue_t *)((uintptr_t)tag); \ + (mp)->b_prev = (mblk_t *)proc; \ + if ((udp)->udp_mphead == NULL) \ + (udp)->udp_mphead = (mp); \ + else \ + (udp)->udp_mptail->b_next = (mp); \ + (udp)->udp_mptail = (mp); \ + (udp)->udp_mpcount++; \ +} + +#define UDP_READERS_INCREF(udp) { \ + ASSERT(MUTEX_HELD(&(udp)->udp_connp->conn_lock)); \ + (udp)->udp_reader_count++; \ +} + +#define UDP_READERS_DECREF(udp) { \ + ASSERT(MUTEX_HELD(&(udp)->udp_connp->conn_lock)); \ + (udp)->udp_reader_count--; \ + if ((udp)->udp_reader_count == 0) \ + cv_broadcast(&(udp)->udp_connp->conn_cv); \ +} + +#define UDP_SQUEUE_DECREF(udp) { \ + ASSERT(MUTEX_HELD(&(udp)->udp_connp->conn_lock)); \ + (udp)->udp_squeue_count--; \ + if ((udp)->udp_squeue_count == 0) \ + 
cv_broadcast(&(udp)->udp_connp->conn_cv); \ +} + +/* + * Notes on UDP endpoint synchronization: + * + * UDP needs exclusive operation on a per endpoint basis, when executing + * functions that modify the endpoint state. udp_rput_other() deals with + * packets with IP options, and processing these packets end up having + * to update the endpoint's option related state. udp_wput_other() deals + * with control operations from the top, e.g. connect() that needs to + * update the endpoint state. These could be synchronized using locks, + * but the current version uses squeues for this purpose. squeues may + * give performance improvement for certain cases such as connected UDP + * sockets; thus the framework allows for using squeues. + * + * The perimeter routines are described as follows: + * + * udp_enter(): + * Enter the UDP endpoint perimeter. + * + * udp_become_writer(): + * Become exclusive on the UDP endpoint. Specifies a function + * that will be called exclusively either immediately or later + * when the perimeter is available exclusively. + * + * udp_exit(): + * Exit the UDP perimeter. + * + * Entering UDP from the top or from the bottom must be done using + * udp_enter(). No lock must be held while attempting to enter the UDP + * perimeter. When finished, udp_exit() must be called to get out of + * the perimeter. + * + * UDP operates in either MT_HOT mode or in SQUEUE mode. In MT_HOT mode, + * multiple threads may enter a UDP endpoint concurrently. This is used + * for sending and/or receiving normal data. Control operations and other + * special cases call udp_become_writer() to become exclusive on a per + * endpoint basis and this results in transitioning to SQUEUE mode. squeue + * by definition serializes access to the conn_t. When there are no more + * pending messages on the squeue for the UDP connection, the endpoint + * reverts to MT_HOT mode. During the interregnum when not all MT threads + * of an endpoint have finished, messages are queued in the UDP endpoint + * and the UDP is in UDP_MT_QUEUED mode or UDP_QUEUED_SQUEUE mode. + * + * These modes have the following analogs: + * + * UDP_MT_HOT/udp_reader_count==0 none + * UDP_MT_HOT/udp_reader_count>0 RW_READ_LOCK + * UDP_MT_QUEUED RW_WRITE_WANTED + * UDP_SQUEUE or UDP_QUEUED_SQUEUE RW_WRITE_LOCKED + * + * Stable modes: UDP_MT_HOT, UDP_SQUEUE + * Transient modes: UDP_MT_QUEUED, UDP_QUEUED_SQUEUE + * + * While in stable modes, UDP keeps track of the number of threads + * operating on the endpoint. The udp_reader_count variable represents + * the number of threads entering the endpoint as readers while it is + * in UDP_MT_HOT mode. Transitioning to UDP_SQUEUE happens when there + * is only a single reader, i.e. when this counter drops to 1. Likewise, + * udp_squeue_count represents the number of threads operating on the + * endpoint's squeue while it is in UDP_SQUEUE mode. The mode transition + * to UDP_MT_HOT happens after the last thread exits the endpoint, i.e. + * when this counter drops to 0. + * + * The default mode is set to UDP_MT_HOT and UDP alternates between + * UDP_MT_HOT and UDP_SQUEUE as shown in the state transition below. 
+ * + * Mode transition: + * ---------------------------------------------------------------- + * old mode Event New mode + * ---------------------------------------------------------------- + * UDP_MT_HOT Call to udp_become_writer() UDP_SQUEUE + * and udp_reader_count == 1 + * + * UDP_MT_HOT Call to udp_become_writer() UDP_MT_QUEUED + * and udp_reader_count > 1 + * + * UDP_MT_QUEUED udp_reader_count drops to zero UDP_QUEUED_SQUEUE + * + * UDP_QUEUED_SQUEUE All messages enqueued on the UDP_SQUEUE + * internal UDP queue successfully + * moved to squeue AND udp_squeue_count != 0 + * + * UDP_QUEUED_SQUEUE All messages enqueued on the UDP_MT_HOT + * internal UDP queue successfully + * moved to squeue AND udp_squeue_count + * drops to zero + * + * UDP_SQUEUE udp_squeue_count drops to zero UDP_MT_HOT + * ---------------------------------------------------------------- + */ + +static queue_t * +UDP_WR(queue_t *q) +{ + ASSERT(q->q_ptr == NULL && _OTHERQ(q)->q_ptr == NULL); + ASSERT(WR(q)->q_next != NULL && WR(q)->q_next->q_ptr != NULL); + ASSERT(IPCL_IS_UDP(Q_TO_CONN(WR(q)->q_next))); + + return (_WR(q)->q_next); +} + +static queue_t * +UDP_RD(queue_t *q) +{ + ASSERT(q->q_ptr != NULL && _OTHERQ(q)->q_ptr != NULL); + ASSERT(IPCL_IS_UDP(Q_TO_CONN(q))); + ASSERT(RD(q)->q_next != NULL && RD(q)->q_next->q_ptr == NULL); + + return (_RD(q)->q_next); +} + +#ifdef DEBUG +#define UDP_MODE_ASSERTIONS(udp, caller) udp_mode_assertions(udp, caller) +#else +#define UDP_MODE_ASSERTIONS(udp, caller) +#endif + +/* Invariants */ +#ifdef DEBUG + +uint32_t udp_count[4]; + +/* Context of udp_mode_assertions */ +#define UDP_ENTER 1 +#define UDP_BECOME_WRITER 2 +#define UDP_EXIT 3 + +static void +udp_mode_assertions(udp_t *udp, int caller) +{ + ASSERT(MUTEX_HELD(&udp->udp_connp->conn_lock)); + + switch (udp->udp_mode) { + case UDP_MT_HOT: + /* + * Messages have not yet been enqueued on the internal queue, + * otherwise we would have switched to UDP_MT_QUEUED. Likewise + * by definition, there can't be any messages enqueued on the + * squeue. The UDP could be quiescent, so udp_reader_count + * could be zero at entry. + */ + ASSERT(udp->udp_mphead == NULL && udp->udp_mpcount == 0 && + udp->udp_squeue_count == 0); + ASSERT(caller == UDP_ENTER || udp->udp_reader_count != 0); + udp_count[0]++; + break; + + case UDP_MT_QUEUED: + /* + * The last MT thread to exit the udp perimeter empties the + * internal queue and then switches the UDP to + * UDP_QUEUED_SQUEUE mode. Since we are still in UDP_MT_QUEUED + * mode, it means there must be at least 1 MT thread still in + * the perimeter and at least 1 message on the internal queue. + */ + ASSERT(udp->udp_reader_count >= 1 && udp->udp_mphead != NULL && + udp->udp_mpcount != 0 && udp->udp_squeue_count == 0); + udp_count[1]++; + break; + + case UDP_QUEUED_SQUEUE: + /* + * The switch has happened from MT to SQUEUE. So there can't + * be any MT threads. Messages could still pile up on the + * internal queue until the transition is complete and we move + * to UDP_SQUEUE mode. We can't assert on nonzero + * udp_squeue_count since the squeue could drain any time. + */ + ASSERT(udp->udp_reader_count == 0); + udp_count[2]++; + break; + + case UDP_SQUEUE: + /* + * The transition is complete. There can't be any messages on + * the internal queue. The udp could be quiescent or the squeue + * could drain any time, so we can't assert on nonzero + * udp_squeue_count during entry.
Nor can we assert that + * udp_reader_count is zero, since a reader thread could have + * directly become the writer in line by calling udp_become_writer + * without going through the queued states. + */ + ASSERT(udp->udp_mphead == NULL && udp->udp_mpcount == 0); + ASSERT(caller == UDP_ENTER || udp->udp_squeue_count != 0); + udp_count[3]++; + break; + } +} +#endif + +#define _UDP_ENTER(connp, mp, proc, tag) { \ + udp_t *_udp = (connp)->conn_udp; \ + \ + mutex_enter(&(connp)->conn_lock); \ + if ((connp)->conn_state_flags & CONN_CLOSING) { \ + mutex_exit(&(connp)->conn_lock); \ + freemsg(mp); \ + } else { \ + UDP_MODE_ASSERTIONS(_udp, UDP_ENTER); \ + \ + switch (_udp->udp_mode) { \ + case UDP_MT_HOT: \ + /* We can execute as reader right away. */ \ + UDP_READERS_INCREF(_udp); \ + mutex_exit(&(connp)->conn_lock); \ + (*(proc))(connp, mp, (connp)->conn_sqp); \ + break; \ + \ + case UDP_SQUEUE: \ + /* \ + * We are in squeue mode, send the \ + * packet to the squeue \ + */ \ + _udp->udp_squeue_count++; \ + CONN_INC_REF_LOCKED(connp); \ + mutex_exit(&(connp)->conn_lock); \ + squeue_enter((connp)->conn_sqp, mp, proc, \ + connp, tag); \ + break; \ + \ + case UDP_MT_QUEUED: \ + case UDP_QUEUED_SQUEUE: \ + /* \ + * Some messages may have been enqueued \ + * ahead of us. Enqueue the new message \ + * at the tail of the internal queue to \ + * preserve message ordering. \ + */ \ + UDP_ENQUEUE_MP(_udp, mp, proc, tag); \ + mutex_exit(&(connp)->conn_lock); \ + break; \ + } \ + } \ +} + +static void +udp_enter(conn_t *connp, mblk_t *mp, sqproc_t proc, uint8_t tag) +{ + _UDP_ENTER(connp, mp, proc, tag); +} + +static void +udp_become_writer(conn_t *connp, mblk_t *mp, sqproc_t proc, uint8_t tag) +{ + udp_t *udp; + + udp = connp->conn_udp; + + mutex_enter(&connp->conn_lock); + + UDP_MODE_ASSERTIONS(udp, UDP_BECOME_WRITER); + + switch (udp->udp_mode) { + case UDP_MT_HOT: + if (udp->udp_reader_count == 1) { + /* + * We are the only MT thread. Switch to squeue mode + * immediately. + */ + udp->udp_mode = UDP_SQUEUE; + udp->udp_squeue_count = 1; + CONN_INC_REF_LOCKED(connp); + mutex_exit(&connp->conn_lock); + squeue_enter(connp->conn_sqp, mp, proc, connp, tag); + return; + } + /* FALLTHRU */ + + case UDP_MT_QUEUED: + /* Enqueue the packet internally in UDP */ + udp->udp_mode = UDP_MT_QUEUED; + UDP_ENQUEUE_MP(udp, mp, proc, tag); + mutex_exit(&connp->conn_lock); + return; + + case UDP_SQUEUE: + case UDP_QUEUED_SQUEUE: + /* + * We are already exclusive, i.e. we are already the + * writer. Simply call the desired function. + */ + udp->udp_squeue_count++; + mutex_exit(&connp->conn_lock); + (*proc)(connp, mp, connp->conn_sqp); + return; + } +} + +/* + * Transition from MT mode to SQUEUE mode, when the last MT thread + * is exiting the UDP perimeter. Move all messages from the internal + * udp queue to the squeue. A better way would be to move all the + * messages in one shot; this needs more support from the squeue framework. + */ +static void +udp_switch_to_squeue(udp_t *udp) +{ + mblk_t *mp; + mblk_t *mp_next; + sqproc_t proc; + uint8_t tag; + conn_t *connp = udp->udp_connp; + + ASSERT(MUTEX_HELD(&connp->conn_lock)); + ASSERT(udp->udp_mode == UDP_MT_QUEUED); + while (udp->udp_mphead != NULL) { + mp = udp->udp_mphead; + udp->udp_mphead = NULL; + udp->udp_mptail = NULL; + udp->udp_mpcount = 0; + udp->udp_mode = UDP_QUEUED_SQUEUE; + mutex_exit(&connp->conn_lock); + /* + * It is best not to hold any locks across the calls + * to squeue functions.
Since we drop the lock, we + * need to go back and check the udp_mphead once again + * after the squeue_fill; hence the while loop at + * the top of this function. + */ + for (; mp != NULL; mp = mp_next) { + mp_next = mp->b_next; + proc = (sqproc_t)mp->b_prev; + tag = (uint8_t)((uintptr_t)mp->b_queue); + mp->b_next = NULL; + mp->b_prev = NULL; + mp->b_queue = NULL; + CONN_INC_REF(connp); + udp->udp_squeue_count++; + squeue_fill(connp->conn_sqp, mp, proc, connp, + tag); + } + mutex_enter(&connp->conn_lock); + } + /* + * udp_squeue_count of zero implies that the squeue has drained + * even before we arrived here (i.e. after the squeue_fill above). + */ + udp->udp_mode = (udp->udp_squeue_count != 0) ? + UDP_SQUEUE : UDP_MT_HOT; +} + +#define _UDP_EXIT(connp) { \ + udp_t *_udp = (connp)->conn_udp; \ + \ + mutex_enter(&(connp)->conn_lock); \ + UDP_MODE_ASSERTIONS(_udp, UDP_EXIT); \ + \ + switch (_udp->udp_mode) { \ + case UDP_MT_HOT: \ + UDP_READERS_DECREF(_udp); \ + mutex_exit(&(connp)->conn_lock); \ + break; \ + \ + case UDP_SQUEUE: \ + UDP_SQUEUE_DECREF(_udp); \ + if (_udp->udp_squeue_count == 0) \ + _udp->udp_mode = UDP_MT_HOT; \ + mutex_exit(&(connp)->conn_lock); \ + break; \ + \ + case UDP_MT_QUEUED: \ + /* \ + * If this is the last MT thread, we need to \ + * switch to squeue mode \ + */ \ + UDP_READERS_DECREF(_udp); \ + if (_udp->udp_reader_count == 0) \ + udp_switch_to_squeue(_udp); \ + mutex_exit(&(connp)->conn_lock); \ + break; \ + \ + case UDP_QUEUED_SQUEUE: \ + UDP_SQUEUE_DECREF(_udp); \ + /* \ + * Even if the udp_squeue_count drops to zero, we \ + * don't want to change udp_mode to UDP_MT_HOT here. \ + * The thread in udp_switch_to_squeue will take care \ + * of the transition to UDP_MT_HOT, after emptying \ + * any more new messages that have been enqueued in \ + * udp_mphead. \ + */ \ + mutex_exit(&(connp)->conn_lock); \ + break; \ + } \ +} + +static void +udp_exit(conn_t *connp) +{ + _UDP_EXIT(connp); +} /* * Return the next anonymous port in the privileged port range for @@ -379,9 +917,13 @@ static int udp_bind_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) { udp_fanout_t *udpf; - udp_t *udp; int i; zoneid_t zoneid; + conn_t *connp; + udp_t *udp; + + connp = Q_TO_CONN(q); + udp = connp->conn_udp; /* Refer to comments in udp_status_report(). 
*/ if (cr == NULL || secpolicy_net_config(cr, B_TRUE) != 0) { @@ -403,8 +945,7 @@ udp_bind_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) " zone lport src addr dest addr port state"); /* 1234 12345 xxx.xxx.xxx.xxx xxx.xxx.xxx.xxx 12345 UNBOUND */ - udp = (udp_t *)q->q_ptr; - zoneid = udp->udp_zoneid; + zoneid = connp->conn_zoneid; for (i = 0; i < udp_bind_fanout_size; i++) { udpf = &udp_bind_fanout[i]; @@ -415,7 +956,7 @@ udp_bind_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) if (zoneid != GLOBAL_ZONEID) { /* skip to first entry in this zone; might be none */ while (udp != NULL && - udp->udp_zoneid != zoneid) + udp->udp_connp->conn_zoneid != zoneid) udp = udp->udp_bind_hash; } if (udp != NULL) { @@ -432,7 +973,7 @@ udp_bind_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) } for (; udp != NULL; udp = udp->udp_bind_hash) { if (zoneid == GLOBAL_ZONEID || - zoneid == udp->udp_zoneid) + zoneid == udp->udp_connp->conn_zoneid) udp_report_item(mp->b_cont, udp); } } @@ -542,7 +1083,6 @@ udp_bind(queue_t *q, mblk_t *mp) in_port_t port; /* Host byte order */ in_port_t requested_port; /* Host byte order */ struct T_bind_req *tbr; - udp_t *udp; int count; in6_addr_t v6src; boolean_t bind_to_req_port_only; @@ -550,8 +1090,11 @@ udp_bind(queue_t *q, mblk_t *mp) udp_fanout_t *udpf; in_port_t lport; /* Network byte order */ zoneid_t zoneid; + conn_t *connp; + udp_t *udp; - udp = (udp_t *)q->q_ptr; + connp = Q_TO_CONN(q); + udp = connp->conn_udp; if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, "udp_bind: bad req, len %u", @@ -559,6 +1102,7 @@ udp_bind(queue_t *q, mblk_t *mp) udp_err_ack(q, mp, TPROTO, 0); return; } + if (udp->udp_state != TS_UNBND) { (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, "udp_bind: bad state, %u", udp->udp_state); @@ -673,7 +1217,7 @@ udp_bind(queue_t *q, mblk_t *mp) } if (priv) { - cred_t *cr = DB_CREDDEF(mp, udp->udp_credp); + cred_t *cr = DB_CREDDEF(mp, connp->conn_cred); if (secpolicy_net_privaddr(cr, port) != 0) { udp_err_ack(q, mp, TACCES, 0); @@ -736,7 +1280,7 @@ udp_bind(queue_t *q, mblk_t *mp) loopmax = udp_largest_anon_port - udp_smallest_anon_port + 1; } - zoneid = udp->udp_zoneid; + zoneid = connp->conn_zoneid; for (;;) { udp_t *udp1; boolean_t is_inaddr_any; @@ -753,7 +1297,7 @@ udp_bind(queue_t *q, mblk_t *mp) for (udp1 = udpf->uf_udp; udp1 != NULL; udp1 = udp1->udp_bind_hash) { if (lport != udp1->udp_port || - zoneid != udp1->udp_zoneid) + zoneid != udp1->udp_connp->conn_zoneid) continue; /* @@ -933,7 +1477,39 @@ udp_bind(queue_t *q, mblk_t *mp) mp->b_cont->b_wptr += sizeof (ire_t); mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE; } - putnext(q, mp); + if (udp->udp_family == AF_INET6) + mp = ip_bind_v6(q, mp, connp, NULL); + else + mp = ip_bind_v4(q, mp, connp); + + if (mp != NULL) + udp_rput_other(_RD(q), mp); + else + CONN_INC_REF(connp); +} + + +void +udp_resume_bind(conn_t *connp, mblk_t *mp) +{ + udp_enter(connp, mp, udp_resume_bind_cb, SQTAG_BIND_RETRY); +} + +/* + * This is called from ip_wput_nondata to resume a deferred UDP bind. 
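udp_resume_bind() just above is the canonical shape of the perimeter discipline described earlier: callers hand an mblk and a callback to udp_enter(), and the callback finishes with udp_exit(), exactly as udp_resume_bind_cb() does below. A sketch of that pairing with made-up names (udp_do_work, SQTAG_UDP_SKETCH are illustrative, not part of the patch):

/* Sketch only; every udp_enter() must be balanced by udp_exit() */
static void
udp_do_work(void *arg, mblk_t *mp, void *arg2)
{
	conn_t *connp = arg;

	/* runs as a reader (UDP_MT_HOT) or serialized on the squeue */
	/* ... endpoint work on connp->conn_udp goes here ... */

	udp_exit(connp);
}

static void
udp_dispatch_sketch(conn_t *connp, mblk_t *mp)
{
	/* may run udp_do_work() inline, enqueue it, or hand it to the squeue */
	udp_enter(connp, mp, udp_do_work, SQTAG_UDP_SKETCH);
}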
+ */ +/* ARGSUSED */ +static void +udp_resume_bind_cb(void *arg, mblk_t *mp, void *arg2) +{ + conn_t *connp = arg; + + ASSERT(connp != NULL && IPCL_IS_UDP(connp)); + + udp_rput_other(connp->conn_rq, mp); + + CONN_OPER_PENDING_DONE(connp); + udp_exit(connp); } /* @@ -958,15 +1534,16 @@ udp_connect(queue_t *q, mblk_t *mp) sin6_t *sin6; sin_t *sin; struct T_conn_req *tcr; - udp_t *udp, *udp1; in6_addr_t v6dst; ipaddr_t v4dst; uint16_t dstport; uint32_t flowinfo; mblk_t *mp1, *mp2; udp_fanout_t *udpf; + udp_t *udp, *udp1; + + udp = Q_TO_UDP(q); - udp = (udp_t *)q->q_ptr; tcr = (struct T_conn_req *)mp->b_rptr; /* A bit of sanity checking */ @@ -987,6 +1564,7 @@ udp_connect(queue_t *q, mblk_t *mp) ASSERT(udp->udp_port != 0 && udp->udp_ptpbhn != NULL); udpf = &udp_bind_fanout[UDP_BIND_HASH(udp->udp_port)]; + if (udp->udp_state == TS_DATA_XFER) { /* Already connected - clear out state */ mutex_enter(&udpf->uf_lock); @@ -1185,20 +1763,67 @@ bind_failed: linkb(mp1, mp); linkb(mp1, mp2); - putnext(q, mp1); + if (udp->udp_family == AF_INET) + mp1 = ip_bind_v4(q, mp1, udp->udp_connp); + else + mp1 = ip_bind_v6(q, mp1, udp->udp_connp, NULL); + + if (mp1 != NULL) + udp_rput_other(_RD(q), mp1); + else + CONN_INC_REF(udp->udp_connp); } -/* This is the close routine for udp. It frees the per-stream data. */ static int udp_close(queue_t *q) { - udp_t *udp = (udp_t *)q->q_ptr; + conn_t *connp = Q_TO_CONN(UDP_WR(q)); + udp_t *udp; + queue_t *ip_rq = RD(UDP_WR(q)); - TRACE_1(TR_FAC_UDP, TR_UDP_CLOSE, - "udp_close: q %p", q); + ASSERT(connp != NULL && IPCL_IS_UDP(connp)); + udp = connp->conn_udp; + + ip_quiesce_conn(connp); + /* + * Disable read-side synchronous stream + * interface and drain any queued data. + */ + udp_rcv_drain(q, udp, B_TRUE); + ASSERT(!udp->udp_direct_sockfs); qprocsoff(q); + /* restore IP module's high and low water marks to default values */ + ip_rq->q_hiwat = ip_rq->q_qinfo->qi_minfo->mi_hiwat; + WR(ip_rq)->q_hiwat = WR(ip_rq)->q_qinfo->qi_minfo->mi_hiwat; + WR(ip_rq)->q_lowat = WR(ip_rq)->q_qinfo->qi_minfo->mi_lowat; + + ASSERT(udp->udp_rcv_cnt == 0); + ASSERT(udp->udp_rcv_msgcnt == 0); + ASSERT(udp->udp_rcv_list_head == NULL); + ASSERT(udp->udp_rcv_list_tail == NULL); + + /* connp is now single threaded. */ + udp_close_free(connp); + /* + * Restore connp as an IP endpoint. We don't need + * any locks since we are now single threaded + */ + connp->conn_flags &= ~IPCL_UDP; + connp->conn_state_flags &= + ~(CONN_CLOSING | CONN_CONDEMNED | CONN_QUIESCED); + return (0); +} + +/* + * Called in the close path from IP (ip_quiesce_conn) to quiesce the conn + */ +void +udp_quiesce_conn(conn_t *connp) +{ + udp_t *udp = connp->conn_udp; + if (cl_inet_unbind != NULL && udp->udp_state == TS_IDLE) { /* * Running in cluster mode - register unbind information @@ -1215,16 +1840,30 @@ udp_close(queue_t *q) } udp_bind_hash_remove(udp, B_FALSE); - mutex_enter(&udp_g_lock); - /* Unlink the udp structure and release the minor device number. */ - mi_close_unlink(&udp_g_head, (IDP)udp); - mutex_exit(&udp_g_lock); + + mutex_enter(&connp->conn_lock); + while (udp->udp_reader_count != 0 || udp->udp_squeue_count != 0 || + udp->udp_mode != UDP_MT_HOT) { + cv_wait(&connp->conn_cv, &connp->conn_lock); + } + mutex_exit(&connp->conn_lock); +} + +void +udp_close_free(conn_t *connp) +{ + udp_t *udp = connp->conn_udp; + /* If there are any options associated with the stream, free them. 
*/ - if (udp->udp_ip_snd_options) + if (udp->udp_ip_snd_options) { mi_free((char *)udp->udp_ip_snd_options); + udp->udp_ip_snd_options = NULL; + } - if (udp->udp_ip_rcv_options) + if (udp->udp_ip_rcv_options) { mi_free((char *)udp->udp_ip_rcv_options); + udp->udp_ip_rcv_options = NULL; + } /* Free memory associated with sticky options */ if (udp->udp_sticky_hdrs_len != 0) { @@ -1233,30 +1872,33 @@ udp_close(queue_t *q) udp->udp_sticky_hdrs = NULL; udp->udp_sticky_hdrs_len = 0; } + if (udp->udp_sticky_ipp.ipp_fields & IPPF_HOPOPTS) { kmem_free(udp->udp_sticky_ipp.ipp_hopopts, udp->udp_sticky_ipp.ipp_hopoptslen); + udp->udp_sticky_ipp.ipp_hopopts = NULL; } if (udp->udp_sticky_ipp.ipp_fields & IPPF_RTDSTOPTS) { kmem_free(udp->udp_sticky_ipp.ipp_rtdstopts, udp->udp_sticky_ipp.ipp_rtdstoptslen); + udp->udp_sticky_ipp.ipp_rtdstopts = NULL; } if (udp->udp_sticky_ipp.ipp_fields & IPPF_RTHDR) { kmem_free(udp->udp_sticky_ipp.ipp_rthdr, udp->udp_sticky_ipp.ipp_rthdrlen); + udp->udp_sticky_ipp.ipp_rthdr = NULL; } if (udp->udp_sticky_ipp.ipp_fields & IPPF_DSTOPTS) { kmem_free(udp->udp_sticky_ipp.ipp_dstopts, udp->udp_sticky_ipp.ipp_dstoptslen); + udp->udp_sticky_ipp.ipp_dstopts = NULL; } udp->udp_sticky_ipp.ipp_fields &= ~(IPPF_HOPOPTS|IPPF_RTDSTOPTS|IPPF_RTHDR|IPPF_DSTOPTS); - crfree(udp->udp_credp); - /* Free the data structure */ - mi_close_free((IDP)udp); - q->q_ptr = WR(q)->q_ptr = NULL; - return (0); + udp->udp_connp = NULL; + connp->conn_udp = NULL; + kmem_cache_free(udp_cache, udp); } /* @@ -1277,12 +1919,10 @@ udp_close(queue_t *q) static void udp_disconnect(queue_t *q, mblk_t *mp) { - udp_t *udp; + udp_t *udp = Q_TO_UDP(q); mblk_t *mp1; udp_fanout_t *udpf; - udp = (udp_t *)q->q_ptr; - if (udp->udp_state != TS_DATA_XFER) { (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, "udp_disconnect: bad state, %u", udp->udp_state); @@ -1331,7 +1971,16 @@ udp_disconnect(queue_t *q, mblk_t *mp) /* Append the T_OK_ACK to the T_BIND_REQ for udp_rput */ linkb(mp1, mp); - putnext(q, mp1); + + if (udp->udp_family == AF_INET6) + mp1 = ip_bind_v6(q, mp1, udp->udp_connp, NULL); + else + mp1 = ip_bind_v4(q, mp1, udp->udp_connp); + + if (mp1 != NULL) + udp_rput_other(_RD(q), mp1); + else + CONN_INC_REF(udp->udp_connp); } /* This routine creates a T_ERROR_ACK message and passes it upstream. */ @@ -1339,7 +1988,7 @@ static void udp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error) { if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL) - qreply(q, mp); + putnext(UDP_RD(q), mp); } /* Shorthand to generate and send TPI error acks to our client */ @@ -1355,7 +2004,7 @@ udp_err_ack_prim(queue_t *q, mblk_t *mp, int primitive, t_scalar_t t_error, teackp->ERROR_prim = primitive; teackp->TLI_error = t_error; teackp->UNIX_error = sys_error; - qreply(q, mp); + putnext(UDP_RD(q), mp); } } @@ -1372,10 +2021,6 @@ udp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) return (0); } -/* - * Hold udp_g_lock to prevent multiple threads from changing udp_g_epriv_ports - * at the same time. 
- */ /* ARGSUSED */ static int udp_extra_priv_ports_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp, @@ -1393,11 +2038,9 @@ udp_extra_priv_ports_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp, return (EINVAL); } - mutex_enter(&udp_g_lock); /* Check if the value is already in the list */ for (i = 0; i < udp_g_num_epriv_ports; i++) { if (new_value == udp_g_epriv_ports[i]) { - mutex_exit(&udp_g_lock); return (EEXIST); } } @@ -1407,20 +2050,14 @@ udp_extra_priv_ports_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp, break; } if (i == udp_g_num_epriv_ports) { - mutex_exit(&udp_g_lock); return (EOVERFLOW); } /* Set the new value */ udp_g_epriv_ports[i] = (in_port_t)new_value; - mutex_exit(&udp_g_lock); return (0); } -/* - * Hold udp_g_lock to prevent multiple threads from changing udp_g_epriv_ports - * at the same time. - */ /* ARGSUSED */ static int udp_extra_priv_ports_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp, @@ -1438,20 +2075,17 @@ udp_extra_priv_ports_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp, return (EINVAL); } - mutex_enter(&udp_g_lock); /* Check that the value is already in the list */ for (i = 0; i < udp_g_num_epriv_ports; i++) { if (udp_g_epriv_ports[i] == new_value) break; } if (i == udp_g_num_epriv_ports) { - mutex_exit(&udp_g_lock); return (ESRCH); } /* Clear the value */ udp_g_epriv_ports[i] = 0; - mutex_exit(&udp_g_lock); return (0); } @@ -1478,8 +2112,8 @@ udp_icmp_error(queue_t *q, mblk_t *mp) sin6_t sin6; mblk_t *mp1; int error = 0; - udp_t *udp = (udp_t *)q->q_ptr; size_t mp_size = MBLKL(mp); + udp_t *udp = Q_TO_UDP(q); /* * Assume IP provides aligned packets - otherwise toss @@ -1495,7 +2129,7 @@ udp_icmp_error(queue_t *q, mblk_t *mp) */ if (!udp->udp_dgram_errind || mp_size < sizeof (ipha_t)) { noticmpv4: - putnext(q, mp); + putnext(UDP_RD(q), mp); return; } @@ -1590,7 +2224,7 @@ noticmpv4: break; } if (mp1) - putnext(q, mp1); + putnext(UDP_RD(q), mp1); freemsg(mp); } @@ -1609,7 +2243,6 @@ noticmpv4: static void udp_icmp_error_ipv6(queue_t *q, mblk_t *mp) { - udp_t *udp = (udp_t *)q->q_ptr; icmp6_t *icmp6; ip6_t *ip6h, *outer_ip6h; uint16_t hdr_length; @@ -1619,13 +2252,14 @@ udp_icmp_error_ipv6(queue_t *q, mblk_t *mp) mblk_t *mp1; int error = 0; size_t mp_size = MBLKL(mp); + udp_t *udp = Q_TO_UDP(q); /* * Verify that we have a complete IP header. If not, send it upstream. */ if (mp_size < sizeof (ip6_t)) { noticmpv6: - putnext(q, mp); + putnext(UDP_RD(q), mp); return; } @@ -1736,7 +2370,7 @@ noticmpv6: * message. Free it, then send our empty message. 
*/ freemsg(mp); - putnext(q, newmp); + putnext(UDP_RD(q), newmp); return; } case ICMP6_TIME_EXCEEDED: @@ -1766,7 +2400,7 @@ noticmpv6: mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t), NULL, 0, error); if (mp1) - putnext(q, mp1); + putnext(UDP_RD(q), mp1); freemsg(mp); } @@ -1780,11 +2414,11 @@ noticmpv6: static void udp_addr_req(queue_t *q, mblk_t *mp) { - udp_t *udp = (udp_t *)q->q_ptr; sin_t *sin; sin6_t *sin6; mblk_t *ackmp; struct T_addr_ack *taa; + udp_t *udp = Q_TO_UDP(q); /* Make it large enough for worst case */ ackmp = reallocb(mp, sizeof (struct T_addr_ack) + @@ -1894,7 +2528,7 @@ udp_addr_req(queue_t *q, mblk_t *mp) } } ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim); - qreply(q, ackmp); + putnext(UDP_RD(q), ackmp); } static void @@ -1918,9 +2552,9 @@ udp_copy_info(struct T_info_ack *tap, udp_t *udp) static void udp_capability_req(queue_t *q, mblk_t *mp) { - udp_t *udp = (udp_t *)q->q_ptr; t_uscalar_t cap_bits1; struct T_capability_ack *tcap; + udp_t *udp = Q_TO_UDP(q); cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1; @@ -1937,7 +2571,7 @@ udp_capability_req(queue_t *q, mblk_t *mp) tcap->CAP_bits1 |= TC1_INFO; } - qreply(q, mp); + putnext(UDP_RD(q), mp); } /* @@ -1948,7 +2582,7 @@ udp_capability_req(queue_t *q, mblk_t *mp) static void udp_info_req(queue_t *q, mblk_t *mp) { - udp_t *udp = (udp_t *)q->q_ptr; + udp_t *udp = Q_TO_UDP(q); /* Create a T_INFO_ACK message. */ mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO, @@ -1956,7 +2590,7 @@ udp_info_req(queue_t *q, mblk_t *mp) if (!mp) return; udp_copy_info((struct T_info_ack *)mp->b_rptr, udp); - qreply(q, mp); + putnext(UDP_RD(q), mp); } /* @@ -2102,20 +2736,19 @@ udp_ip_bind_mp(udp_t *udp, t_scalar_t bind_prim, t_scalar_t addr_length) * This is the open routine for udp. It allocates a udp_t structure for * the stream and, on the first open of the module, creates an ND table. */ +/* ARGSUSED */ static int udp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) { int err; udp_t *udp; + conn_t *connp; + zoneid_t zoneid = getzoneid(); + queue_t *ip_wq; + char *name; TRACE_1(TR_FAC_UDP, TR_UDP_OPEN, "udp_open: q %p", q); - /* - * Defer the qprocson until everything is initialized since - * we are D_MTPERQ and after qprocson the rput routine can - * run. - */ - /* If the stream is already open, return immediately. */ if (q->q_ptr != NULL) return (0); @@ -2124,85 +2757,110 @@ udp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) if (sflag != MODOPEN) return (EINVAL); + q->q_hiwat = udp_recv_hiwat; + WR(q)->q_hiwat = udp_xmit_hiwat; + WR(q)->q_lowat = udp_xmit_lowat; + + /* Insert ourselves in the stream since we're about to walk q_next */ + qprocson(q); + + udp = kmem_cache_alloc(udp_cache, KM_SLEEP); + bzero(udp, sizeof (*udp)); + /* - * Create and initialize a udp_t structure for this stream. + * UDP is supported only as a module and it has to be pushed directly + * above the device instance of IP. If UDP is pushed anywhere else + * on a stream, it will support just T_SVR4_OPTMGMT_REQ for the + * sake of MIB browsers and fail everything else. 
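For context, the plumbing geometry that udp_open() validates here is what a caller builds from userland: open the IP device and push the udp module directly on top, the "Normal" case from the plumbing notes. A userland sketch under that assumption (illustrative only; sockfs performs the equivalent when a udp socket is opened):

#include <fcntl.h>
#include <stropts.h>
#include <unistd.h>

/* Open /dev/ip and push udp atop it; returns the stream fd or -1 */
int
open_udp_stream(void)
{
	int fd = open("/dev/ip", O_RDWR);

	if (fd == -1)
		return (-1);
	if (ioctl(fd, I_PUSH, "udp") == -1) {
		(void) close(fd);
		return (-1);
	}
	return (fd);
}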
*/ - udp = (udp_t *)mi_open_alloc_sleep(sizeof (udp_t)); + ip_wq = WR(q)->q_next; + if (ip_wq->q_next != NULL || + (name = ip_wq->q_qinfo->qi_minfo->mi_idname) == NULL || + strcmp(name, IP_MOD_NAME) != 0 || + ip_wq->q_qinfo->qi_minfo->mi_idnum != IP_MOD_ID) { + /* Support just SNMP for MIB browsers */ + connp = ipcl_conn_create(IPCL_IPCCONN, KM_SLEEP); + connp->conn_rq = q; + connp->conn_wq = WR(q); + connp->conn_flags |= IPCL_UDPMOD; + connp->conn_cred = credp; + connp->conn_zoneid = zoneid; + connp->conn_udp = udp; + udp->udp_connp = connp; + q->q_ptr = WR(q)->q_ptr = connp; + crhold(credp); + q->q_qinfo = &udp_snmp_rinit; + WR(q)->q_qinfo = &udp_snmp_winit; + return (0); + } + + /* + * Initialize the udp_t structure for this stream. + */ + q = RD(ip_wq); + connp = Q_TO_CONN(q); + mutex_enter(&connp->conn_lock); + connp->conn_proto = IPPROTO_UDP; + connp->conn_flags |= IPCL_UDP; + connp->conn_sqp = IP_SQUEUE_GET(lbolt); + connp->conn_udp = udp; /* Set the initial state of the stream and the privilege status. */ - q->q_ptr = WR(q)->q_ptr = udp; + udp->udp_connp = connp; udp->udp_state = TS_UNBND; + udp->udp_mode = UDP_MT_HOT; if (getmajor(*devp) == (major_t)UDP6_MAJ) { udp->udp_family = AF_INET6; udp->udp_ipversion = IPV6_VERSION; udp->udp_max_hdr_len = IPV6_HDR_LEN + UDPH_SIZE; udp->udp_ttl = udp_ipv6_hoplimit; + connp->conn_af_isv6 = B_TRUE; + connp->conn_flags |= IPCL_ISV6; } else { udp->udp_family = AF_INET; udp->udp_ipversion = IPV4_VERSION; udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + UDPH_SIZE; udp->udp_ttl = udp_ipv4_ttl; + connp->conn_af_isv6 = B_FALSE; + connp->conn_flags &= ~IPCL_ISV6; } - /* - * The receive hiwat is only looked at on the stream head queue. - * Store in q_hiwat in order to return on SO_RCVBUF getsockopts. - */ - q->q_hiwat = udp_recv_hiwat; - udp->udp_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; - udp->udp_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; - udp->udp_credp = credp; - crhold(credp); - - udp->udp_zoneid = getzoneid(); - - /* - * Acquire the lock and link it into the list of open streams. - */ - mutex_enter(&udp_g_lock); - err = mi_open_link(&udp_g_head, (IDP)udp, devp, flag, sflag, credp); - mutex_exit(&udp_g_lock); - if (err != 0) - goto error; + connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; + connp->conn_zoneid = zoneid; - qprocson(q); + if (connp->conn_flags & IPCL_SOCKET) { + udp->udp_issocket = B_TRUE; + udp->udp_direct_sockfs = B_TRUE; + } + mutex_exit(&connp->conn_lock); /* * The transmit hiwat/lowat is only looked at on IP's queue. - * Store in q_hiwat in order to return on SO_SNDBUF + * Store in q_hiwat in order to return on SO_SNDBUF/SO_RCVBUF * getsockopts. */ + q->q_hiwat = udp_recv_hiwat; WR(q)->q_hiwat = udp_xmit_hiwat; - WR(q)->q_next->q_hiwat = WR(q)->q_hiwat; WR(q)->q_lowat = udp_xmit_lowat; - WR(q)->q_next->q_lowat = WR(q)->q_lowat; if (udp->udp_family == AF_INET6) { /* Build initial header template for transmit */ if ((err = udp_build_hdrs(q, udp)) != 0) { - qprocsoff(q); - /* - * Unlink the udp structure and release - * the minor device number. - */ - mutex_enter(&udp_g_lock); - mi_close_unlink(&udp_g_head, (IDP)udp); - mutex_exit(&udp_g_lock); - goto error; + qprocsoff(UDP_RD(q)); + udp->udp_connp = NULL; + connp->conn_udp = NULL; + kmem_cache_free(udp_cache, udp); + return (err); } } - /* Set the Stream head write offset. */ - (void) mi_set_sth_wroff(q, udp->udp_max_hdr_len + udp_wroff_extra); - (void) mi_set_sth_hiwat(q, q->q_hiwat); - return (0); + /* Set the Stream head write offset and high watermark. 
*/ + (void) mi_set_sth_wroff(UDP_RD(q), + udp->udp_max_hdr_len + udp_wroff_extra); + (void) mi_set_sth_hiwat(UDP_RD(q), udp_set_rcv_hiwat(udp, q->q_hiwat)); -error: - q->q_ptr = WR(q)->q_ptr = NULL; - crfree(credp); - mi_close_free((IDP)udp); - return (err); + return (0); } /* @@ -2212,7 +2870,6 @@ error: static boolean_t udp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name) { - return (B_TRUE); } @@ -2255,15 +2912,22 @@ udp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) } /* - * This routine retrieves the current status of socket options. - * It returns the size of the option retrieved. + * This routine retrieves the current status of socket options + * and expects the caller to pass in the queue pointer of the + * upper instance. It returns the size of the option retrieved. */ int udp_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) { int *i1 = (int *)ptr; - udp_t *udp = (udp_t *)q->q_ptr; - ip6_pkt_t *ipp = &udp->udp_sticky_ipp; + conn_t *connp; + udp_t *udp; + ip6_pkt_t *ipp; + + q = UDP_WR(q); + connp = Q_TO_CONN(q); + udp = connp->conn_udp; + ipp = &udp->udp_sticky_ipp; switch (level) { case SOL_SOCKET: @@ -2333,7 +2997,7 @@ udp_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) *(uchar_t *)ptr = udp->udp_multicast_ttl; return (sizeof (uchar_t)); case IP_MULTICAST_LOOP: - *ptr = udp->udp_multicast_loop; + *ptr = connp->conn_multicast_loop; return (sizeof (uint8_t)); case IP_RECVOPTS: *i1 = udp->udp_recvopts; @@ -2394,7 +3058,7 @@ udp_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) *i1 = udp->udp_multicast_ttl; break; /* goto sizeof (int) option return */ case IPV6_MULTICAST_LOOP: - *i1 = udp->udp_multicast_loop; + *i1 = connp->conn_multicast_loop; break; /* goto sizeof (int) option return */ case IPV6_JOIN_GROUP: case IPV6_LEAVE_GROUP: @@ -2520,18 +3184,26 @@ udp_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) return (sizeof (int)); } -/* This routine sets socket options. */ +/* + * This routine sets socket options; it expects the caller + * to pass in the queue pointer of the upper instance. + */ /* ARGSUSED */ int udp_opt_set(queue_t *q, uint_t optset_context, int level, int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk) { - udp_t *udp = (udp_t *)q->q_ptr; int *i1 = (int *)invalp; boolean_t onoff = (*i1 == 0) ? 
0 : 1; boolean_t checkonly; int error; + conn_t *connp; + udp_t *udp; + + q = UDP_WR(q); + connp = Q_TO_CONN(q); + udp = connp->conn_udp; switch (optset_context) { case SETFN_OPTCOM_CHECKONLY: @@ -2619,7 +3291,7 @@ udp_opt_set(queue_t *q, uint_t optset_context, int level, } if (!checkonly) { q->q_hiwat = *i1; - q->q_next->q_hiwat = *i1; + WR(UDP_RD(q))->q_hiwat = *i1; } break; case SO_RCVBUF: @@ -2629,7 +3301,9 @@ udp_opt_set(queue_t *q, uint_t optset_context, int level, } if (!checkonly) { RD(q)->q_hiwat = *i1; - (void) mi_set_sth_hiwat(RD(q), *i1); + UDP_RD(q)->q_hiwat = *i1; + (void) mi_set_sth_hiwat(UDP_RD(q), + udp_set_rcv_hiwat(udp, *i1)); } break; case SO_DGRAM_ERRIND: @@ -2709,7 +3383,7 @@ udp_opt_set(queue_t *q, uint_t optset_context, int level, break; case IP_MULTICAST_LOOP: if (!checkonly) - udp->udp_multicast_loop = *invalp; + connp->conn_multicast_loop = *invalp; break; case IP_RECVOPTS: if (!checkonly) @@ -2847,7 +3521,7 @@ udp_opt_set(queue_t *q, uint_t optset_context, int level, return (EINVAL); } if (!checkonly) - udp->udp_multicast_loop = *i1; + connp->conn_multicast_loop = *i1; break; case IPV6_JOIN_GROUP: case IPV6_LEAVE_GROUP: @@ -3093,6 +3767,7 @@ udp_opt_set(queue_t *q, uint_t optset_context, int level, ipp->ipp_rtdstopts = NULL; ipp->ipp_rtdstoptslen = 0; } + ipp->ipp_fields &= ~IPPF_RTDSTOPTS; ipp->ipp_sticky_ignored |= IPPF_RTDSTOPTS; } else { @@ -3447,12 +4122,13 @@ udp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr) } static void -udp_rput(queue_t *q, mblk_t *mp) +udp_input(conn_t *connp, mblk_t *mp) { struct T_unitdata_ind *tudi; - uchar_t *rptr; - int hdr_length; + uchar_t *rptr; /* Pointer to IP header */ + int hdr_length; /* Length of IP+UDP headers */ int udi_size; /* Size of T_unitdata_ind */ + int mp_len; udp_t *udp; udpha_t *udpha; int ipversion; @@ -3462,104 +4138,56 @@ udp_rput(queue_t *q, mblk_t *mp) mblk_t *mp1; mblk_t *options_mp = NULL; in_pktinfo_t *pinfo = NULL; - size_t mp_size = MBLKL(mp); cred_t *cr = NULL; + queue_t *q = connp->conn_rq; pid_t cpid; TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_START, "udp_rput_start: q %p mp %p", q, mp); - udp = (udp_t *)q->q_ptr; + udp = connp->conn_udp; rptr = mp->b_rptr; + ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_CTL); + ASSERT(OK_32PTR(rptr)); - switch (mp->b_datap->db_type) { - case M_DATA: - /* - * M_DATA messages contain IP datagrams. They are handled - * after this switch. - */ - break; - case M_PROTO: - case M_PCPROTO: - /* M_PROTO messages contain some type of TPI message. 
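udp_opt_set() above is invoked in two passes by the option framework: once with checkonly set (optset_context == SETFN_OPTCOM_CHECKONLY) to validate a value, and once to commit it. A reduced sketch of that pattern using SO_SNDBUF; udp_max_buf is the ndd limit used by the surrounding code but not visible in this hunk, so treat it as an assumption:

    int
    udp_so_sndbuf_sketch(queue_t *q, uint_t optset_context, int *i1)
    {
            boolean_t checkonly = (optset_context == SETFN_OPTCOM_CHECKONLY);

            if (*i1 > udp_max_buf)
                    return (ENOBUFS);       /* rejected in either pass */
            if (!checkonly) {
                    /* commit pass only: apply the new high-water mark */
                    q->q_hiwat = *i1;
                    WR(UDP_RD(q))->q_hiwat = *i1;
            }
            return (0);
    }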
*/ - if ((mp->b_wptr - rptr) < sizeof (t_scalar_t)) { - freemsg(mp); - TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END, - "udp_rput_end: q %p (%S)", q, "protoshort"); - return; - } - qwriter(q, mp, udp_rput_other, PERIM_INNER); - TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END, - "udp_rput_end: q %p (%S)", q, "proto"); - return; - case M_FLUSH: - if (*mp->b_rptr & FLUSHR) - flushq(q, FLUSHDATA); - putnext(q, mp); - TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END, - "udp_rput_end: q %p (%S)", q, "flush"); - return; - case M_CTL: - if (udp->udp_recvif || udp->udp_recvslla || - udp->udp_ipv6_recvpktinfo) { + /* + * IP should have prepended the options data in an M_CTL + * Check M_CTL "type" to make sure are not here bcos of + * a valid ICMP message + */ + if (DB_TYPE(mp) == M_CTL) { + if (MBLKL(mp) == sizeof (in_pktinfo_t) && + ((in_pktinfo_t *)mp->b_rptr)->in_pkt_ulp_type == + IN_PKTINFO) { /* - * IP should have prepended the options data in an M_CTL - * Check M_CTL "type" to make sure are not here bcos of - * a valid ICMP message + * IP_RECVIF or IP_RECVSLLA information has been + * appended to the packet by IP. We need to + * extract the mblk and adjust the rptr */ - if (mp_size == sizeof (in_pktinfo_t) && - ((in_pktinfo_t *)mp->b_rptr)->in_pkt_ulp_type == - IN_PKTINFO) { - pinfo = (in_pktinfo_t *)mp->b_rptr; - /* - * Jump to normal data processing, this is not - * an ICMP message - */ - break; - } + pinfo = (in_pktinfo_t *)mp->b_rptr; + options_mp = mp; + mp = mp->b_cont; + rptr = mp->b_rptr; + UDP_STAT(udp_in_pktinfo); + } else { + /* + * ICMP messages. + */ + udp_icmp_error(q, mp); + TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END, + "udp_rput_end: q %p (%S)", q, "m_ctl"); + return; } - /* - * ICMP messages. - */ - udp_icmp_error(q, mp); - TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END, - "udp_rput_end: q %p (%S)", q, "m_ctl"); - return; - default: - putnext(q, mp); - TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END, - "udp_rput_end: q %p (%S)", q, "default"); - return; } - /* - * If we are here bcos the IP_RECVIF or IP_RECVSLLA then we need to - * extract the mblk and adjust the rptr - */ - if (pinfo != NULL) { - ASSERT(mp->b_datap->db_type == M_CTL); - options_mp = mp; - mp = mp->b_cont; - rptr = mp->b_rptr; - mp_size = MBLKL(mp); - } + mp_len = msgdsize(mp); /* * This is the inbound data path. * First, we check to make sure the IP version number is correct, * and then pull the IP and UDP headers into the first mblk. - */ - /* * Assume IP provides aligned packets - otherwise toss. * Also, check if we have a complete IP header. */ - if (!OK_32PTR(rptr) || (mp_size < sizeof (ipha_t))) { -tossit: - freemsg(mp); - if (options_mp != NULL) - freeb(options_mp); - BUMP_MIB(&udp_mib, udpInErrors); - return; - } /* Initialize regardless if ipversion is IPv4 or IPv6 */ ipp.ipp_fields = 0; @@ -3567,10 +4195,9 @@ tossit: ipversion = IPH_HDR_VERSION(rptr); switch (ipversion) { case IPV4_VERSION: + ASSERT(MBLKL(mp) >= sizeof (ipha_t)); + ASSERT(((ipha_t *)rptr)->ipha_protocol == IPPROTO_UDP); hdr_length = IPH_HDR_LENGTH(rptr) + UDPH_SIZE; - /* Verify this is a UDP packet */ - if (((ipha_t *)rptr)->ipha_protocol != IPPROTO_UDP) - goto tossit; if ((hdr_length > IP_SIMPLE_HDR_LENGTH + UDPH_SIZE) || (udp->udp_ip_rcv_options_len)) { /* @@ -3587,7 +4214,7 @@ tossit: * the packet. 
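The M_CTL disambiguation above is worth spelling out: IP prepends exactly one in_pktinfo_t when IP_RECVIF/IP_RECVSLLA-style information is attached, while an inbound ICMP error arrives as an M_CTL of a different shape, so block length plus the ulp-type tag is enough to tell them apart. As a predicate (types from <inet/ip.h>, as used here):

    static boolean_t
    udp_mctl_is_pktinfo(mblk_t *mp)
    {
            /* ancillary-data M_CTL: one in_pktinfo_t, tagged IN_PKTINFO */
            return (MBLKL(mp) == sizeof (in_pktinfo_t) &&
                ((in_pktinfo_t *)mp->b_rptr)->in_pkt_ulp_type == IN_PKTINFO);
    }

When the predicate holds, the payload follows in b_cont and the options block is kept aside in options_mp; otherwise the whole message goes to udp_icmp_error().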
*/ udpha = (udpha_t *)(rptr + (hdr_length - UDPH_SIZE)); - if (msgdsize(mp) != (ntohs(udpha->uha_length) + + if (mp_len != (ntohs(udpha->uha_length) + hdr_length - UDPH_SIZE)) { goto tossit; } @@ -3597,14 +4224,16 @@ tossit: */ if (pinfo != NULL) mp = options_mp; - qwriter(q, mp, udp_rput_other, PERIM_INNER); + udp_become_writer(connp, mp, udp_rput_other_wrapper, + SQTAG_UDP_INPUT); TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END, "udp_rput_end: q %p (%S)", q, "end"); return; } /* Handle IPV6_RECVHOPLIMIT. */ - if ((udp->udp_family == AF_INET6) && (pinfo != NULL)) { + if ((udp->udp_family == AF_INET6) && (pinfo != NULL) && + udp->udp_ipv6_recvpktinfo) { if (pinfo->in_pkt_flags & IPF_RECVIF) { ipp.ipp_fields |= IPPF_IFINDEX; ipp.ipp_ifindex = pinfo->in_pkt_ifindex; @@ -3620,8 +4249,7 @@ tossit: ASSERT(udp->udp_family == AF_INET6); ip6h = (ip6_t *)rptr; - if ((uchar_t *)&ip6h[1] > mp->b_wptr) - goto tossit; + ASSERT((uchar_t *)&ip6h[1] <= mp->b_wptr); if (ip6h->ip6_nxt != IPPROTO_UDP) { uint8_t nexthdrp; @@ -3647,6 +4275,7 @@ tossit: if (MBLKL(mp) < (IPV6_HDR_LEN + UDPH_SIZE)) goto tossit; ip6h = (ip6_t *)rptr; + mp_len = msgdsize(mp); } /* * Find any potentially interesting extension headers @@ -3655,18 +4284,14 @@ tossit: */ hdr_length = ip_find_hdr_v6(mp, ip6h, &ipp, &nexthdrp) + UDPH_SIZE; - /* Verify this is a UDP packet */ - if (nexthdrp != IPPROTO_UDP) - goto tossit; + ASSERT(nexthdrp == IPPROTO_UDP); } else { hdr_length = IPV6_HDR_LEN + UDPH_SIZE; ip6i = NULL; } break; default: - TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END, - "udp_rput_end: q %p (%S)", q, "Unknown IP version"); - goto tossit; + ASSERT(0); } /* @@ -3677,14 +4302,15 @@ tossit: */ udpha = (udpha_t *)(rptr + (hdr_length - UDPH_SIZE)); if ((MBLKL(mp) < hdr_length) || - (msgdsize(mp) != (ntohs(udpha->uha_length) + - hdr_length - UDPH_SIZE))) { + (mp_len != (ntohs(udpha->uha_length) + hdr_length - UDPH_SIZE))) { goto tossit; } /* Walk past the headers. */ - if (!udp->udp_rcvhdr) + if (!udp->udp_rcvhdr) { mp->b_rptr = rptr + hdr_length; + mp_len -= hdr_length; + } /* * This is the inbound data path. 
Packets are passed upstream as @@ -3706,6 +4332,7 @@ tossit: if (udp->udp_recvdstaddr) { udi_size += sizeof (struct T_opthdr) + sizeof (struct in_addr); + UDP_STAT(udp_in_recvdstaddr); } /* @@ -3714,25 +4341,28 @@ tossit: */ if (udp->udp_recvif && (pinfo != NULL) && (pinfo->in_pkt_flags & IPF_RECVIF)) { - udi_size += sizeof (struct T_opthdr) + - sizeof (uint_t); + udi_size += sizeof (struct T_opthdr) + sizeof (uint_t); + UDP_STAT(udp_in_recvif); } if (udp->udp_recvslla && (pinfo != NULL) && (pinfo->in_pkt_flags & IPF_RECVSLLA)) { udi_size += sizeof (struct T_opthdr) + - sizeof (struct sockaddr_dl); + sizeof (struct sockaddr_dl); + UDP_STAT(udp_in_recvslla); } if (udp->udp_recvucred && (cr = DB_CRED(mp)) != NULL) { udi_size += sizeof (struct T_opthdr) + ucredsize; cpid = DB_CPID(mp); + UDP_STAT(udp_in_recvucred); } /* * If IP_RECVTTL is set allocate the appropriate sized buffer */ if (udp->udp_recvttl) { udi_size += sizeof (struct T_opthdr) + sizeof (uint8_t); + UDP_STAT(udp_in_recvttl); } ASSERT(IPH_HDR_LENGTH((ipha_t *)rptr) == IP_SIMPLE_HDR_LENGTH); @@ -3889,12 +4519,14 @@ tossit: (ipp.ipp_fields & IPPF_HOPOPTS)) { udi_size += sizeof (struct T_opthdr) + ipp.ipp_hopoptslen; + UDP_STAT(udp_in_recvhopopts); } if ((udp->udp_ipv6_recvdstopts || udp->udp_old_ipv6_recvdstopts) && (ipp.ipp_fields & IPPF_DSTOPTS)) { udi_size += sizeof (struct T_opthdr) + ipp.ipp_dstoptslen; + UDP_STAT(udp_in_recvdstopts); } if (((udp->udp_ipv6_recvdstopts && udp->udp_ipv6_recvrthdr && @@ -3903,29 +4535,37 @@ tossit: (ipp.ipp_fields & IPPF_RTDSTOPTS)) { udi_size += sizeof (struct T_opthdr) + ipp.ipp_rtdstoptslen; + UDP_STAT(udp_in_recvrtdstopts); } if (udp->udp_ipv6_recvrthdr && (ipp.ipp_fields & IPPF_RTHDR)) { udi_size += sizeof (struct T_opthdr) + ipp.ipp_rthdrlen; + UDP_STAT(udp_in_recvrthdr); } if (udp->udp_ipv6_recvpktinfo && (ipp.ipp_fields & IPPF_IFINDEX)) { udi_size += sizeof (struct T_opthdr) + sizeof (struct in6_pktinfo); + UDP_STAT(udp_in_recvpktinfo); } } if (udp->udp_recvucred && (cr = DB_CRED(mp)) != NULL) { udi_size += sizeof (struct T_opthdr) + ucredsize; cpid = DB_CPID(mp); + UDP_STAT(udp_in_recvucred); } - if (udp->udp_ipv6_recvhoplimit) + if (udp->udp_ipv6_recvhoplimit) { udi_size += sizeof (struct T_opthdr) + sizeof (int); + UDP_STAT(udp_in_recvhoplimit); + } - if (udp->udp_ipv6_recvtclass) + if (udp->udp_ipv6_recvtclass) { udi_size += sizeof (struct T_opthdr) + sizeof (int); + UDP_STAT(udp_in_recvtclass); + } mp1 = allocb(udi_size, BPRI_MED); if (mp1 == NULL) { @@ -3960,7 +4600,7 @@ tossit: sin6->sin6_flowinfo = 0; sin6->sin6_scope_id = 0; sin6->__sin6_src_id = ip_srcid_find_addr(&v6dst, - udp->udp_zoneid); + connp->conn_zoneid); } else { sin6->sin6_addr = ip6h->ip6_src; /* No sin6_flowinfo per API */ @@ -3971,8 +4611,8 @@ tossit: sin6->sin6_scope_id = ipp.ipp_ifindex; else sin6->sin6_scope_id = 0; - sin6->__sin6_src_id = - ip_srcid_find_addr(&ip6h->ip6_dst, udp->udp_zoneid); + sin6->__sin6_src_id = ip_srcid_find_addr( + &ip6h->ip6_dst, connp->conn_zoneid); } sin6->sin6_port = udpha->uha_src_port; sin6->sin6_family = udp->udp_family; @@ -4133,7 +4773,45 @@ tossit: "udp_rput_end: q %p (%S)", q, "end"); if (options_mp != NULL) freeb(options_mp); - putnext(q, mp); + + if (udp->udp_direct_sockfs) { + /* + * There is nothing above us except for the stream head; + * use the read-side synchronous stream interface in + * order to reduce the time spent in interrupt thread. 
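One pattern above deserves a standalone illustration: every enabled receive option adds one struct T_opthdr plus its payload to udi_size, so the T_UNITDATA_IND, the source address, and all options can be carved out of a single allocb(udi_size, BPRI_MED) block. The IPv4 arithmetic in isolation, a sketch assuming Solaris TPI headers; the two flags stand in for the udp_recv* fields:

    #include <sys/types.h>
    #include <sys/tihdr.h>          /* struct T_opthdr, T_unitdata_ind */
    #include <netinet/in.h>         /* struct sockaddr_in, struct in_addr */

    size_t
    udi_size_v4_sketch(boolean_t recvdstaddr, boolean_t recvttl)
    {
            /* base: the indication itself plus the source address */
            size_t sz = sizeof (struct T_unitdata_ind) +
                sizeof (struct sockaddr_in);

            if (recvdstaddr)        /* IP_RECVDSTADDR */
                    sz += sizeof (struct T_opthdr) + sizeof (struct in_addr);
            if (recvttl)            /* IP_RECVTTL */
                    sz += sizeof (struct T_opthdr) + sizeof (uint8_t);
            return (sz);
    }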
+ */ + ASSERT(udp->udp_issocket); + udp_rcv_enqueue(UDP_RD(q), udp, mp, mp_len); + } else { + /* + * Use regular STREAMS interface to pass data upstream + * if this is not a socket endpoint, or if we have + * switched over to the slow mode due to sockmod being + * popped or a module being pushed on top of us. + */ + putnext(UDP_RD(q), mp); + } + return; + +tossit: + freemsg(mp); + if (options_mp != NULL) + freeb(options_mp); + BUMP_MIB(&udp_mib, udpInErrors); +} + +void +udp_conn_recv(conn_t *connp, mblk_t *mp) +{ + _UDP_ENTER(connp, mp, udp_input_wrapper, SQTAG_UDP_FANOUT); +} + +/* ARGSUSED */ +static void +udp_input_wrapper(void *arg, mblk_t *mp, void *arg2) +{ + udp_input((conn_t *)arg, mp); + _UDP_EXIT((conn_t *)arg); } /* @@ -4152,18 +4830,17 @@ udp_rput_other(queue_t *q, mblk_t *mp) int opt_len; /* Length of IP options */ sin_t *sin; struct T_error_ack *tea; - udp_t *udp; mblk_t *options_mp = NULL; in_pktinfo_t *pinfo; boolean_t recv_on = B_FALSE; cred_t *cr = NULL; + udp_t *udp = Q_TO_UDP(q); pid_t cpid; TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_START, "udp_rput_other: q %p mp %p", q, mp); ASSERT(OK_32PTR(mp->b_rptr)); - udp = (udp_t *)q->q_ptr; rptr = mp->b_rptr; switch (mp->b_datap->db_type) { @@ -4258,7 +4935,7 @@ udp_rput_other(queue_t *q, mblk_t *mp) freemsg(mp); return; } - putnext(q, mp); + putnext(UDP_RD(q), mp); return; } @@ -4323,9 +5000,12 @@ udp_rput_other(queue_t *q, mblk_t *mp) udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t); if (udp->udp_recvdstaddr) { udi_size += sizeof (struct T_opthdr) + sizeof (struct in_addr); + UDP_STAT(udp_in_recvdstaddr); } - if (udp->udp_recvopts && opt_len > 0) + if (udp->udp_recvopts && opt_len > 0) { udi_size += sizeof (struct T_opthdr) + opt_len; + UDP_STAT(udp_in_recvopts); + } /* * If the IP_RECVSLLA or the IP_RECVIF is set then allocate @@ -4333,25 +5013,28 @@ udp_rput_other(queue_t *q, mblk_t *mp) */ if (udp->udp_recvif && recv_on && (pinfo->in_pkt_flags & IPF_RECVIF)) { - udi_size += sizeof (struct T_opthdr) + - sizeof (uint_t); + udi_size += sizeof (struct T_opthdr) + sizeof (uint_t); + UDP_STAT(udp_in_recvif); } if (udp->udp_recvslla && recv_on && (pinfo->in_pkt_flags & IPF_RECVSLLA)) { udi_size += sizeof (struct T_opthdr) + sizeof (struct sockaddr_dl); + UDP_STAT(udp_in_recvslla); } if (udp->udp_recvucred && (cr = DB_CRED(mp)) != NULL) { udi_size += sizeof (struct T_opthdr) + ucredsize; cpid = DB_CPID(mp); + UDP_STAT(udp_in_recvucred); } /* * If IP_RECVTTL is set allocate the appropriate sized buffer */ if (udp->udp_recvttl) { udi_size += sizeof (struct T_opthdr) + sizeof (uint8_t); + UDP_STAT(udp_in_recvttl); } /* Allocate a message block for the T_UNITDATA_IND structure. */ @@ -4502,7 +5185,34 @@ udp_rput_other(queue_t *q, mblk_t *mp) "udp_rput_other_end: q %p (%S)", q, "end"); if (options_mp != NULL) freeb(options_mp); - putnext(q, mp); + + if (udp->udp_direct_sockfs) { + /* + * There is nothing above us except for the stream head; + * use the read-side synchronous stream interface in + * order to reduce the time spent in interrupt thread. + */ + ASSERT(udp->udp_issocket); + udp_rcv_enqueue(UDP_RD(q), udp, mp, msgdsize(mp)); + } else { + /* + * Use regular STREAMS interface to pass data upstream + * if this is not a socket endpoint, or if we have + * switched over to the slow mode due to sockmod being + * popped or a module being pushed on top of us. 
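udp_conn_recv() and the *_wrapper functions above all follow one serialization idiom: outside callers never invoke a handler directly, they go through _UDP_ENTER() with a callback, and the callback pairs the real work with _UDP_EXIT() so the per-conn synchronization is released on every path. Skeletonized, with placeholder handler names (a sketch of the idiom, not the actual macro semantics):

    /* ARGSUSED */
    static void
    my_handler_wrapper(void *arg, mblk_t *mp, void *arg2)
    {
            my_handler((conn_t *)arg, mp);  /* the real work */
            _UDP_EXIT((conn_t *)arg);       /* always paired with ENTER */
    }

    void
    my_entry_point(conn_t *connp, mblk_t *mp)
    {
            /* runs the wrapper now, or queues mp if a writer is active */
            _UDP_ENTER(connp, mp, my_handler_wrapper, SQTAG_UDP_FANOUT);
    }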
+ */ + putnext(UDP_RD(q), mp); + } +} + +/* ARGSUSED */ +static void +udp_rput_other_wrapper(void *arg, mblk_t *mp, void *arg2) +{ + conn_t *connp = arg; + + udp_rput_other(connp->conn_rq, mp); + udp_exit(connp); } /* @@ -4511,7 +5221,7 @@ udp_rput_other(queue_t *q, mblk_t *mp) static void udp_rput_bind_ack(queue_t *q, mblk_t *mp) { - udp_t *udp = (udp_t *)q->q_ptr; + udp_t *udp = Q_TO_UDP(q); mblk_t *mp1; ire_t *ire; struct T_bind_ack *tba; @@ -4602,20 +5312,20 @@ udp_rput_bind_ack(queue_t *q, mblk_t *mp) while (mp != NULL) { mp1 = mp->b_cont; mp->b_cont = NULL; - putnext(q, mp); + putnext(UDP_RD(q), mp); mp = mp1; } return; } freemsg(mp->b_cont); mp->b_cont = NULL; - putnext(q, mp); + putnext(UDP_RD(q), mp); } /* * return SNMP stuff in buffer in mpdata */ -static int +int udp_snmp_get(queue_t *q, mblk_t *mpctl) { mblk_t *mpdata; @@ -4626,12 +5336,14 @@ udp_snmp_get(queue_t *q, mblk_t *mpctl) mblk_t *mp_conn_tail = NULL; mblk_t *mp6_conn_tail = NULL; struct opthdr *optp; - IDP idp; - udp_t *udp; mib2_udpEntry_t ude; mib2_udp6Entry_t ude6; int state; zoneid_t zoneid; + int i; + connf_t *connfp; + conn_t *connp = Q_TO_CONN(q); + udp_t *udp = connp->conn_udp; if (mpctl == NULL || (mpdata = mpctl->b_cont) == NULL || @@ -4644,8 +5356,7 @@ udp_snmp_get(queue_t *q, mblk_t *mpctl) mp_conn_data = mp_conn_ctl->b_cont; mp6_conn_data = mp6_conn_ctl->b_cont; - udp = (udp_t *)q->q_ptr; - zoneid = udp->udp_zoneid; + zoneid = connp->conn_zoneid; /* fixed length structure for IPv4 and IPv6 counters */ SET_MIB(udp_mib.udpEntrySize, sizeof (mib2_udpEntry_t)); @@ -4657,76 +5368,88 @@ udp_snmp_get(queue_t *q, mblk_t *mpctl) optp->len = msgdsize(mpdata); qreply(q, mpctl); - mutex_enter(&udp_g_lock); - for (idp = mi_first_ptr(&udp_g_head); - (udp = (udp_t *)idp) != 0; - idp = mi_next_ptr(&udp_g_head, idp)) { + for (i = 0; i < CONN_G_HASH_SIZE; i++) { + connfp = &ipcl_globalhash_fanout[i]; + connp = NULL; - if (zoneid != udp->udp_zoneid) - continue; + while ((connp = ipcl_get_next_conn(connfp, connp, + IPCL_UDP))) { + udp = connp->conn_udp; + if (zoneid != connp->conn_zoneid) + continue; - /* Note that the port numbers are sent in host byte order */ + /* + * Note that the port numbers are sent in + * host byte order + */ - if (udp->udp_state == TS_UNBND) - state = MIB2_UDP_unbound; - else if (udp->udp_state == TS_IDLE) - state = MIB2_UDP_idle; - else if (udp->udp_state == TS_DATA_XFER) - state = MIB2_UDP_connected; - else - state = MIB2_UDP_unknown; + if (udp->udp_state == TS_UNBND) + state = MIB2_UDP_unbound; + else if (udp->udp_state == TS_IDLE) + state = MIB2_UDP_idle; + else if (udp->udp_state == TS_DATA_XFER) + state = MIB2_UDP_connected; + else + state = MIB2_UDP_unknown; - /* - * Create an IPv4 table entry for IPv4 entries and also - * any IPv6 entries which are bound to in6addr_any - * (i.e. anything a IPv4 peer could connect/send to). - */ - if (udp->udp_ipversion == IPV4_VERSION || - (udp->udp_state <= TS_IDLE && - IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src))) { - ude.udpEntryInfo.ue_state = state; - /* If in6addr_any this will set it to INADDR_ANY */ - ude.udpLocalAddress = V4_PART_OF_V6(udp->udp_v6src); - ude.udpLocalPort = ntohs(udp->udp_port); - if (udp->udp_state == TS_DATA_XFER) { + /* + * Create an IPv4 table entry for IPv4 entries and also + * any IPv6 entries which are bound to in6addr_any + * (i.e. anything a IPv4 peer could connect/send to). 
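With the private MI list (udp_g_head) and its global lock gone, every enumeration of UDP endpoints above now walks IP's global conn hash instead. The iteration pattern in isolation: per-bucket cursor, classifier flag, zone filter. No explicit refrele appears in the loop because ipcl_get_next_conn() hands back each matching conn held and drops the previous hold; process_udp() is a placeholder:

    int     i;
    connf_t *connfp;
    conn_t  *connp;

    for (i = 0; i < CONN_G_HASH_SIZE; i++) {
            connfp = &ipcl_globalhash_fanout[i];
            connp = NULL;

            while ((connp = ipcl_get_next_conn(connfp, connp, IPCL_UDP))) {
                    if (zoneid != connp->conn_zoneid)
                            continue;
                    process_udp(connp->conn_udp);   /* placeholder */
            }
    }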
+ */ + if (udp->udp_ipversion == IPV4_VERSION || + (udp->udp_state <= TS_IDLE && + IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src))) { + ude.udpEntryInfo.ue_state = state; /* - * Can potentially get here for v6 socket - * if another process (say, ping) has just - * done a sendto(), changing the state - * from the TS_IDLE above to TS_DATA_XFER - * by the time we hit this part of the code. + * If in6addr_any this will set it to + * INADDR_ANY */ - ude.udpEntryInfo.ue_RemoteAddress = - V4_PART_OF_V6(udp->udp_v6dst); - ude.udpEntryInfo.ue_RemotePort = - ntohs(udp->udp_dstport); - } else { - ude.udpEntryInfo.ue_RemoteAddress = 0; - ude.udpEntryInfo.ue_RemotePort = 0; + ude.udpLocalAddress = + V4_PART_OF_V6(udp->udp_v6src); + ude.udpLocalPort = ntohs(udp->udp_port); + if (udp->udp_state == TS_DATA_XFER) { + /* + * Can potentially get here for + * v6 socket if another process + * (say, ping) has just done a + * sendto(), changing the state + * from the TS_IDLE above to + * TS_DATA_XFER by the time we hit + * this part of the code. + */ + ude.udpEntryInfo.ue_RemoteAddress = + V4_PART_OF_V6(udp->udp_v6dst); + ude.udpEntryInfo.ue_RemotePort = + ntohs(udp->udp_dstport); + } else { + ude.udpEntryInfo.ue_RemoteAddress = 0; + ude.udpEntryInfo.ue_RemotePort = 0; + } + (void) snmp_append_data2(mp_conn_data, + &mp_conn_tail, (char *)&ude, sizeof (ude)); } - (void) snmp_append_data2(mp_conn_data, &mp_conn_tail, - (char *)&ude, sizeof (ude)); - } - if (udp->udp_ipversion == IPV6_VERSION) { - ude6.udp6EntryInfo.ue_state = state; - ude6.udp6LocalAddress = udp->udp_v6src; - ude6.udp6LocalPort = ntohs(udp->udp_port); - ude6.udp6IfIndex = udp->udp_bound_if; - if (udp->udp_state == TS_DATA_XFER) { - ude6.udp6EntryInfo.ue_RemoteAddress = - udp->udp_v6dst; - ude6.udp6EntryInfo.ue_RemotePort = - ntohs(udp->udp_dstport); - } else { - ude6.udp6EntryInfo.ue_RemoteAddress = - sin6_null.sin6_addr; - ude6.udp6EntryInfo.ue_RemotePort = 0; + if (udp->udp_ipversion == IPV6_VERSION) { + ude6.udp6EntryInfo.ue_state = state; + ude6.udp6LocalAddress = udp->udp_v6src; + ude6.udp6LocalPort = ntohs(udp->udp_port); + ude6.udp6IfIndex = udp->udp_bound_if; + if (udp->udp_state == TS_DATA_XFER) { + ude6.udp6EntryInfo.ue_RemoteAddress = + udp->udp_v6dst; + ude6.udp6EntryInfo.ue_RemotePort = + ntohs(udp->udp_dstport); + } else { + ude6.udp6EntryInfo.ue_RemoteAddress = + sin6_null.sin6_addr; + ude6.udp6EntryInfo.ue_RemotePort = 0; + } + (void) snmp_append_data2(mp6_conn_data, + &mp6_conn_tail, (char *)&ude6, + sizeof (ude6)); } - (void) snmp_append_data2(mp6_conn_data, &mp6_conn_tail, - (char *)&ude6, sizeof (ude6)); } } - mutex_exit(&udp_g_lock); /* IPv4 UDP endpoints */ optp = (struct opthdr *)&mp_conn_ctl->b_rptr[ @@ -4754,7 +5477,7 @@ udp_snmp_get(queue_t *q, mblk_t *mpctl) * to do the appropriate locking. 
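A subtlety in the table construction above: an endpoint appears in the IPv4 MIB table not only when it is a v4 endpoint, but also when it is an IPv6 endpoint still bound to the unspecified address, since an IPv4 peer could connect or send to it. The visibility test as a predicate, using the same expression as the code above:

    static boolean_t
    udp_visible_in_v4_table(udp_t *udp)
    {
            return (udp->udp_ipversion == IPV4_VERSION ||
                (udp->udp_state <= TS_IDLE &&
                IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)));
    }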
 */
 /* ARGSUSED */
-static int
+int
 udp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr,
     int len)
 {
@@ -4789,7 +5512,7 @@ udp_report_item(mblk_t *mp, udp_t *udp)
 		state = "UnkState";
 	print_len = snprintf((char *)mp->b_wptr, buf_len,
 	    MI_COL_PTRFMT_STR "%4d %5u %s %s %5u %s\n",
-	    (void *)udp, udp->udp_zoneid, ntohs(udp->udp_port),
+	    (void *)udp, udp->udp_connp->conn_zoneid, ntohs(udp->udp_port),
 	    inet_ntop(AF_INET6, &udp->udp_v6src, addrbuf1, sizeof (addrbuf1)),
 	    inet_ntop(AF_INET6, &udp->udp_v6dst,
@@ -4807,9 +5530,11 @@ udp_report_item(mblk_t *mp, udp_t *udp)
 static int
 udp_status_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
 {
-	IDP	idp;
-	udp_t	*udp;
 	zoneid_t zoneid;
+	connf_t	*connfp;
+	conn_t	*connp = Q_TO_CONN(q);
+	udp_t	*udp = connp->conn_udp;
+	int	i;

 	/*
 	 * Because of the ndd constraint, at most we can have 64K buffer
@@ -4837,21 +5562,22 @@ udp_status_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
 	    " zone lport src addr dest addr port state");
 	/* 1234 12345 xxx.xxx.xxx.xxx xxx.xxx.xxx.xxx 12345 UNBOUND */

-	udp = (udp_t *)q->q_ptr;
-	zoneid = udp->udp_zoneid;
+	zoneid = connp->conn_zoneid;

-	mutex_enter(&udp_g_lock);
-	for (idp = mi_first_ptr(&udp_g_head);
-	    (udp = (udp_t *)idp) != 0;
-	    idp = mi_next_ptr(&udp_g_head, idp)) {
+	for (i = 0; i < CONN_G_HASH_SIZE; i++) {
+		connfp = &ipcl_globalhash_fanout[i];
+		connp = NULL;

-		if (zoneid != GLOBAL_ZONEID &&
-		    zoneid != udp->udp_zoneid)
-			continue;
+		while ((connp = ipcl_get_next_conn(connfp, connp,
+		    IPCL_UDP))) {
+			udp = connp->conn_udp;
+			if (zoneid != GLOBAL_ZONEID &&
+			    zoneid != connp->conn_zoneid)
+				continue;

-		udp_report_item(mp->b_cont, udp);
+			udp_report_item(mp->b_cont, udp);
+		}
 	}
-	mutex_exit(&udp_g_lock);
 	udp_last_ndd_get_info_time = ddi_get_lbolt();
 	return (0);
 }
@@ -4862,32 +5588,44 @@ udp_status_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
 * passed in mp. This message is freed.
*/ static void -udp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err) +udp_ud_err(queue_t *q, mblk_t *mp, uchar_t *destaddr, t_scalar_t destlen, + t_scalar_t err) { + struct T_unitdata_req *tudr; mblk_t *mp1; - struct T_unitdata_req *tudr = (struct T_unitdata_req *)mp->b_rptr; - uchar_t *destaddr, *optaddr; + uchar_t *optaddr; + t_scalar_t optlen; - if ((mp->b_wptr < mp->b_rptr) || - (mp->b_wptr - mp->b_rptr) < sizeof (struct T_unitdata_req)) { - goto done; - } - destaddr = mp->b_rptr + tudr->DEST_offset; - if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr || - destaddr + tudr->DEST_length < mp->b_rptr || - destaddr + tudr->DEST_length > mp->b_wptr) { - goto done; - } - optaddr = mp->b_rptr + tudr->OPT_offset; - if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr || - optaddr + tudr->OPT_length < mp->b_rptr || - optaddr + tudr->OPT_length > mp->b_wptr) { - goto done; + if (DB_TYPE(mp) == M_DATA) { + ASSERT(destaddr != NULL && destlen != 0); + optaddr = NULL; + optlen = 0; + } else { + if ((mp->b_wptr < mp->b_rptr) || + (MBLKL(mp)) < sizeof (struct T_unitdata_req)) { + goto done; + } + tudr = (struct T_unitdata_req *)mp->b_rptr; + destaddr = mp->b_rptr + tudr->DEST_offset; + if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr || + destaddr + tudr->DEST_length < mp->b_rptr || + destaddr + tudr->DEST_length > mp->b_wptr) { + goto done; + } + optaddr = mp->b_rptr + tudr->OPT_offset; + if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr || + optaddr + tudr->OPT_length < mp->b_rptr || + optaddr + tudr->OPT_length > mp->b_wptr) { + goto done; + } + destlen = tudr->DEST_length; + optlen = tudr->OPT_length; } - mp1 = mi_tpi_uderror_ind((char *)destaddr, tudr->DEST_length, - (char *)optaddr, tudr->OPT_length, err); - if (mp1) - qreply(q, mp1); + + mp1 = mi_tpi_uderror_ind((char *)destaddr, destlen, + (char *)optaddr, optlen, err); + if (mp1 != NULL) + putnext(UDP_RD(q), mp1); done: freemsg(mp); @@ -4900,9 +5638,8 @@ done: static void udp_unbind(queue_t *q, mblk_t *mp) { - udp_t *udp; + udp_t *udp = Q_TO_UDP(q); - udp = (udp_t *)q->q_ptr; /* If a bind has not been done, we can't unbind. */ if (udp->udp_state == TS_UNBND) { udp_err_ack(q, mp, TOUTSTATE, 0); @@ -4939,8 +5676,13 @@ udp_unbind(queue_t *q, mblk_t *mp) return; } } - /* Pass the unbind to IP */ - putnext(q, mp); + /* + * Pass the unbind to IP; T_UNBIND_REQ is larger than T_OK_ACK + * and therefore ip_unbind must never return NULL. + */ + mp = ip_unbind(q, mp); + ASSERT(mp != NULL); + putnext(UDP_RD(q), mp); } /* @@ -4994,193 +5736,47 @@ retry: return (port); } -/* - * This routine handles all messages passed downstream. It either - * consumes the message or passes it downstream; it never queues a - * a message. 
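udp_ud_err() above trusts nothing about the T_unitdata_req it is handed: each (offset, length) pair is validated against the mblk, including the case where offset plus length wraps around below b_rptr. The checks condense to one predicate; the helper name is hypothetical and the body is the logical negation of the failure tests above:

    static boolean_t
    udp_field_in_mblk(mblk_t *mp, t_scalar_t off, t_scalar_t len)
    {
            uchar_t *p = mp->b_rptr + off;

            /* the field must lie entirely inside [b_rptr, b_wptr) */
            return (p >= mp->b_rptr && p < mp->b_wptr &&
                p + len >= mp->b_rptr && p + len <= mp->b_wptr);
    }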
- */ -static void -udp_wput(queue_t *q, mblk_t *mp) +static mblk_t * +udp_output_v4(conn_t *connp, mblk_t *mp, ipaddr_t v4dst, uint16_t port, + uint_t srcid, int *error) { - uchar_t *rptr = mp->b_rptr; - struct datab *db; - ipha_t *ipha; - udpha_t *udpha; - mblk_t *mp1; - int ip_hdr_length; -#define tudr ((struct T_unitdata_req *)rptr) - uint32_t ip_len; - udp_t *udp; - sin6_t *sin6; - sin_t *sin; - ipaddr_t v4dst; - uint16_t port; - uint_t srcid; - - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_START, - "udp_wput_start: q %p mp %p", q, mp); - - db = mp->b_datap; - switch (db->db_type) { - case M_PROTO: - case M_PCPROTO: - ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX); - if (mp->b_wptr - rptr >= sizeof (struct T_unitdata_req)) { - /* Detect valid T_UNITDATA_REQ here */ - if (((union T_primitives *)rptr)->type - == T_UNITDATA_REQ) - break; - } - /* FALLTHRU */ - default: - qwriter(q, mp, udp_wput_other, PERIM_INNER); - return; - } - - udp = (udp_t *)q->q_ptr; - - /* Handle UNITDATA_REQ messages here */ - if (udp->udp_state == TS_UNBND) { - /* If a port has not been bound to the stream, fail. */ - BUMP_MIB(&udp_mib, udpOutErrors); - udp_ud_err(q, mp, EPROTO); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, - "udp_wput_end: q %p (%S)", q, "outstate"); - return; - } - mp1 = mp->b_cont; - if (mp1 == NULL) { - BUMP_MIB(&udp_mib, udpOutErrors); - udp_ud_err(q, mp, EPROTO); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, - "udp_wput_end: q %p (%S)", q, "badaddr"); - return; - } - - if ((rptr + tudr->DEST_offset + tudr->DEST_length) > mp->b_wptr) { - BUMP_MIB(&udp_mib, udpOutErrors); - udp_ud_err(q, mp, EADDRNOTAVAIL); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, - "udp_wput_end: q %p (%S)", q, "badaddr"); - return; - } - - switch (udp->udp_family) { - case AF_INET6: - sin6 = (sin6_t *)&rptr[tudr->DEST_offset]; - if (!OK_32PTR((char *)sin6) || - tudr->DEST_length != sizeof (sin6_t) || - sin6->sin6_family != AF_INET6) { - BUMP_MIB(&udp_mib, udpOutErrors); - udp_ud_err(q, mp, EADDRNOTAVAIL); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, - "udp_wput_end: q %p (%S)", q, "badaddr"); - return; - } - - if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { - /* - * Destination is a non-IPv4-compatible IPv6 address. - * Send out an IPv6 format packet. - */ - udp_wput_ipv6(q, mp, sin6, tudr->OPT_length); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, - "udp_wput_end: q %p (%S)", q, "udp_wput_ipv6"); - return; - } - /* - * If the local address is not zero or a mapped address return - * an error. - * I would be possible to send an IPv4 packet but the - * response would never make it back to the application - * since it is bound to a non-mapped address. 
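udp_output_v4() above opens with the classic STREAMS header-prepend: if the payload mblk is unshared and has enough aligned headroom, the IP plus UDP header is laid down in place ahead of b_rptr; otherwise a header mblk is allocated with udp_wroff_extra of slack and chained in front. The decision in isolation, as a sketch in which hdr_len stands for the computed header length and error handling is simplified:

    static int
    prepend_hdr_sketch(mblk_t **mpp, size_t hdr_len)
    {
            mblk_t *mp1 = *mpp;
            ipha_t *ipha = (ipha_t *)&mp1->b_rptr[-hdr_len];

            if (DB_REF(mp1) != 1 ||                /* shared: don't scribble */
                (uchar_t *)ipha < DB_BASE(mp1) ||  /* not enough headroom */
                !OK_32PTR(ipha)) {                 /* must be 32-bit aligned */
                    mblk_t *hdr = allocb(hdr_len + udp_wroff_extra, BPRI_LO);

                    if (hdr == NULL)
                            return (ENOMEM);
                    hdr->b_wptr = DB_LIM(hdr);
                    hdr->b_cont = mp1;
                    mp1 = hdr;
                    ipha = (ipha_t *)(mp1->b_wptr - hdr_len);
            }
            mp1->b_rptr = (uchar_t *)ipha;
            *mpp = mp1;
            return (0);
    }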
- */ - if (!IN6_IS_ADDR_V4MAPPED(&udp->udp_v6src) && - !IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)) { - BUMP_MIB(&udp_mib, udpOutErrors); - udp_ud_err(q, mp, EADDRNOTAVAIL); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, - "udp_wput_end: q %p (%S)", q, "badaddr"); - return; - } - /* Send IPv4 packet without modifying udp_ipversion */ - /* Extract port and ipaddr */ - port = sin6->sin6_port; - IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, v4dst); - srcid = sin6->__sin6_src_id; - break; - - case AF_INET: - sin = (sin_t *)&rptr[tudr->DEST_offset]; - if (!OK_32PTR((char *)sin) || - tudr->DEST_length != sizeof (sin_t) || - sin->sin_family != AF_INET) { - BUMP_MIB(&udp_mib, udpOutErrors); - udp_ud_err(q, mp, EADDRNOTAVAIL); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, - "udp_wput_end: q %p (%S)", q, "badaddr"); - return; - } - /* Extract port and ipaddr */ - port = sin->sin_port; - v4dst = sin->sin_addr.s_addr; - srcid = 0; - break; - } + udp_t *udp = connp->conn_udp; + queue_t *q = connp->conn_wq; + mblk_t *mp1 = (DB_TYPE(mp) == M_DATA ? mp : mp->b_cont); + mblk_t *mp2; + ipha_t *ipha; + int ip_hdr_length; + uint32_t ip_len; + udpha_t *udpha; + *error = 0; - /* - * If options passed in, feed it for verification and handling - */ - if (tudr->OPT_length != 0) { - int error; - - if (udp_unitdata_opt_process(q, mp, &error, NULL) < 0) { - /* failure */ - BUMP_MIB(&udp_mib, udpOutErrors); - udp_ud_err(q, mp, error); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, - "udp_wput_end: q %p (%S)", q, - "udp_unitdata_opt_process"); - return; - } - ASSERT(error == 0); - /* - * Note: success in processing options. - * mp option buffer represented by - * OPT_length/offset now potentially modified - * and contain option setting results - */ - } + /* mp1 points to the M_DATA mblk carrying the packet */ + ASSERT(mp1 != NULL && DB_TYPE(mp1) == M_DATA); /* Add an IP header */ ip_hdr_length = IP_SIMPLE_HDR_LENGTH + UDPH_SIZE + udp->udp_ip_snd_options_len; ipha = (ipha_t *)&mp1->b_rptr[-ip_hdr_length]; - if ((mp1->b_datap->db_ref != 1) || - ((uchar_t *)ipha < mp1->b_datap->db_base) || + if (DB_REF(mp1) != 1 || (uchar_t *)ipha < DB_BASE(mp1) || !OK_32PTR(ipha)) { - uchar_t *wptr; - - mp1 = allocb(ip_hdr_length + udp_wroff_extra, BPRI_LO); - if (!mp1) { - BUMP_MIB(&udp_mib, udpOutErrors); - udp_ud_err(q, mp, ENOMEM); + mp2 = allocb(ip_hdr_length + udp_wroff_extra, BPRI_LO); + if (mp2 == NULL) { TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, - "udp_wput_end: q %p (%S)", q, "allocbfail2"); - return; - } - mp1->b_cont = mp->b_cont; - mp->b_cont = mp1; - wptr = mp1->b_datap->db_lim; - mp1->b_wptr = wptr; - ipha = (ipha_t *)(wptr - ip_hdr_length); - } - mp1->b_rptr = (uchar_t *)ipha; - - ASSERT((uintptr_t)(mp1->b_wptr - (uchar_t *)ipha) <= - (uintptr_t)UINT_MAX); + "udp_wput_end: q %p (%S)", q, "allocbfail2"); + *error = ENOMEM; + goto done; + } + mp2->b_wptr = DB_LIM(mp2); + mp2->b_cont = mp1; + mp1 = mp2; + if (DB_TYPE(mp) != M_DATA) + mp->b_cont = mp1; + else + mp = mp1; + ipha = (ipha_t *)(mp1->b_wptr - ip_hdr_length); + } ip_hdr_length -= UDPH_SIZE; #ifdef _BIG_ENDIAN /* Set version, header length, and tos */ @@ -5206,24 +5802,25 @@ udp_wput(queue_t *q, mblk_t *mp) if (srcid != 0 && ipha->ipha_src == INADDR_ANY) { in6_addr_t v6src; - ip_srcid_find_id(srcid, &v6src, udp->udp_zoneid); + ip_srcid_find_id(srcid, &v6src, connp->conn_zoneid); IN6_V4MAPPED_TO_IPADDR(&v6src, ipha->ipha_src); } ipha->ipha_fragment_offset_and_flags = 0; ipha->ipha_ident = 0; + mp1->b_rptr = (uchar_t *)ipha; + + ASSERT((uintptr_t)(mp1->b_wptr - (uchar_t *)ipha) <= + 
(uintptr_t)UINT_MAX); + /* Determine length of packet */ ip_len = (uint32_t)(mp1->b_wptr - (uchar_t *)ipha); - { - mblk_t *mp2; - if ((mp2 = mp1->b_cont) != NULL) { - do { - ASSERT((uintptr_t)(mp2->b_wptr - mp2->b_rptr) - <= (uintptr_t)UINT_MAX); - ip_len += (uint32_t)(mp2->b_wptr - mp2->b_rptr); - } while ((mp2 = mp2->b_cont) != NULL); - } + if ((mp2 = mp1->b_cont) != NULL) { + do { + ASSERT((uintptr_t)MBLKL(mp2) <= (uintptr_t)UINT_MAX); + ip_len += (uint32_t)MBLKL(mp2); + } while ((mp2 = mp2->b_cont) != NULL); } /* * If the size of the packet is greater than the maximum allowed by @@ -5231,19 +5828,18 @@ udp_wput(queue_t *q, mblk_t *mp) * the size will have wrapped and be inconsistent with the msg size. */ if (ip_len > IP_MAXPACKET) { - BUMP_MIB(&udp_mib, udpOutErrors); - udp_ud_err(q, mp, EMSGSIZE); TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, "udp_wput_end: q %p (%S)", q, "IP length exceeded"); - return; + *error = EMSGSIZE; + goto done; } ipha->ipha_length = htons((uint16_t)ip_len); ip_len -= ip_hdr_length; ip_len = htons((uint16_t)ip_len); udpha = (udpha_t *)(((uchar_t *)ipha) + ip_hdr_length); + /* - * Copy in the destination address and port from the T_UNITDATA - * request + * Copy in the destination address */ if (v4dst == INADDR_ANY) ipha->ipha_dst = htonl(INADDR_LOOPBACK); @@ -5310,41 +5906,648 @@ udp_wput(queue_t *q, mblk_t *mp) /* Set UDP length and checksum */ *((uint32_t *)&udpha->uha_length) = ip_len; - freeb(mp); + if (DB_TYPE(mp) != M_DATA) { + ASSERT(mp != mp1); + freeb(mp); + } + + /* mp has been consumed and we'll return success */ + ASSERT(*error == 0); + mp = NULL; /* We're done. Pass the packet to ip. */ BUMP_MIB(&udp_mib, udpOutDatagrams); TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, "udp_wput_end: q %p (%S)", q, "end"); - putnext(q, mp1); -#undef tudr + + if ((connp->conn_flags & IPCL_CHECK_POLICY) != 0 || + CONN_OUTBOUND_POLICY_PRESENT(connp) || + connp->conn_dontroute || connp->conn_xmit_if_ill != NULL || + connp->conn_nofailover_ill != NULL || + connp->conn_outgoing_ill != NULL || + ipha->ipha_version_and_hdr_length != IP_SIMPLE_HDR_VERSION || + IPP_ENABLED(IPP_LOCAL_OUT) || ip_g_mrouter != NULL) { + UDP_STAT(udp_ip_send); + ip_output(connp, mp1, connp->conn_wq, IP_WPUT); + } else { + udp_send_data(udp, connp->conn_wq, mp1, ipha); + } + +done: + if (*error != 0) { + ASSERT(mp != NULL); + BUMP_MIB(&udp_mib, udpOutErrors); + } + return (mp); +} + +static void +udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp, ipha_t *ipha) +{ + conn_t *connp = udp->udp_connp; + ipaddr_t src, dst; + ill_t *ill; + ire_t *ire; + ipif_t *ipif = NULL; + mblk_t *ire_fp_mp; + uint_t ire_fp_mp_len; + uint16_t *up; + uint32_t cksum, hcksum_txflags; + queue_t *dev_q; + boolean_t retry_caching; + + dst = ipha->ipha_dst; + src = ipha->ipha_src; + ASSERT(ipha->ipha_ident == 0); + + if (CLASSD(dst)) { + int err; + + ipif = conn_get_held_ipif(connp, + &connp->conn_multicast_ipif, &err); + + if (ipif == NULL || ipif->ipif_isv6 || + (ipif->ipif_ill->ill_phyint->phyint_flags & + PHYI_LOOPBACK)) { + if (ipif != NULL) + ipif_refrele(ipif); + UDP_STAT(udp_ip_send); + ip_output(connp, mp, q, IP_WPUT); + return; + } + } + + retry_caching = B_FALSE; + mutex_enter(&connp->conn_lock); + ire = connp->conn_ire_cache; + ASSERT(!(connp->conn_state_flags & CONN_INCIPIENT)); + + if (ire == NULL || ire->ire_addr != dst || + (ire->ire_marks & IRE_MARK_CONDEMNED)) { + retry_caching = B_TRUE; + } else if (CLASSD(dst) && (ire->ire_type & IRE_CACHE)) { + ill_t *stq_ill = (ill_t *)ire->ire_stq->q_ptr; + + ASSERT(ipif != NULL); + if 
(stq_ill != ipif->ipif_ill && (stq_ill->ill_group == NULL || + stq_ill->ill_group != ipif->ipif_ill->ill_group)) + retry_caching = B_TRUE; + } + + if (!retry_caching) { + ASSERT(ire != NULL); + IRE_REFHOLD(ire); + mutex_exit(&connp->conn_lock); + } else { + boolean_t cached = B_FALSE; + + connp->conn_ire_cache = NULL; + mutex_exit(&connp->conn_lock); + + /* Release the old ire */ + if (ire != NULL) { + IRE_REFRELE_NOTR(ire); + ire = NULL; + } + + if (CLASSD(dst)) { + ASSERT(ipif != NULL); + ire = ire_ctable_lookup(dst, 0, 0, ipif, + connp->conn_zoneid, MATCH_IRE_ILL_GROUP); + } else { + ASSERT(ipif == NULL); + ire = ire_cache_lookup(dst, connp->conn_zoneid); + } + + if (ire == NULL) { + if (ipif != NULL) + ipif_refrele(ipif); + UDP_STAT(udp_ire_null); + ip_output(connp, mp, q, IP_WPUT); + return; + } + IRE_REFHOLD_NOTR(ire); + + mutex_enter(&connp->conn_lock); + if (!(connp->conn_state_flags & CONN_CLOSING) && + connp->conn_ire_cache == NULL) { + rw_enter(&ire->ire_bucket->irb_lock, RW_READER); + if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) { + connp->conn_ire_cache = ire; + cached = B_TRUE; + } + rw_exit(&ire->ire_bucket->irb_lock); + } + mutex_exit(&connp->conn_lock); + + /* + * We can continue to use the ire but since it was not + * cached, we should drop the extra reference. + */ + if (!cached) + IRE_REFRELE_NOTR(ire); + } + ASSERT(ire != NULL && ire->ire_ipversion == IPV4_VERSION); + ASSERT(!CLASSD(dst) || ipif != NULL); + + if ((ire->ire_type & (IRE_BROADCAST|IRE_LOCAL|IRE_LOOPBACK)) || + (ire->ire_flags & RTF_MULTIRT) || ire->ire_stq == NULL || + ire->ire_max_frag < ntohs(ipha->ipha_length) || + (ire_fp_mp = ire->ire_fp_mp) == NULL || + (ire_fp_mp_len = MBLKL(ire_fp_mp)) > MBLKHEAD(mp)) { + if (ipif != NULL) + ipif_refrele(ipif); + UDP_STAT(udp_ip_ire_send); + IRE_REFRELE(ire); + ip_output(connp, mp, q, IP_WPUT); + return; + } + + BUMP_MIB(&ip_mib, ipOutRequests); + + ill = ire_to_ill(ire); + ASSERT(ill != NULL); + + dev_q = ire->ire_stq->q_next; + ASSERT(dev_q != NULL); + /* + * If the service thread is already running, or if the driver + * queue is currently flow-controlled, queue this packet. + */ + if ((q->q_first != NULL || connp->conn_draining) || + ((dev_q->q_next || dev_q->q_first) && !canput(dev_q))) { + if (ip_output_queue) { + (void) putq(q, mp); + } else { + BUMP_MIB(&ip_mib, ipOutDiscards); + freemsg(mp); + } + if (ipif != NULL) + ipif_refrele(ipif); + IRE_REFRELE(ire); + return; + } + + ipha->ipha_ident = (uint16_t)atomic_add_32_nv(&ire->ire_ident, 1); +#ifndef _BIG_ENDIAN + ipha->ipha_ident = (ipha->ipha_ident << 8) | (ipha->ipha_ident >> 8); +#endif + + if (src == INADDR_ANY && !connp->conn_unspec_src) { + if (CLASSD(dst) && !(ire->ire_flags & RTF_SETSRC)) + src = ipha->ipha_src = ipif->ipif_src_addr; + else + src = ipha->ipha_src = ire->ire_src_addr; + } + + if (ILL_HCKSUM_CAPABLE(ill) && dohwcksum) { + ASSERT(ill->ill_hcksum_capab != NULL); + hcksum_txflags = ill->ill_hcksum_capab->ill_hcksum_txflags; + } else { + hcksum_txflags = 0; + } + + /* pseudo-header checksum (do it in parts for IP header checksum) */ + cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); + + ASSERT(ipha->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION); + up = IPH_UDPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH); + if (*up != 0) { + IP_CKSUM_XMIT_FAST(ire->ire_ipversion, hcksum_txflags, + mp, ipha, up, IPPROTO_UDP, IP_SIMPLE_HDR_LENGTH, + ntohs(ipha->ipha_length), cksum); + + /* Software checksum? 
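The seeded cksum above is the IPv4 pseudo-header being folded into a ones'-complement partial sum, sixteen bits at a time; IP_CKSUM_XMIT_FAST then adds the protocol, UDP length, and payload, handing the remainder to capable hardware or finishing in software. The arithmetic on its own, in portable C:

    #include <stdint.h>

    /* partial pseudo-header sum over the two 32-bit addresses */
    static uint32_t
    pseudo_sum(uint32_t src, uint32_t dst)
    {
            return ((dst >> 16) + (dst & 0xFFFF) +
                (src >> 16) + (src & 0xFFFF));
    }

    /* fold carries and complement, as the checksum macros ultimately do */
    static uint16_t
    cksum_finish(uint32_t sum)
    {
            while (sum >> 16)
                    sum = (sum & 0xFFFF) + (sum >> 16);
            return ((uint16_t)~sum);
    }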
*/ + if (DB_CKSUMFLAGS(mp) == 0) { + UDP_STAT(udp_out_sw_cksum); + UDP_STAT_UPDATE(udp_out_sw_cksum_bytes, + ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH); + } + } + + ipha->ipha_fragment_offset_and_flags |= + (uint32_t)htons(ire->ire_frag_flag); + + /* Calculate IP header checksum if hardware isn't capable */ + if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) { + IP_HDR_CKSUM(ipha, cksum, ((uint32_t *)ipha)[0], + ((uint16_t *)ipha)[4]); + } + + if (CLASSD(dst)) { + ilm_t *ilm; + + ILM_WALKER_HOLD(ill); + ilm = ilm_lookup_ill(ill, dst, ALL_ZONES); + ILM_WALKER_RELE(ill); + if (ilm != NULL) { + ip_multicast_loopback(q, ill, mp, + connp->conn_multicast_loop ? 0 : + IP_FF_NO_MCAST_LOOP, connp->conn_zoneid); + } + + /* If multicast TTL is 0 then we are done */ + if (ipha->ipha_ttl == 0) { + if (ipif != NULL) + ipif_refrele(ipif); + freemsg(mp); + IRE_REFRELE(ire); + return; + } + } + + ASSERT(DB_TYPE(ire_fp_mp) == M_DATA); + mp->b_rptr = (uchar_t *)ipha - ire_fp_mp_len; + bcopy(ire_fp_mp->b_rptr, mp->b_rptr, ire_fp_mp_len); + + UPDATE_OB_PKT_COUNT(ire); + ire->ire_last_used_time = lbolt; + + if (ILL_POLL_CAPABLE(ill)) { + /* + * Send the packet directly to DLD, where it may be queued + * depending on the availability of transmit resources at + * the media layer. + */ + IP_POLL_ILL_TX(ill, mp); + } else { + putnext(ire->ire_stq, mp); + } + + if (ipif != NULL) + ipif_refrele(ipif); + IRE_REFRELE(ire); } /* - * udp_wput_ipv6(): - * Assumes that udp_wput did some sanity checking on the destination - * address. + * This routine handles all messages passed downstream. It either + * consumes the message or passes it downstream; it never queues a + * a message. */ static void -udp_wput_ipv6(queue_t *q, mblk_t *mp, sin6_t *sin6, t_scalar_t tudr_optlen) +udp_output(conn_t *connp, mblk_t *mp, struct sockaddr *addr, socklen_t addrlen) { - ip6_t *ip6h; - ip6i_t *ip6i; /* mp1->b_rptr even if no ip6i_t */ - mblk_t *mp1; - int udp_ip_hdr_len = IPV6_HDR_LEN + UDPH_SIZE; - size_t ip_len; - udpha_t *udph; - udp_t *udp; - ip6_pkt_t ipp_s; /* For ancillary data options */ - ip6_pkt_t *ipp = &ipp_s; - ip6_pkt_t *tipp; /* temporary ipp */ - uint32_t csum = 0; - uint_t ignore = 0; - uint_t option_exists = 0, is_sticky = 0; - uint8_t *cp; - uint8_t *nxthdr_ptr; + sin6_t *sin6; + sin_t *sin; + ipaddr_t v4dst; + uint16_t port; + uint_t srcid; + queue_t *q = connp->conn_wq; + udp_t *udp = connp->conn_udp; + t_scalar_t optlen; + int error = 0; + struct sockaddr_storage ss; - udp = (udp_t *)q->q_ptr; + TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_START, + "udp_wput_start: connp %p mp %p", connp, mp); + + /* + * We directly handle several cases here: T_UNITDATA_REQ message + * coming down as M_PROTO/M_PCPROTO and M_DATA messages for both + * connected and non-connected socket. The latter carries the + * address structure along when this routine gets called. 
+ */ + switch (DB_TYPE(mp)) { + case M_DATA: + if (!udp->udp_direct_sockfs || udp->udp_state != TS_DATA_XFER) { + if (!udp->udp_direct_sockfs || + addr == NULL || addrlen == 0) { + /* Not connected; address is required */ + BUMP_MIB(&udp_mib, udpOutErrors); + UDP_STAT(udp_out_err_notconn); + freemsg(mp); + TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, + "udp_wput_end: connp %p (%S)", connp, + "not-connected; address required"); + return; + } + ASSERT(udp->udp_issocket); + UDP_DBGSTAT(udp_data_notconn); + /* Not connected; do some more checks below */ + optlen = 0; + break; + } + /* M_DATA for connected socket */ + UDP_DBGSTAT(udp_data_conn); + IN6_V4MAPPED_TO_IPADDR(&udp->udp_v6dst, v4dst); + + /* Initialize addr and addrlen as if they're passed in */ + if (udp->udp_family == AF_INET) { + sin = (sin_t *)&ss; + sin->sin_family = AF_INET; + sin->sin_port = udp->udp_dstport; + sin->sin_addr.s_addr = v4dst; + addr = (struct sockaddr *)sin; + addrlen = sizeof (*sin); + } else { + sin6 = (sin6_t *)&ss; + sin6->sin6_family = AF_INET6; + sin6->sin6_port = udp->udp_dstport; + sin6->sin6_flowinfo = udp->udp_flowinfo; + sin6->sin6_addr = udp->udp_v6dst; + sin6->sin6_scope_id = 0; + sin6->__sin6_src_id = 0; + addr = (struct sockaddr *)sin6; + addrlen = sizeof (*sin6); + } + + if (udp->udp_family == AF_INET || + IN6_IS_ADDR_V4MAPPED(&udp->udp_v6dst)) { + /* + * Handle both AF_INET and AF_INET6; the latter + * for IPV4 mapped destination addresses. Note + * here that both addr and addrlen point to the + * corresponding struct depending on the address + * family of the socket. + */ + mp = udp_output_v4(connp, mp, v4dst, + udp->udp_dstport, 0, &error); + } else { + mp = udp_output_v6(connp, mp, sin6, 0, &error); + } + if (error != 0) { + ASSERT(addr != NULL && addrlen != 0); + goto ud_error; + } + return; + case M_PROTO: + case M_PCPROTO: { + struct T_unitdata_req *tudr; + + ASSERT((uintptr_t)MBLKL(mp) <= (uintptr_t)INT_MAX); + tudr = (struct T_unitdata_req *)mp->b_rptr; + + /* Handle valid T_UNITDATA_REQ here */ + if (MBLKL(mp) >= sizeof (*tudr) && + ((t_primp_t)mp->b_rptr)->type == T_UNITDATA_REQ) { + if (mp->b_cont == NULL) { + TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, + "udp_wput_end: q %p (%S)", q, "badaddr"); + error = EPROTO; + goto ud_error; + } + + if (!MBLKIN(mp, 0, tudr->DEST_offset + + tudr->DEST_length)) { + TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, + "udp_wput_end: q %p (%S)", q, "badaddr"); + error = EADDRNOTAVAIL; + goto ud_error; + } + /* + * If a port has not been bound to the stream, fail. + * This is not a problem when sockfs is directly + * above us, because it will ensure that the socket + * is first bound before allowing data to be sent. 
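For a connected socket in direct mode, M_DATA arrives with no address at all, so udp_output() above synthesizes one from the destination cached at connect time and rejoins the common send path. The AF_INET case, reduced to its essentials (fields mirror the code above):

    struct sockaddr_storage ss;
    sin_t *sin = (sin_t *)&ss;
    ipaddr_t v4dst;

    IN6_V4MAPPED_TO_IPADDR(&udp->udp_v6dst, v4dst);
    sin->sin_family = AF_INET;
    sin->sin_port = udp->udp_dstport;       /* cached by the connect */
    sin->sin_addr.s_addr = v4dst;
    addr = (struct sockaddr *)sin;
    addrlen = sizeof (*sin);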
+ */ + if (udp->udp_state == TS_UNBND) { + TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, + "udp_wput_end: q %p (%S)", q, "outstate"); + error = EPROTO; + goto ud_error; + } + addr = (struct sockaddr *) + &mp->b_rptr[tudr->DEST_offset]; + addrlen = tudr->DEST_length; + optlen = tudr->OPT_length; + if (optlen != 0) + UDP_STAT(udp_out_opt); + break; + } + /* FALLTHRU */ + } + default: + udp_become_writer(connp, mp, udp_wput_other_wrapper, + SQTAG_UDP_OUTPUT); + return; + } + ASSERT(addr != NULL); + + switch (udp->udp_family) { + case AF_INET6: + sin6 = (sin6_t *)addr; + if (!OK_32PTR((char *)sin6) || addrlen != sizeof (sin6_t) || + sin6->sin6_family != AF_INET6) { + TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, + "udp_wput_end: q %p (%S)", q, "badaddr"); + error = EADDRNOTAVAIL; + goto ud_error; + } + + if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { + /* + * Destination is a non-IPv4-compatible IPv6 address. + * Send out an IPv6 format packet. + */ + mp = udp_output_v6(connp, mp, sin6, optlen, &error); + if (error != 0) + goto ud_error; + + TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, + "udp_wput_end: q %p (%S)", q, "udp_output_v6"); + return; + } + /* + * If the local address is not zero or a mapped address + * return an error. It would be possible to send an IPv4 + * packet but the response would never make it back to the + * application since it is bound to a non-mapped address. + */ + if (!IN6_IS_ADDR_V4MAPPED(&udp->udp_v6src) && + !IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)) { + TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, + "udp_wput_end: q %p (%S)", q, "badaddr"); + error = EADDRNOTAVAIL; + goto ud_error; + } + /* Send IPv4 packet without modifying udp_ipversion */ + /* Extract port and ipaddr */ + port = sin6->sin6_port; + IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, v4dst); + srcid = sin6->__sin6_src_id; + break; + + case AF_INET: + sin = (sin_t *)addr; + if (!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t) || + sin->sin_family != AF_INET) { + TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, + "udp_wput_end: q %p (%S)", q, "badaddr"); + error = EADDRNOTAVAIL; + goto ud_error; + } + /* Extract port and ipaddr */ + port = sin->sin_port; + v4dst = sin->sin_addr.s_addr; + srcid = 0; + break; + } + + /* + * If options passed in, feed it for verification and handling + */ + if (optlen != 0) { + ASSERT(DB_TYPE(mp) != M_DATA); + if (udp_unitdata_opt_process(q, mp, &error, NULL) < 0) { + /* failure */ + TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, + "udp_wput_end: q %p (%S)", q, + "udp_unitdata_opt_process"); + goto ud_error; + } + /* + * Note: success in processing options. + * mp option buffer represented by + * OPT_length/offset now potentially modified + * and contain option setting results + */ + } + ASSERT(error == 0); + mp = udp_output_v4(connp, mp, v4dst, port, srcid, &error); + if (error != 0) { +ud_error: + UDP_STAT(udp_out_err_output); + ASSERT(mp != NULL); + /* mp is freed by the following routine */ + udp_ud_err(q, mp, (uchar_t *)addr, (t_scalar_t)addrlen, + (t_scalar_t)error); + } +} + +/* ARGSUSED */ +static void +udp_output_wrapper(void *arg, mblk_t *mp, void *arg2) +{ + udp_output((conn_t *)arg, mp, NULL, 0); + _UDP_EXIT((conn_t *)arg); +} + +static void +udp_wput(queue_t *q, mblk_t *mp) +{ + _UDP_ENTER(Q_TO_CONN(UDP_WR(q)), mp, udp_output_wrapper, + SQTAG_UDP_WPUT); +} + +/* + * Allocate and prepare a T_UNITDATA_REQ message. 
+ */ +static mblk_t * +udp_tudr_alloc(struct sockaddr *addr, socklen_t addrlen) +{ + struct T_unitdata_req *tudr; + mblk_t *mp; + + mp = allocb(sizeof (*tudr) + addrlen, BPRI_MED); + if (mp != NULL) { + mp->b_wptr += sizeof (*tudr) + addrlen; + DB_TYPE(mp) = M_PROTO; + + tudr = (struct T_unitdata_req *)mp->b_rptr; + tudr->PRIM_type = T_UNITDATA_REQ; + tudr->DEST_length = addrlen; + tudr->DEST_offset = (t_scalar_t)sizeof (*tudr); + tudr->OPT_length = 0; + tudr->OPT_offset = 0; + bcopy(addr, tudr+1, addrlen); + } + return (mp); +} + +/* + * Entry point for sockfs when udp is in "direct sockfs" mode. This mode + * is valid when we are directly beneath the stream head, and thus sockfs + * is able to bypass STREAMS and directly call us, passing along the sockaddr + * structure without the cumbersome T_UNITDATA_REQ interface. Note that + * this is done for both connected and non-connected endpoint. + */ +void +udp_wput_data(queue_t *q, mblk_t *mp, struct sockaddr *addr, socklen_t addrlen) +{ + conn_t *connp; + udp_t *udp; + + q = UDP_WR(q); + connp = Q_TO_CONN(q); + udp = connp->conn_udp; + + /* udpsockfs should only send down M_DATA for this entry point */ + ASSERT(DB_TYPE(mp) == M_DATA); + + mutex_enter(&connp->conn_lock); + UDP_MODE_ASSERTIONS(udp, UDP_ENTER); + + if (udp->udp_mode != UDP_MT_HOT) { + /* + * We can't enter this conn right away because another + * thread is currently executing as writer; therefore we + * need to deposit the message into the squeue to be + * drained later. If a socket address is present, we + * need to create a T_UNITDATA_REQ message as placeholder. + */ + if (addr != NULL && addrlen != 0) { + mblk_t *tudr_mp = udp_tudr_alloc(addr, addrlen); + + if (tudr_mp == NULL) { + mutex_exit(&connp->conn_lock); + BUMP_MIB(&udp_mib, udpOutErrors); + UDP_STAT(udp_out_err_tudr); + freemsg(mp); + return; + } + /* Tag the packet with T_UNITDATA_REQ */ + tudr_mp->b_cont = mp; + mp = tudr_mp; + } + mutex_exit(&connp->conn_lock); + udp_enter(connp, mp, udp_output_wrapper, SQTAG_UDP_WPUT); + return; + } + + /* We can execute as reader right away. */ + UDP_READERS_INCREF(udp); + mutex_exit(&connp->conn_lock); + + udp_output(connp, mp, addr, addrlen); + + mutex_enter(&connp->conn_lock); + UDP_MODE_ASSERTIONS(udp, UDP_EXIT); + UDP_READERS_DECREF(udp); + mutex_exit(&connp->conn_lock); +} + +/* + * udp_output_v6(): + * Assumes that udp_wput did some sanity checking on the destination + * address. + */ +static mblk_t * +udp_output_v6(conn_t *connp, mblk_t *mp, sin6_t *sin6, t_scalar_t tudr_optlen, + int *error) +{ + ip6_t *ip6h; + ip6i_t *ip6i; /* mp1->b_rptr even if no ip6i_t */ + mblk_t *mp1 = (DB_TYPE(mp) == M_DATA ? mp : mp->b_cont); + mblk_t *mp2; + int udp_ip_hdr_len = IPV6_HDR_LEN + UDPH_SIZE; + size_t ip_len; + udpha_t *udph; + udp_t *udp = connp->conn_udp; + queue_t *q = connp->conn_wq; + ip6_pkt_t ipp_s; /* For ancillary data options */ + ip6_pkt_t *ipp = &ipp_s; + ip6_pkt_t *tipp; /* temporary ipp */ + uint32_t csum = 0; + uint_t ignore = 0; + uint_t option_exists = 0, is_sticky = 0; + uint8_t *cp; + uint8_t *nxthdr_ptr; + + *error = 0; + + /* mp1 points to the M_DATA mblk carrying the packet */ + ASSERT(mp1 != NULL && DB_TYPE(mp1) == M_DATA); + ASSERT(tudr_optlen == 0 || DB_TYPE(mp) != M_DATA); /* * If the local address is a mapped address return @@ -5354,9 +6557,8 @@ udp_wput_ipv6(queue_t *q, mblk_t *mp, sin6_t *sin6, t_scalar_t tudr_optlen) * since it is bound to a mapped address. 
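udp_wput_data() above is the concurrency-sensitive entry point: in UDP_MT_HOT mode a sender runs immediately as one of many concurrent readers, while an active writer forces the message to be deposited for later draining, with a freshly built T_UNITDATA_REQ as the address carrier. The admission logic skeletonized; do_send() is a placeholder for udp_output():

    mutex_enter(&connp->conn_lock);
    if (udp->udp_mode != UDP_MT_HOT) {
            /* a writer owns the conn: queue the message, don't block */
            mutex_exit(&connp->conn_lock);
            udp_enter(connp, mp, udp_output_wrapper, SQTAG_UDP_WPUT);
            return;
    }
    UDP_READERS_INCREF(udp);        /* run concurrently as a reader */
    mutex_exit(&connp->conn_lock);

    do_send(connp, mp);             /* placeholder */

    mutex_enter(&connp->conn_lock);
    UDP_READERS_DECREF(udp);        /* may let a waiting writer proceed */
    mutex_exit(&connp->conn_lock);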
*/ if (IN6_IS_ADDR_V4MAPPED(&udp->udp_v6src)) { - BUMP_MIB(&udp_mib, udpOutErrors); - udp_ud_err(q, mp, EADDRNOTAVAIL); - return; + *error = EADDRNOTAVAIL; + goto done; } ipp->ipp_fields = 0; @@ -5366,17 +6568,12 @@ udp_wput_ipv6(queue_t *q, mblk_t *mp, sin6_t *sin6, t_scalar_t tudr_optlen) * If TPI options passed in, feed it for verification and handling */ if (tudr_optlen != 0) { - int error; - - if (udp_unitdata_opt_process(q, mp, &error, - (void *)ipp) < 0) { + if (udp_unitdata_opt_process(q, mp, error, (void *)ipp) < 0) { /* failure */ - BUMP_MIB(&udp_mib, udpOutErrors); - udp_ud_err(q, mp, error); - return; + goto done; } ignore = ipp->ipp_sticky_ignored; - ASSERT(error == 0); + ASSERT(*error == 0); } if (sin6->sin6_scope_id != 0 && @@ -5389,8 +6586,7 @@ udp_wput_ipv6(queue_t *q, mblk_t *mp, sin6_t *sin6, t_scalar_t tudr_optlen) option_exists |= IPPF_SCOPE_ID; } - if ((udp->udp_sticky_ipp.ipp_fields == 0) && - (ipp->ipp_fields == 0)) { + if ((udp->udp_sticky_ipp.ipp_fields == 0) && (ipp->ipp_fields == 0)) { /* No sticky options nor ancillary data. */ goto no_options; } @@ -5475,7 +6671,8 @@ udp_wput_ipv6(queue_t *q, mblk_t *mp, sin6_t *sin6, t_scalar_t tudr_optlen) if (!(ignore & IPPF_USE_MIN_MTU)) { if (ipp->ipp_fields & IPPF_USE_MIN_MTU) { option_exists |= IPPF_USE_MIN_MTU; - } else if (udp->udp_sticky_ipp.ipp_fields & IPPF_USE_MIN_MTU) { + } else if (udp->udp_sticky_ipp.ipp_fields & + IPPF_USE_MIN_MTU) { option_exists |= IPPF_USE_MIN_MTU; is_sticky |= IPPF_USE_MIN_MTU; } @@ -5518,26 +6715,28 @@ no_options: udp_ip_hdr_len += sizeof (ip6i_t); /* check/fix buffer config, setup pointers into it */ - mp1 = mp->b_cont; ip6h = (ip6_t *)&mp1->b_rptr[-udp_ip_hdr_len]; - if ((mp1->b_datap->db_ref != 1) || - ((unsigned char *)ip6h < mp1->b_datap->db_base) || + if (DB_REF(mp1) != 1 || ((unsigned char *)ip6h < DB_BASE(mp1)) || !OK_32PTR(ip6h)) { /* Try to get everything in a single mblk next time */ if (udp_ip_hdr_len > udp->udp_max_hdr_len) { udp->udp_max_hdr_len = udp_ip_hdr_len; - (void) mi_set_sth_wroff(RD(q), + (void) mi_set_sth_wroff(UDP_RD(q), udp->udp_max_hdr_len + udp_wroff_extra); } - mp1 = allocb(udp_ip_hdr_len + udp_wroff_extra, BPRI_LO); - if (!mp1) { - BUMP_MIB(&udp_mib, udpOutErrors); - udp_ud_err(q, mp, ENOMEM); - return; + mp2 = allocb(udp_ip_hdr_len + udp_wroff_extra, BPRI_LO); + if (mp2 == NULL) { + *error = ENOMEM; + goto done; } - mp1->b_cont = mp->b_cont; - mp->b_cont = mp1; - mp1->b_wptr = mp1->b_datap->db_lim; + mp2->b_wptr = DB_LIM(mp2); + mp2->b_cont = mp1; + mp1 = mp2; + if (DB_TYPE(mp) != M_DATA) + mp->b_cont = mp1; + else + mp = mp1; + ip6h = (ip6_t *)(mp1->b_wptr - udp_ip_hdr_len); } mp1->b_rptr = (unsigned char *)ip6h; @@ -5624,7 +6823,7 @@ no_options: if (sin6->__sin6_src_id != 0 && IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) { ip_srcid_find_id(sin6->__sin6_src_id, - &ip6h->ip6_src, udp->udp_zoneid); + &ip6h->ip6_src, connp->conn_zoneid); } } @@ -5731,9 +6930,8 @@ no_options: * Drop packet - only support Type 0 routing. * Notify the application as well. */ - udp_ud_err(q, mp, EPROTO); - BUMP_MIB(&udp_mib, udpOutErrors); - return; + *error = EPROTO; + goto done; } /* @@ -5741,9 +6939,8 @@ no_options: * addresses in the header. Thus it must be even. */ if (rth->ip6r_len & 0x1) { - udp_ud_err(q, mp, EPROTO); - BUMP_MIB(&udp_mib, udpOutErrors); - return; + *error = EPROTO; + goto done; } /* * Shuffle the routing header and ip6_dst @@ -5758,9 +6955,8 @@ no_options: * for subsequent hops. 
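Option assembly in udp_output_v6() above merges two sources per option: ancillary data supplied with this send (ipp) and sticky options set earlier via setsockopt (udp_sticky_ipp), with ancillary data winning and the ignore mask suppressing sticky options the caller overrode. The per-option pattern, using IPPF_USE_MIN_MTU exactly as above:

    if (!(ignore & IPPF_USE_MIN_MTU)) {
            if (ipp->ipp_fields & IPPF_USE_MIN_MTU) {
                    option_exists |= IPPF_USE_MIN_MTU;      /* ancillary */
            } else if (udp->udp_sticky_ipp.ipp_fields &
                IPPF_USE_MIN_MTU) {
                    option_exists |= IPPF_USE_MIN_MTU;      /* sticky */
                    is_sticky |= IPPF_USE_MIN_MTU;
            }
    }

The is_sticky bits later select which ip6_pkt_t each option is copied from when the headers are laid out.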
*/ if (IN6_IS_ADDR_V4MAPPED(&ip6h->ip6_dst)) { - udp_ud_err(q, mp, EADDRNOTAVAIL); - BUMP_MIB(&udp_mib, udpOutErrors); - return; + *error = EADDRNOTAVAIL; + goto done; } cp += (rth->ip6r_len + 1)*8; @@ -5769,14 +6965,11 @@ no_options: /* count up length of UDP packet */ ip_len = (mp1->b_wptr - (unsigned char *)ip6h) - IPV6_HDR_LEN; - { - mblk_t *mp2; - - if ((mp2 = mp1->b_cont) != NULL) { - do { - ip_len += mp2->b_wptr - mp2->b_rptr; - } while ((mp2 = mp2->b_cont) != NULL); - } + if ((mp2 = mp1->b_cont) != NULL) { + do { + ASSERT((uintptr_t)MBLKL(mp2) <= (uintptr_t)UINT_MAX); + ip_len += (uint32_t)MBLKL(mp2); + } while ((mp2 = mp2->b_cont) != NULL); } /* @@ -5785,9 +6978,8 @@ no_options: * the size will have wrapped and be inconsistent with the msg size. */ if (ip_len > IP_MAXPACKET) { - BUMP_MIB(&udp_mib, udpOutErrors); - udp_ud_err(q, mp, EMSGSIZE); - return; + *error = EMSGSIZE; + goto done; } /* Store the UDP length. Subtract length of extension hdrs */ @@ -5810,11 +7002,25 @@ no_options: #endif ip6h->ip6_plen = ip_len; - freeb(mp); + if (DB_TYPE(mp) != M_DATA) { + ASSERT(mp != mp1); + freeb(mp); + } + + /* mp has been consumed and we'll return success */ + ASSERT(*error == 0); + mp = NULL; /* We're done. Pass the packet to IP */ BUMP_MIB(&udp_mib, udpOutDatagrams); - putnext(q, mp1); + ip_output_v6(connp, mp1, q, IP_WPUT); + +done: + if (*error != 0) { + ASSERT(mp != NULL); + BUMP_MIB(&udp_mib, udpOutErrors); + } + return (mp); } static void @@ -5823,26 +7029,18 @@ udp_wput_other(queue_t *q, mblk_t *mp) uchar_t *rptr = mp->b_rptr; struct datab *db; struct iocblk *iocp; - udp_t *udp; cred_t *cr; + conn_t *connp = Q_TO_CONN(q); + udp_t *udp = connp->conn_udp; TRACE_1(TR_FAC_UDP, TR_UDP_WPUT_OTHER_START, "udp_wput_other_start: q %p", q); - udp = (udp_t *)q->q_ptr; db = mp->b_datap; - cr = DB_CREDDEF(mp, udp->udp_credp); + cr = DB_CREDDEF(mp, connp->conn_cred); switch (db->db_type) { - case M_DATA: - /* Not connected */ - BUMP_MIB(&udp_mib, udpOutErrors); - freemsg(mp); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", - q, "not-connected"); - return; case M_PROTO: case M_PCPROTO: if (mp->b_wptr - rptr < sizeof (t_scalar_t)) { @@ -5852,7 +7050,7 @@ udp_wput_other(queue_t *q, mblk_t *mp) q, "protoshort"); return; } - switch (((union T_primitives *)rptr)->type) { + switch (((t_primp_t)rptr)->type) { case T_ADDR_REQ: udp_addr_req(q, mp); TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, @@ -5885,7 +7083,7 @@ udp_wput_other(queue_t *q, mblk_t *mp) * be bad. Valid T_UNITDATA_REQs are handled * in udp_wput. */ - udp_ud_err(q, mp, EADDRNOTAVAIL); + udp_ud_err(q, mp, NULL, 0, EADDRNOTAVAIL); TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, "udp_wput_other_end: q %p (%S)", q, "unitdatareq"); @@ -5897,14 +7095,26 @@ udp_wput_other(queue_t *q, mblk_t *mp) return; case T_SVR4_OPTMGMT_REQ: if (!snmpcom_req(q, mp, udp_snmp_set, udp_snmp_get, cr)) - (void) svr4_optcom_req(q, mp, cr, &udp_opt_obj); + /* + * Use upper queue for option processing in + * case the request is not handled at this + * level and needs to be passed down to IP. + */ + (void) svr4_optcom_req(_WR(UDP_RD(q)), + mp, cr, &udp_opt_obj); TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, "udp_wput_other_end: q %p (%S)", q, "optmgmtreq"); return; case T_OPTMGMT_REQ: - (void) tpi_optcom_req(q, mp, cr, &udp_opt_obj); + /* + * Use upper queue for option processing in + * case the request is not handled at this + * level and needs to be passed down to IP. 
+ */ + (void) tpi_optcom_req(_WR(UDP_RD(q)), + mp, cr, &udp_opt_obj); TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, "udp_wput_other_end: q %p (%S)", q, "optmgmtreq"); @@ -5954,10 +7164,9 @@ udp_wput_other(queue_t *q, mblk_t *mp) * don't know the peer's name. */ iocp->ioc_error = ENOTCONN; -err_ret:; iocp->ioc_count = 0; mp->b_datap->db_type = M_IOCACK; - qreply(q, mp); + putnext(UDP_RD(q), mp); TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, "udp_wput_other_end: q %p (%S)", q, "getpeername"); @@ -5982,13 +7191,45 @@ err_ret:; /* nd_getset performs the necessary checking */ case ND_GET: if (nd_getset(q, udp_g_nd, mp)) { - qreply(q, mp); + putnext(UDP_RD(q), mp); TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, "udp_wput_other_end: q %p (%S)", q, "get"); return; } break; + case _SIOCSOCKFALLBACK: + /* + * Either sockmod is about to be popped and the + * socket would now be treated as a plain stream, + * or a module is about to be pushed so we could + * no longer use read-side synchronous stream. + * Drain any queued data and disable direct sockfs + * interface from now on. + */ + if (!udp->udp_issocket) { + DB_TYPE(mp) = M_IOCNAK; + iocp->ioc_error = EINVAL; + } else { + udp->udp_issocket = B_FALSE; + if (udp->udp_direct_sockfs) { + /* + * Disable read-side synchronous + * stream interface and drain any + * queued data. + */ + udp_rcv_drain(UDP_RD(q), udp, + B_FALSE); + ASSERT(!udp->udp_direct_sockfs); + UDP_STAT(udp_sock_fallback); + } + DB_TYPE(mp) = M_IOCACK; + iocp->ioc_error = 0; + } + iocp->ioc_count = 0; + iocp->ioc_rval = 0; + putnext(UDP_RD(q), mp); + return; default: break; } @@ -6004,7 +7245,15 @@ err_ret:; } TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, "udp_wput_other_end: q %p (%S)", q, "end"); - putnext(q, mp); + ip_output(connp, mp, q, IP_WPUT); +} + +/* ARGSUSED */ +static void +udp_wput_other_wrapper(void *arg, mblk_t *mp, void *arg2) +{ + udp_wput_other(((conn_t *)arg)->conn_wq, mp); + udp_exit((conn_t *)arg); } /* @@ -6017,11 +7266,11 @@ udp_wput_iocdata(queue_t *q, mblk_t *mp) mblk_t *mp1; STRUCT_HANDLE(strbuf, sb); uint16_t port; - udp_t *udp; in6_addr_t v6addr; ipaddr_t v4addr; uint32_t flowinfo = 0; int addrlen; + udp_t *udp = Q_TO_UDP(q); /* Make sure it is one of ours. 
*/ switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { @@ -6029,9 +7278,11 @@ udp_wput_iocdata(queue_t *q, mblk_t *mp) case TI_GETPEERNAME: break; default: - putnext(q, mp); + ip_output(Q_TO_CONN(q), mp, q, IP_WPUT); return; } + + q = WR(UDP_RD(q)); switch (mi_copy_state(q, mp, &mp1)) { case -1: return; @@ -6068,7 +7319,6 @@ udp_wput_iocdata(queue_t *q, mblk_t *mp) */ STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag, (void *)mp1->b_rptr); - udp = (udp_t *)q->q_ptr; if (udp->udp_family == AF_INET) addrlen = sizeof (sin_t); else @@ -6113,6 +7363,10 @@ udp_wput_iocdata(queue_t *q, mblk_t *mp) port = udp->udp_port; break; case TI_GETPEERNAME: + if (udp->udp_state != TS_DATA_XFER) { + mi_copy_done(q, mp, ENOTCONN); + return; + } if (udp->udp_family == AF_INET) { ASSERT(udp->udp_ipversion == IPV4_VERSION); v4addr = V4_PART_OF_V6(udp->udp_v6dst); @@ -6163,21 +7417,23 @@ static int udp_unitdata_opt_process(queue_t *q, mblk_t *mp, int *errorp, void *thisdg_attrs) { - udp_t *udp; struct T_unitdata_req *udreqp; int is_absreq_failure; cred_t *cr; + conn_t *connp = Q_TO_CONN(q); - ASSERT(((union T_primitives *)mp->b_rptr)->type); + ASSERT(((t_primp_t)mp->b_rptr)->type); - udp = (udp_t *)q->q_ptr; - - cr = DB_CREDDEF(mp, udp->udp_credp); + cr = DB_CREDDEF(mp, connp->conn_cred); udreqp = (struct T_unitdata_req *)mp->b_rptr; *errorp = 0; - *errorp = tpi_optcom_buf(q, mp, &udreqp->OPT_length, + /* + * Use upper queue for option processing since the callback + * routines expect to be called in UDP instance instead of IP. + */ + *errorp = tpi_optcom_buf(_WR(UDP_RD(q)), mp, &udreqp->OPT_length, udreqp->OPT_offset, cr, &udp_opt_obj, thisdg_attrs, &is_absreq_failure); @@ -6198,7 +7454,6 @@ udp_ddi_init(void) int i; UDP6_MAJ = ddi_name_to_major(UDP6); - mutex_init(&udp_g_lock, NULL, MUTEX_DEFAULT, NULL); udp_max_optsize = optcom_max_optsize(udp_opt_obj.odb_opt_des_arr, udp_opt_obj.odb_opt_arr_cnt); @@ -6218,7 +7473,11 @@ udp_ddi_init(void) NULL); } (void) udp_param_register(udp_param_arr, A_CNT(udp_param_arr)); + udp_kstat_init(); + + udp_cache = kmem_cache_create("udp_cache", sizeof (udp_t), + CACHE_ALIGN_SIZE, NULL, NULL, NULL, NULL, NULL, 0); } void @@ -6228,14 +7487,16 @@ udp_ddi_destroy(void) nd_free(&udp_g_nd); - mutex_destroy(&udp_g_lock); for (i = 0; i < udp_bind_fanout_size; i++) { mutex_destroy(&udp_bind_fanout[i].uf_lock); } + kmem_free(udp_bind_fanout, udp_bind_fanout_size * sizeof (udp_fanout_t)); + udp_kstat_fini(); + kmem_cache_destroy(udp_cache); } static void @@ -6250,9 +7511,9 @@ udp_kstat_init(void) { "outErrors", KSTAT_DATA_UINT32, 0 }, }; - udp_mibkp = kstat_create("udp", 0, "udp", "mib2", KSTAT_TYPE_NAMED, - NUM_OF_FIELDS(udp_named_kstat_t), - 0); + udp_mibkp = kstat_create(UDP_MOD_NAME, 0, UDP_MOD_NAME, + "mib2", KSTAT_TYPE_NAMED, NUM_OF_FIELDS(udp_named_kstat_t), 0); + if (udp_mibkp == NULL) return; @@ -6264,12 +7525,24 @@ udp_kstat_init(void) udp_mibkp->ks_update = udp_kstat_update; kstat_install(udp_mibkp); + + if ((udp_ksp = kstat_create(UDP_MOD_NAME, 0, "udpstat", + "net", KSTAT_TYPE_NAMED, + sizeof (udp_statistics) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL)) != NULL) { + udp_ksp->ks_data = &udp_statistics; + kstat_install(udp_ksp); + } } static void udp_kstat_fini(void) { - if (udp_mibkp) { + if (udp_ksp != NULL) { + kstat_delete(udp_ksp); + udp_ksp = NULL; + } + if (udp_mibkp != NULL) { kstat_delete(udp_mibkp); udp_mibkp = NULL; } @@ -6296,6 +7569,269 @@ udp_kstat_update(kstat_t *kp, int rw) return (0); } +/* ARGSUSED */ +static void +udp_rput(queue_t *q, mblk_t *mp) 
+{
+    /*
+     * We get here whenever we do qreply() from IP,
+     * i.e. as part of handling ioctls, etc.
+     */
+    putnext(q, mp);
+}
+
+/*
+ * Read-side synchronous stream info entry point, called as a
+ * result of handling certain STREAMS ioctl operations.
+ */
+static int
+udp_rinfop(queue_t *q, infod_t *dp)
+{
+    mblk_t *mp;
+    uint_t cmd = dp->d_cmd;
+    int res = 0;
+    int error = 0;
+    udp_t *udp = Q_TO_UDP(RD(UDP_WR(q)));
+    struct stdata *stp = STREAM(q);
+
+    mutex_enter(&udp->udp_drain_lock);
+    /* If shutdown on read has happened, return nothing */
+    mutex_enter(&stp->sd_lock);
+    if (stp->sd_flag & STREOF) {
+        mutex_exit(&stp->sd_lock);
+        goto done;
+    }
+    mutex_exit(&stp->sd_lock);
+
+    if ((mp = udp->udp_rcv_list_head) == NULL)
+        goto done;
+
+    ASSERT(DB_TYPE(mp) != M_DATA && mp->b_cont != NULL);
+
+    if (cmd & INFOD_COUNT) {
+        /*
+         * Return the number of messages.
+         */
+        dp->d_count += udp->udp_rcv_msgcnt;
+        res |= INFOD_COUNT;
+    }
+    if (cmd & INFOD_BYTES) {
+        /*
+         * Return size of all data messages.
+         */
+        dp->d_bytes += udp->udp_rcv_cnt;
+        res |= INFOD_BYTES;
+    }
+    if (cmd & INFOD_FIRSTBYTES) {
+        /*
+         * Return size of first data message.
+         */
+        dp->d_bytes = msgdsize(mp);
+        res |= INFOD_FIRSTBYTES;
+        dp->d_cmd &= ~INFOD_FIRSTBYTES;
+    }
+    if (cmd & INFOD_COPYOUT) {
+        mblk_t *mp1 = mp->b_cont;
+        int n;
+        /*
+         * Return data contents of first message.
+         */
+        ASSERT(DB_TYPE(mp1) == M_DATA);
+        while (mp1 != NULL && dp->d_uiop->uio_resid > 0) {
+            n = MIN(dp->d_uiop->uio_resid, MBLKL(mp1));
+            if (n != 0 && (error = uiomove((char *)mp1->b_rptr, n,
+                UIO_READ, dp->d_uiop)) != 0) {
+                goto done;
+            }
+            mp1 = mp1->b_cont;
+        }
+        res |= INFOD_COPYOUT;
+        dp->d_cmd &= ~INFOD_COPYOUT;
+    }
+done:
+    mutex_exit(&udp->udp_drain_lock);
+
+    dp->d_res |= res;
+
+    return (error);
+}
+
+/*
+ * Read-side synchronous stream entry point. This is called as a result
+ * of a recv/read operation done at sockfs, and is guaranteed to execute
+ * outside of the interrupt thread context. It returns a single datagram
+ * (b_cont chain of T_UNITDATA_IND plus data) to the upper layer.
+ */
+static int
+udp_rrw(queue_t *q, struiod_t *dp)
+{
+    mblk_t *mp;
+    udp_t *udp = Q_TO_UDP(_RD(UDP_WR(q)));
+
+    /* We should never get here when we're in SNMP mode */
+    ASSERT(!(udp->udp_connp->conn_flags & IPCL_UDPMOD));
+
+    /*
+     * Dequeue the datagram from the head of the list and return
+     * it to the caller; also ensure that the RSLEEP sd_wakeq flag
+     * is set/cleared depending on whether or not there's data
+     * remaining in the list.
+     */
+    mutex_enter(&udp->udp_drain_lock);
+    if (!udp->udp_direct_sockfs) {
+        mutex_exit(&udp->udp_drain_lock);
+        UDP_STAT(udp_rrw_busy);
+        return (EBUSY);
+    }
+    if ((mp = udp->udp_rcv_list_head) != NULL) {
+        uint_t size = msgdsize(mp);
+
+        /* Last datagram in the list? */
+        if ((udp->udp_rcv_list_head = mp->b_next) == NULL)
+            udp->udp_rcv_list_tail = NULL;
+        mp->b_next = NULL;
+
+        udp->udp_rcv_cnt -= size;
+        udp->udp_rcv_msgcnt--;
+        UDP_STAT(udp_rrw_msgcnt);
+
+        /* No longer flow-controlling? */
+        if (udp->udp_rcv_cnt < udp->udp_rcv_hiwat &&
+            udp->udp_rcv_msgcnt < udp->udp_rcv_hiwat)
+            udp->udp_drain_qfull = B_FALSE;
+    }
+    if (udp->udp_rcv_list_head == NULL) {
+        /*
+         * Either we just dequeued the last datagram or
+         * we get here from sockfs and have nothing to
+         * return; in this case clear RSLEEP.
+         */
+        ASSERT(udp->udp_rcv_cnt == 0);
+        ASSERT(udp->udp_rcv_msgcnt == 0);
+        ASSERT(udp->udp_rcv_list_tail == NULL);
+        STR_WAKEUP_CLEAR(STREAM(q));
+    } else {
+        /*
+         * More data follows; we need udp_rrw() to be
+         * called in the future to pick up the rest.
+         */
+        STR_WAKEUP_SET(STREAM(q));
+    }
+    mutex_exit(&udp->udp_drain_lock);
+    dp->d_mp = mp;
+    return (0);
+}
+
+/*
+ * Enqueue a completely-built T_UNITDATA_IND message into the receive
+ * list; this is typically executed within the interrupt thread context
+ * and so we do things as quickly as possible.
+ */
+static void
+udp_rcv_enqueue(queue_t *q, udp_t *udp, mblk_t *mp, uint_t pkt_len)
+{
+    ASSERT(q == RD(q));
+    ASSERT(pkt_len == msgdsize(mp));
+    ASSERT(mp->b_next == NULL && mp->b_cont != NULL);
+    ASSERT(DB_TYPE(mp) == M_PROTO && DB_TYPE(mp->b_cont) == M_DATA);
+    ASSERT(MBLKL(mp) >= sizeof (struct T_unitdata_ind));
+
+    mutex_enter(&udp->udp_drain_lock);
+    /*
+     * Wake up and signal the receiving app; it is okay to do this
+     * before enqueueing the mp because we are holding the drain lock.
+     * One of the advantages of a synchronous stream is the ability
+     * to find out when the application performs a read on the socket,
+     * by way of the udp_rrw() entry point being called. We need to
+     * generate SIGPOLL/SIGIO for each received datagram in the case
+     * of an asynchronous socket, just as in the strrput() case.
+     * However, we only wake the application up when necessary, i.e.
+     * during the first enqueue. When udp_rrw() is called, we send a
+     * single datagram upstream and call STR_WAKEUP_SET() again when
+     * there is still data remaining in our receive queue.
+     */
+    if (udp->udp_rcv_list_head == NULL) {
+        STR_WAKEUP_SET(STREAM(q));
+        udp->udp_rcv_list_head = mp;
+    } else {
+        udp->udp_rcv_list_tail->b_next = mp;
+    }
+    udp->udp_rcv_list_tail = mp;
+    udp->udp_rcv_cnt += pkt_len;
+    udp->udp_rcv_msgcnt++;
+
+    /* Need to flow-control? */
+    if (udp->udp_rcv_cnt >= udp->udp_rcv_hiwat ||
+        udp->udp_rcv_msgcnt >= udp->udp_rcv_hiwat)
+        udp->udp_drain_qfull = B_TRUE;
+
+    /* Update poll events and send SIGPOLL/SIGIO if necessary */
+    STR_SENDSIG(STREAM(q));
+    mutex_exit(&udp->udp_drain_lock);
+}
+
+/*
+ * Drain the contents of the receive list to the module upstream; we
+ * do this during close or when we fall back to the slow mode due to
+ * sockmod being popped or a module being pushed on top of us.
+ */
+static void
+udp_rcv_drain(queue_t *q, udp_t *udp, boolean_t closing)
+{
+    mblk_t *mp;
+
+    ASSERT(q == RD(q));
+
+    mutex_enter(&udp->udp_drain_lock);
+    /*
+     * There is no race with a concurrent udp_input() sending
+     * up packets using putnext() after we have cleared the
+     * udp_direct_sockfs flag but before we have completed
+     * sending up the packets in udp_rcv_list, since we are
+     * either a writer or we have quiesced the conn.
+     */
+    udp->udp_direct_sockfs = B_FALSE;
+    mutex_exit(&udp->udp_drain_lock);
+
+    if (udp->udp_rcv_list_head != NULL)
+        UDP_STAT(udp_drain);
+
+    /*
+     * Send up everything via putnext(); note here that we
+     * don't need the udp_drain_lock to protect us since
+     * nothing can enter udp_rrw(), and we currently
+     * have exclusive access to this udp.
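To see the watermark discipline of udp_rcv_enqueue()/udp_rrw() above in isolation: byte and message counts rise on enqueue and set the qfull flag once either reaches the high watermark, and both must drop back below it on dequeue before the flag clears. A small user-land model of just that discipline (msg_t/rcvq_t are simplified stand-ins, not the kernel structures):

    #include <stdio.h>
    #include <stddef.h>
    #include <stdbool.h>

    typedef struct msg { struct msg *next; size_t len; } msg_t;

    typedef struct {
        msg_t *head, *tail;
        size_t bytes, msgs, hiwat;
        bool qfull;
    } rcvq_t;

    static void
    rcv_enqueue(rcvq_t *q, msg_t *m)
    {
        m->next = NULL;
        if (q->head == NULL)
            q->head = m;        /* first message: would wake the reader */
        else
            q->tail->next = m;
        q->tail = m;
        q->bytes += m->len;
        q->msgs++;
        if (q->bytes >= q->hiwat || q->msgs >= q->hiwat)
            q->qfull = true;    /* start flow-controlling */
    }

    static msg_t *
    rcv_dequeue(rcvq_t *q)
    {
        msg_t *m = q->head;

        if (m == NULL)
            return (NULL);
        if ((q->head = m->next) == NULL)
            q->tail = NULL;
        q->bytes -= m->len;
        q->msgs--;
        if (q->bytes < q->hiwat && q->msgs < q->hiwat)
            q->qfull = false;   /* no longer flow-controlled */
        return (m);
    }

    int
    main(void)
    {
        msg_t a = { NULL, 6000 }, b = { NULL, 3000 };
        rcvq_t q = { NULL, NULL, 0, 0, 8192, false };

        rcv_enqueue(&q, &a);
        rcv_enqueue(&q, &b);        /* 9000 bytes >= 8192: qfull */
        printf("qfull=%d\n", q.qfull);
        (void) rcv_dequeue(&q);     /* back down to 3000 bytes */
        printf("qfull=%d\n", q.qfull);
        return (0);
    }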
+ */ + while ((mp = udp->udp_rcv_list_head) != NULL) { + udp->udp_rcv_list_head = mp->b_next; + mp->b_next = NULL; + udp->udp_rcv_cnt -= msgdsize(mp); + udp->udp_rcv_msgcnt--; + if (closing) { + freemsg(mp); + } else { + putnext(q, mp); + } + } + ASSERT(udp->udp_rcv_cnt == 0); + ASSERT(udp->udp_rcv_msgcnt == 0); + ASSERT(udp->udp_rcv_list_head == NULL); + udp->udp_rcv_list_tail = NULL; + udp->udp_drain_qfull = B_FALSE; +} + +static size_t +udp_set_rcv_hiwat(udp_t *udp, size_t size) +{ + /* We add a bit of extra buffering */ + size += size >> 1; + if (size > udp_max_buf) + size = udp_max_buf; + + udp->udp_rcv_hiwat = size; + return (size); +} + /* * Little helper for IPsec's NAT-T processing. */ diff --git a/usr/src/uts/common/inet/udp/udp6ddi.c b/usr/src/uts/common/inet/udp/udp6ddi.c index 277aa3b970..c5b203c654 100644 --- a/usr/src/uts/common/inet/udp/udp6ddi.c +++ b/usr/src/uts/common/inet/udp/udp6ddi.c @@ -20,7 +20,7 @@ * CDDL HEADER END */ /* - * Copyright 1992,1997-2002 Sun Microsystems, Inc. All rights reserved. + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -36,7 +36,13 @@ #define INET_DEVMINOR IPV6_MINOR #define INET_DEVDESC "UDP6 STREAMS driver %I%" #define INET_STRTAB udpinfo -#define INET_DEVMTFLAGS IP_DEVMTFLAGS /* since we're really ip */ +#define INET_DEVMTFLAGS IP_DEVMTFLAGS +/* + * We define both synchronous STREAMS and sockfs direct-access + * mode for UDP module instance, because it is autopushed on + * top of /dev/ip for the sockets case. + */ +#define INET_MODMTFLAGS (D_MP|D_SYNCSTR|_D_DIRECT) #include "../inetddi.c" diff --git a/usr/src/uts/common/inet/udp/udpddi.c b/usr/src/uts/common/inet/udp/udpddi.c index dcff39b3c9..ad5542295e 100644 --- a/usr/src/uts/common/inet/udp/udpddi.c +++ b/usr/src/uts/common/inet/udp/udpddi.c @@ -20,7 +20,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -32,20 +32,23 @@ #include <sys/modctl.h> #include <inet/common.h> #include <inet/ip.h> +#include <inet/udp_impl.h> #define INET_NAME "udp" #define INET_MODDESC "UDP STREAMS module %I%" #define INET_DEVDESC "UDP STREAMS driver %I%" #define INET_DEVMINOR IPV4_MINOR #define INET_STRTAB udpinfo -#define INET_DEVMTFLAGS IP_DEVMTFLAGS /* since as a driver we're ip */ -#define INET_MODMTFLAGS (D_MP | D_MTQPAIR | D_MTPUTSHARED | _D_MTOCSHARED) +#define INET_DEVMTFLAGS IP_DEVMTFLAGS +/* + * We define both synchronous STREAMS and sockfs direct-access + * mode for UDP module instance, because it is autopushed on + * top of /dev/ip for the sockets case. + */ +#define INET_MODMTFLAGS (D_MP|D_SYNCSTR|_D_DIRECT) #include "../inetddi.c" -extern void udp_ddi_init(void); -extern void udp_ddi_destroy(void); - int _init(void) { diff --git a/usr/src/uts/common/inet/udp_impl.h b/usr/src/uts/common/inet/udp_impl.h index 8b5c52ba32..66faf934a8 100644 --- a/usr/src/uts/common/inet/udp_impl.h +++ b/usr/src/uts/common/inet/udp_impl.h @@ -20,7 +20,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -29,6 +29,13 @@ #pragma ident "%Z%%M% %I% %E% SMI" +/* + * UDP implementation private declarations. These interfaces are + * used to build the IP module and are not meant to be accessed + * by any modules except IP itself. 
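One numeric detail worth pausing on from the tail of the udp.c changes: udp_set_rcv_hiwat() pads the requested buffer size by half again (size += size >> 1) before clamping it to udp_max_buf, so a 64 KB receive-buffer request actually arms a 96 KB watermark. Restated as a stand-alone sketch (the udp_max_buf value here is illustrative, not the tunable's default):

    #include <stdio.h>
    #include <stddef.h>

    static size_t udp_max_buf = 262144;   /* stand-in for the ndd tunable */

    /* Mirror of the udp_set_rcv_hiwat() arithmetic shown above. */
    static size_t
    set_rcv_hiwat(size_t size)
    {
        size += size >> 1;      /* "a bit of extra buffering": +50% */
        if (size > udp_max_buf)
            size = udp_max_buf;
        return (size);
    }

    int
    main(void)
    {
        printf("%zu -> %zu\n", (size_t)65536, set_rcv_hiwat(65536));
        printf("%zu -> %zu\n", (size_t)262144, set_rcv_hiwat(262144));
        return (0);
    }

The first call reports 98304; the second is clamped to 262144 despite the padding.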
They are undocumented and are + * subject to change without notice. + */ + #ifdef __cplusplus extern "C" { #endif @@ -43,32 +50,42 @@ extern "C" { #include <inet/common.h> #include <inet/ip.h> +#define UDP_MOD_ID 5607 + +/* udp_mode. UDP_MT_HOT and UDP_SQUEUE are stable modes. Rest are transient */ +typedef enum { + UDP_MT_HOT = 0, /* UDP endpoint is MT HOT */ + UDP_MT_QUEUED = 1, /* Messages enqueued in udp_mphead */ + UDP_QUEUED_SQUEUE = 2, /* Messages enqueued in conn_sqp */ + UDP_SQUEUE = 3 /* Single threaded using squeues */ +} udp_mode_t; + /* Internal udp control structure, one per open stream */ typedef struct udp_s { - uint32_t udp_state; /* TPI state */ - in_port_t udp_port; /* Port bound to this stream */ - in_port_t udp_dstport; /* Connected port */ - in6_addr_t udp_v6src; /* Source address of this stream */ - in6_addr_t udp_bound_v6src; /* Explicitly bound address */ - in6_addr_t udp_v6dst; /* Connected destination */ + uint32_t udp_state; /* TPI state */ + in_port_t udp_port; /* Port bound to this stream */ + in_port_t udp_dstport; /* Connected port */ + in6_addr_t udp_v6src; /* Source address of this stream */ + in6_addr_t udp_bound_v6src; /* Explicitly bound address */ + in6_addr_t udp_v6dst; /* Connected destination */ uint32_t udp_flowinfo; /* Connected flow id and tclass */ - uint32_t udp_max_hdr_len; /* For write offset in stream head */ + uint32_t udp_max_hdr_len; /* For write offset in stream head */ sa_family_t udp_family; /* Family from socket() call */ /* * IP format that packets transmitted from this struct should use. * Value can be IP4_VERSION or IPV6_VERSION. */ ushort_t udp_ipversion; - uint32_t udp_ip_snd_options_len; /* Len of IPv4 options */ + uint32_t udp_ip_snd_options_len; /* Len of IPv4 options */ uchar_t *udp_ip_snd_options; /* Ptr to IPv4 options */ - uint32_t udp_ip_rcv_options_len; /* Len of IPv4 options recvd */ + uint32_t udp_ip_rcv_options_len; /* Len of IPv4 options recvd */ uchar_t *udp_ip_rcv_options; /* Ptr to IPv4 options recvd */ - cred_t *udp_credp; /* Credentials at open */ uchar_t udp_multicast_ttl; /* IP*_MULTICAST_TTL/HOPS */ - ipaddr_t udp_multicast_if_addr; /* IP_MULTICAST_IF option */ + ipaddr_t udp_multicast_if_addr; /* IP_MULTICAST_IF option */ uint_t udp_multicast_if_index; /* IPV6_MULTICAST_IF option */ int udp_bound_if; /* IP*_BOUND_IF option */ int udp_xmit_if; /* IP_XMIT_IF option */ + conn_t *udp_connp; uint32_t udp_debug : 1, /* SO_DEBUG "socket" option. */ udp_dontroute : 1, /* SO_DONTROUTE "socket" option. */ @@ -76,35 +93,36 @@ typedef struct udp_s { udp_useloopback : 1, /* SO_USELOOPBACK "socket" option */ udp_reuseaddr : 1, /* SO_REUSEADDR "socket" option. 
*/ - udp_multicast_loop : 1, /* IP_MULTICAST_LOOP option */ udp_dgram_errind : 1, /* SO_DGRAM_ERRIND option */ udp_recvdstaddr : 1, /* IP_RECVDSTADDR option */ - udp_recvopts : 1, /* IP_RECVOPTS option */ + udp_discon_pending : 1, /* T_DISCON_REQ in progress */ udp_unspec_source : 1, /* IP*_UNSPEC_SRC option */ udp_ipv6_recvpktinfo : 1, /* IPV6_RECVPKTINFO option */ - udp_ipv6_recvhoplimit : 1, /* IPV6_RECVHOPLIMIT option */ + udp_ipv6_recvhopopts : 1, /* IPV6_RECVHOPOPTS option */ udp_ipv6_recvdstopts : 1, /* IPV6_RECVDSTOPTS option */ udp_ipv6_recvrthdr : 1, /* IPV6_RECVRTHDR option */ - udp_ipv6_recvtclass : 1, /* IPV6_RECVTCLASS */ + udp_ipv6_recvpathmtu : 1, /* IPV6_RECVPATHMTU */ udp_anon_priv_bind : 1, udp_exclbind : 1, /* ``exclusive'' binding */ - udp_recvif : 1, /* IP_RECVIF option */ + udp_recvslla : 1, /* IP_RECVSLLA option */ udp_recvttl : 1, /* IP_RECVTTL option */ udp_recvucred : 1, /* IP_RECVUCRED option */ - udp_old_ipv6_recvdstopts : 1, /* old form of IPV6_DSTOPTS */ - udp_ipv6_recvrthdrdstopts : 1, /* IPV6_RECVRTHDRDSTOPTS */ + udp_ipv6_recvrthdrdstopts : 1, /* IPV6_RECVRTHDRDSTOPTS */ udp_rcvhdr : 1, /* UDP_RCVHDR option */ - udp_pad_to_bit_31 : 7; + udp_issocket : 1, /* socket mode */ + udp_direct_sockfs : 1, /* direct calls to/from sockfs */ + + udp_pad_to_bit_31 : 4; uint8_t udp_type_of_service; /* IP_TOS option */ uint8_t udp_ttl; /* TTL or hoplimit */ @@ -114,7 +132,20 @@ typedef struct udp_s { uint_t udp_sticky_hdrs_len; /* Incl. ip6h and any ip6i */ struct udp_s *udp_bind_hash; /* Bind hash chain */ struct udp_s **udp_ptpbhn; /* Pointer to previous bind hash next. */ - zoneid_t udp_zoneid; /* ID of owning zone */ + udp_mode_t udp_mode; /* Current mode of operation */ + mblk_t *udp_mphead; /* Head of the queued operations */ + mblk_t *udp_mptail; /* Tail of the queued operations */ + uint_t udp_mpcount; /* Number of messages in the queue */ + uint_t udp_reader_count; /* Number of reader threads */ + uint_t udp_squeue_count; /* Number of messages in conn_sqp */ + + kmutex_t udp_drain_lock; /* lock for udp_rcv_list */ + boolean_t udp_drain_qfull; /* drain queue is full */ + mblk_t *udp_rcv_list_head; /* b_next chain of mblks */ + mblk_t *udp_rcv_list_tail; /* last mblk in chain */ + uint_t udp_rcv_cnt; /* total data in rcv_list */ + uint_t udp_rcv_msgcnt; /* total messages in rcv_list */ + size_t udp_rcv_hiwat; /* receive high watermark */ } udp_t; /* UDP Protocol header */ @@ -127,6 +158,92 @@ typedef struct udpahdr_s { } udpha_t; #define UDPH_SIZE 8 +/* Named Dispatch Parameter Management Structure */ +typedef struct udpparam_s { + uint32_t udp_param_min; + uint32_t udp_param_max; + uint32_t udp_param_value; + char *udp_param_name; +} udpparam_t; + +extern udpparam_t udp_param_arr[]; + +#define udp_wroff_extra udp_param_arr[0].udp_param_value +#define udp_ipv4_ttl udp_param_arr[1].udp_param_value +#define udp_ipv6_hoplimit udp_param_arr[2].udp_param_value +#define udp_smallest_nonpriv_port udp_param_arr[3].udp_param_value +#define udp_do_checksum udp_param_arr[4].udp_param_value +#define udp_smallest_anon_port udp_param_arr[5].udp_param_value +#define udp_largest_anon_port udp_param_arr[6].udp_param_value +#define udp_xmit_hiwat udp_param_arr[7].udp_param_value +#define udp_xmit_lowat udp_param_arr[8].udp_param_value +#define udp_recv_hiwat udp_param_arr[9].udp_param_value +#define udp_max_buf udp_param_arr[10].udp_param_value +#define udp_ndd_get_info_interval udp_param_arr[11].udp_param_value + +/* Kstats */ +typedef struct { /* Class "net" kstats */ + 
kstat_named_t udp_ip_send; + kstat_named_t udp_ip_ire_send; + kstat_named_t udp_ire_null; + kstat_named_t udp_drain; + kstat_named_t udp_sock_fallback; + kstat_named_t udp_rrw_busy; + kstat_named_t udp_rrw_msgcnt; + kstat_named_t udp_out_sw_cksum; + kstat_named_t udp_out_sw_cksum_bytes; + kstat_named_t udp_out_opt; + kstat_named_t udp_out_err_notconn; + kstat_named_t udp_out_err_output; + kstat_named_t udp_out_err_tudr; + kstat_named_t udp_in_pktinfo; + kstat_named_t udp_in_recvdstaddr; + kstat_named_t udp_in_recvopts; + kstat_named_t udp_in_recvif; + kstat_named_t udp_in_recvslla; + kstat_named_t udp_in_recvucred; + kstat_named_t udp_in_recvttl; + kstat_named_t udp_in_recvhopopts; + kstat_named_t udp_in_recvhoplimit; + kstat_named_t udp_in_recvdstopts; + kstat_named_t udp_in_recvrtdstopts; + kstat_named_t udp_in_recvrthdr; + kstat_named_t udp_in_recvpktinfo; + kstat_named_t udp_in_recvtclass; +#ifdef DEBUG + kstat_named_t udp_data_conn; + kstat_named_t udp_data_notconn; +#endif +} udp_stat_t; + +extern udp_stat_t udp_statistics; + +#define UDP_STAT(x) (udp_statistics.x.value.ui64++) +#define UDP_STAT_UPDATE(x, n) (udp_statistics.x.value.ui64 += (n)) +#ifdef DEBUG +#define UDP_DBGSTAT(x) UDP_STAT(x) +#else +#define UDP_DBGSTAT(x) +#endif /* DEBUG */ + +extern major_t UDP6_MAJ; + +extern int udp_opt_default(queue_t *, t_scalar_t, t_scalar_t, uchar_t *); +extern int udp_opt_get(queue_t *, t_scalar_t, t_scalar_t, uchar_t *); +extern int udp_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *, + uint_t *, uchar_t *, void *, cred_t *, mblk_t *); +extern int udp_snmp_get(queue_t *, mblk_t *); +extern int udp_snmp_set(queue_t *, t_scalar_t, t_scalar_t, uchar_t *, int); +extern void udp_close_free(conn_t *); +extern void udp_quiesce_conn(conn_t *); +extern void udp_ddi_init(void); +extern void udp_ddi_destroy(void); +extern void udp_resume_bind(conn_t *, mblk_t *); +extern void udp_conn_recv(conn_t *, mblk_t *); +extern boolean_t udp_compute_checksum(void); +extern void udp_wput_data(queue_t *, mblk_t *, struct sockaddr *, + socklen_t); + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/io/gld.c b/usr/src/uts/common/io/gld.c index 90ccf7952a..faa90fb792 100644 --- a/usr/src/uts/common/io/gld.c +++ b/usr/src/uts/common/io/gld.c @@ -3415,6 +3415,8 @@ gld_cap_ack(queue_t *q, mblk_t *mp) dlhp->hcksum_txflags |= HCKSUM_INET_PARTIAL; if (macinfo->gldm_capabilities & GLD_CAP_CKSUM_FULL_V4) dlhp->hcksum_txflags |= HCKSUM_INET_FULL_V4; + if (macinfo->gldm_capabilities & GLD_CAP_CKSUM_FULL_V6) + dlhp->hcksum_txflags |= HCKSUM_INET_FULL_V6; if (macinfo->gldm_capabilities & GLD_CAP_CKSUM_IPHDR) dlhp->hcksum_txflags |= HCKSUM_IPHDRCKSUM; diff --git a/usr/src/uts/common/io/stream.c b/usr/src/uts/common/io/stream.c index 93564f29f4..9baaebd365 100644 --- a/usr/src/uts/common/io/stream.c +++ b/usr/src/uts/common/io/stream.c @@ -1690,6 +1690,21 @@ getq(queue_t *q) } /* + * Calculate number of data bytes in a single data message block taking + * multidata messages into account. + */ + +#define ADD_MBLK_SIZE(mp, size) \ + if (DB_TYPE(mp) != M_MULTIDATA) { \ + (size) += MBLKL(mp); \ + } else { \ + uint_t pinuse; \ + \ + mmd_getsize(mmd_getmultidata(mp), NULL, &pinuse); \ + (size) += pinuse; \ + } + +/* * Like getq() but does not backenable. This is used by the stream * head when a putback() is likely. The caller must call qbackenable() * after it is done with accessing the queue. 
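The ADD_MBLK_SIZE macro above is needed because an M_MULTIDATA block keeps its payload in attached buffers, so the plain b_wptr - b_rptr arithmetic it replaces would undercount. A user-land model of the q_count byte-accounting walk (the multidata branch is stubbed with a stored size standing in for what mmd_getsize() reports):

    #include <stdio.h>
    #include <stddef.h>

    enum db_type { M_DATA, M_MULTIDATA };

    typedef struct mblk {
        struct mblk *b_cont;
        enum db_type db_type;
        size_t datalen;     /* models b_wptr - b_rptr */
        size_t mmd_pinuse;  /* models what mmd_getsize() would report */
    } mblk_t;

    /* Model of the ADD_MBLK_SIZE walk in getq_noenab()/putq()/putbq(). */
    static size_t
    msg_bytecnt(mblk_t *bp)
    {
        size_t bytecnt = 0;

        for (mblk_t *tmp = bp; tmp != NULL; tmp = tmp->b_cont) {
            if (tmp->db_type != M_MULTIDATA)
                bytecnt += tmp->datalen;
            else
                bytecnt += tmp->mmd_pinuse;
        }
        return (bytecnt);
    }

    int
    main(void)
    {
        mblk_t md = { NULL, M_MULTIDATA, 64, 1400 };
        mblk_t hdr = { &md, M_DATA, 20, 0 };

        printf("%zu bytes\n", msg_bytecnt(&hdr));   /* 20 + 1400 */
        return (0);
    }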
@@ -1721,7 +1736,7 @@ getq_noenab(queue_t *q) /* Get message byte count for q_count accounting */ for (tmp = bp; tmp; tmp = tmp->b_cont) { - bytecnt += (tmp->b_wptr - tmp->b_rptr); + ADD_MBLK_SIZE(tmp, bytecnt); mblkcnt++; } @@ -1941,7 +1956,7 @@ rmvq_noenab(queue_t *q, mblk_t *mp) /* Get the size of the message for q_count accounting */ for (tmp = mp; tmp; tmp = tmp->b_cont) { - bytecnt += (tmp->b_wptr - tmp->b_rptr); + ADD_MBLK_SIZE(tmp, bytecnt); mblkcnt++; } @@ -2433,9 +2448,10 @@ putq(queue_t *q, mblk_t *bp) /* Get message byte count for q_count accounting */ for (tmp = bp; tmp; tmp = tmp->b_cont) { - bytecnt += (tmp->b_wptr - tmp->b_rptr); + ADD_MBLK_SIZE(tmp, bytecnt); mblkcnt++; } + if (qbp) { qbp->qb_count += bytecnt; qbp->qb_mblkcnt += mblkcnt; @@ -2617,7 +2633,7 @@ putbq(queue_t *q, mblk_t *bp) /* Get message byte count for q_count accounting */ for (tmp = bp; tmp; tmp = tmp->b_cont) { - bytecnt += (tmp->b_wptr - tmp->b_rptr); + ADD_MBLK_SIZE(tmp, bytecnt); mblkcnt++; } if (qbp) { @@ -2748,7 +2764,7 @@ badord: /* Get mblk and byte count for q_count accounting */ for (tmp = mp; tmp; tmp = tmp->b_cont) { - bytecnt += (tmp->b_wptr - tmp->b_rptr); + ADD_MBLK_SIZE(tmp, bytecnt); mblkcnt++; } diff --git a/usr/src/uts/common/io/strsun.c b/usr/src/uts/common/io/strsun.c index 00b22e348f..87f0eeaa60 100644 --- a/usr/src/uts/common/io/strsun.c +++ b/usr/src/uts/common/io/strsun.c @@ -37,7 +37,9 @@ #include <sys/errno.h> #include <sys/stream.h> #include <sys/stropts.h> +#include <sys/strsubr.h> #include <sys/strsun.h> +#include <sys/sysmacros.h> #include <sys/cmn_err.h> void @@ -243,3 +245,63 @@ miocpullup(mblk_t *iocmp, size_t size) freemsg(datamp); return (0); } + +/* Copy userdata into a new mblk_t */ +mblk_t * +mcopyinuio(struct stdata *stp, uio_t *uiop, ssize_t iosize, + ssize_t maxblk, int *errorp) +{ + mblk_t *head = NULL, **tail = &head; + size_t offset = stp->sd_wroff; + + if (iosize == INFPSZ || iosize > uiop->uio_resid) + iosize = uiop->uio_resid; + + if (maxblk == INFPSZ) + maxblk = iosize; + + /* Nothing to do in these cases, so we're done */ + if (iosize < 0 || maxblk < 0 || (maxblk == 0 && iosize > 0)) + goto done; + + if (stp->sd_flag & STRCOPYCACHED) + uiop->uio_extflg |= UIO_COPY_CACHED; + + /* + * We will enter the loop below if iosize is 0; it will allocate an + * empty message block and call uiomove(9F) which will just return. + * We could avoid that with an extra check but would only slow + * down the much more likely case where iosize is larger than 0. 
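The loop below computes blocksize = MIN(iosize, maxblk) on each pass, so a 10000-byte write against a 4096-byte maxblk produces 4096-, 4096- and 1808-byte blocks, and iosize == 0 produces the single empty block the comment mentions. A sketch of just the chunk arithmetic, with the allocation and uiomove(9F) steps elided:

    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    /* Print the block sizes mcopyinuio()-style chunking would produce. */
    static void
    show_chunks(long iosize, long maxblk)
    {
        do {
            long blocksize = MIN(iosize, maxblk);

            printf("allocate %ld-byte block\n", blocksize);
            iosize -= blocksize;
        } while (iosize > 0);
    }

    int
    main(void)
    {
        show_chunks(10000, 4096);   /* 4096, 4096, 1808 */
        show_chunks(0, 4096);       /* one empty block, per the comment */
        return (0);
    }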
+ */ + do { + ssize_t blocksize; + mblk_t *mp; + + blocksize = MIN(iosize, maxblk); + ASSERT(blocksize >= 0); + if ((mp = allocb_cred(offset + blocksize, CRED())) == NULL) { + *errorp = ENOMEM; + return (head); + } + mp->b_rptr += offset; + mp->b_wptr = mp->b_rptr + blocksize; + DB_CPID(mp) = curproc->p_pid; + + *tail = mp; + tail = &mp->b_cont; + + /* uiomove(9F) either returns 0 or EFAULT */ + if ((*errorp = uiomove(mp->b_rptr, (size_t)blocksize, + UIO_WRITE, uiop)) != 0) { + ASSERT(*errorp != ENOMEM); + freemsg(head); + return (NULL); + } + + iosize -= blocksize; + } while (iosize > 0); + +done: + *errorp = 0; + return (head); +} diff --git a/usr/src/uts/common/os/streamio.c b/usr/src/uts/common/os/streamio.c index e28d9e2fe0..0b0ac98ca4 100644 --- a/usr/src/uts/common/os/streamio.c +++ b/usr/src/uts/common/os/streamio.c @@ -2642,11 +2642,18 @@ strput(struct stdata *stp, mblk_t *mctl, struct uio *uiop, ssize_t *iosize, int strwrite(struct vnode *vp, struct uio *uiop, cred_t *crp) { + return (strwrite_common(vp, uiop, crp, 0)); +} + +/* ARGSUSED2 */ +int +strwrite_common(struct vnode *vp, struct uio *uiop, cred_t *crp, int wflag) +{ struct stdata *stp; struct queue *wqp; ssize_t rmin, rmax; ssize_t iosize; - char waitflag; + int waitflag; int tempmode; int error = 0; int b_flag; @@ -2701,7 +2708,7 @@ strwrite(struct vnode *vp, struct uio *uiop, cred_t *crp) /* * Do until count satisfied or error. */ - waitflag = WRITEWAIT; + waitflag = WRITEWAIT | wflag; if (stp->sd_flag & OLDNDELAY) tempmode = uiop->uio_fmode & ~FNDELAY; else @@ -2803,79 +2810,6 @@ out: } /* - * kstrwritemp() has very similar semantics as that of strwrite(). - * The main difference is it obtains mblks from the caller and also - * does not do any copy as done in strwrite() from user buffers to - * kernel buffers. - * - * - * Currently, this routine is used by sendfile to send data allocated - * within the kernel without any copying. This interface does not use the - * synchronous stream interface as synch. stream interface implies - * copying. - */ -int -kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode) -{ - struct stdata *stp; - struct queue *wqp; - char waitflag; - int tempmode; - int error; - int done = 0; - - ASSERT(vp->v_stream); - stp = vp->v_stream; - - if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) { - mutex_enter(&stp->sd_lock); - error = strwriteable(stp, B_FALSE, B_TRUE); - mutex_exit(&stp->sd_lock); - if (error != 0) - return (error); - } - - /* - * First, check for flow control without grabbing the sd_lock. - * If we would block, re-check with the lock. This is similar - * to the logic used by strwrite(). - */ - wqp = stp->sd_wrq; - if (canputnext(wqp)) { - putnext(wqp, mp); - return (0); - } - - waitflag = WRITEWAIT; - if (stp->sd_flag & OLDNDELAY) - tempmode = fmode & ~FNDELAY; - else - tempmode = fmode; - - mutex_enter(&stp->sd_lock); - do { - if (canputnext(wqp)) { - mutex_exit(&stp->sd_lock); - putnext(wqp, mp); - return (0); - } - error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1, - &done); - } while (error == 0 && !done); - - mutex_exit(&stp->sd_lock); - /* - * EAGAIN tells the application to try again. ENOMEM - * is returned only if the memory allocation size - * exceeds the physical limits of the system. ENOMEM - * can't be true here. - */ - if (error == ENOMEM) - error = EAGAIN; - return (error); -} - -/* * Stream head write service routine. * Its job is to wake up any sleeping writers when a queue * downstream needs data (part of the flow control in putq and getq). 
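The kstrwritemp() body removed here (it reappears under sendfile.c further down) is a textbook STREAMS write-side pattern: an optimistic canputnext() check with no lock held, then, only on failure, a locked wait loop around strwaitq(). A pthread-based model of the same shape; canput(), q_room and the condition variable are stand-ins, not STREAMS interfaces:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdbool.h>

    static pthread_mutex_t sd_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t backenable = PTHREAD_COND_INITIALIZER;
    static int q_room = 0;      /* models downstream queue space */

    static bool canput(void) { return (q_room > 0); }

    /* Model of the kstrwritemp() fast path plus wait loop. */
    static int
    write_one(void)
    {
        if (canput()) {         /* optimistic check, no lock held */
            q_room--;           /* "putnext" */
            return (0);
        }
        pthread_mutex_lock(&sd_lock);
        while (!canput())       /* strwaitq() analogue */
            pthread_cond_wait(&backenable, &sd_lock);
        q_room--;
        pthread_mutex_unlock(&sd_lock);
        return (0);
    }

    int
    main(void)
    {
        q_room = 1;
        (void) write_one();     /* takes the fast path */
        printf("room left: %d\n", q_room);
        return (0);
    }

The unlocked first check is acceptable because canputnext() only needs to be conservative; the slow path re-verifies under sd_lock before sleeping, which is exactly the shape of the removed (and relocated) code.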
diff --git a/usr/src/uts/common/os/strsubr.c b/usr/src/uts/common/os/strsubr.c index 66184c5206..16dad7e4bb 100644 --- a/usr/src/uts/common/os/strsubr.c +++ b/usr/src/uts/common/os/strsubr.c @@ -2437,6 +2437,18 @@ devflg_to_qflag(struct streamtab *stp, uint32_t devflag, uint32_t *qflagp, if (devflag & D_SYNCSTR) qflag |= QSYNCSTR; + /* + * Private flag used by a transport module to indicate + * to sockfs that it supports direct-access mode without + * having to go through STREAMS. + */ + if (devflag & _D_DIRECT) { + /* Reject unless the module is fully-MT (no perimeter) */ + if ((qflag & QMT_TYPEMASK) != QMTSAFE) + goto bad; + qflag |= _QDIRECT; + } + *qflagp = qflag; *sqtypep = sqtype; return (0); @@ -8236,11 +8248,11 @@ hcksum_assoc(mblk_t *mp, multidata_t *mmd, pdesc_t *pd, ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_MULTIDATA); if (mp->b_datap->db_type == M_DATA) { /* Associate values for M_DATA type */ - mp->b_datap->db_cksumstart = (intptr_t)start; - mp->b_datap->db_cksumstuff = (intptr_t)stuff; - mp->b_datap->db_cksumend = (intptr_t)end; - mp->b_datap->db_struioun.cksum.flags = flags; - mp->b_datap->db_cksum16 = (uint16_t)value; + DB_CKSUMSTART(mp) = (intptr_t)start; + DB_CKSUMSTUFF(mp) = (intptr_t)stuff; + DB_CKSUMEND(mp) = (intptr_t)end; + DB_CKSUMFLAGS(mp) = flags; + DB_CKSUM16(mp) = (uint16_t)value; } else { pattrinfo_t pa_info; @@ -8258,6 +8270,8 @@ hcksum_assoc(mblk_t *mp, multidata_t *mmd, pdesc_t *pd, hck->hcksum_end_offset = end; hck->hcksum_cksum_val.inet_cksum = (uint16_t)value; hck->hcksum_flags = flags; + } else { + rc = -1; } } return (rc); @@ -8271,20 +8285,16 @@ hcksum_retrieve(mblk_t *mp, multidata_t *mmd, pdesc_t *pd, ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_MULTIDATA); if (mp->b_datap->db_type == M_DATA) { if (flags != NULL) { - *flags = mp->b_datap->db_struioun.cksum.flags; + *flags = DB_CKSUMFLAGS(mp); if (*flags & HCK_PARTIALCKSUM) { if (start != NULL) - *start = (uint32_t) - mp->b_datap->db_cksumstart; + *start = (uint32_t)DB_CKSUMSTART(mp); if (stuff != NULL) - *stuff = (uint32_t) - mp->b_datap->db_cksumstuff; + *stuff = (uint32_t)DB_CKSUMSTUFF(mp); if (end != NULL) - *end = - (uint32_t)mp->b_datap->db_cksumend; + *end = (uint32_t)DB_CKSUMEND(mp); if (value != NULL) - *value = - (uint32_t)mp->b_datap->db_cksum16; + *value = (uint32_t)DB_CKSUM16(mp); } } } else { diff --git a/usr/src/uts/common/sys/conf.h b/usr/src/uts/common/sys/conf.h index 2f19697c81..305c40e236 100644 --- a/usr/src/uts/common/sys/conf.h +++ b/usr/src/uts/common/sys/conf.h @@ -24,7 +24,7 @@ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -219,6 +219,8 @@ extern int cdev_prop_op(dev_t, dev_info_t *, ddi_prop_op_t, #define D_U64BIT 0x40000 /* Driver supports unsigned 64-bit uio offset */ +#define _D_DIRECT 0x80000 /* Private flag for transport modules */ + #endif /* !defined(_XPG4_2) || defined(__EXTENSIONS__) */ #ifdef __cplusplus diff --git a/usr/src/uts/common/sys/dlpi.h b/usr/src/uts/common/sys/dlpi.h index c35f9dc27d..1169d68d68 100644 --- a/usr/src/uts/common/sys/dlpi.h +++ b/usr/src/uts/common/sys/dlpi.h @@ -689,6 +689,8 @@ typedef struct { /* ability */ #define HCKSUM_INET_FULL_V4 0x04 /* Full 1's complement checksum */ /* ability for IPv4 packets. */ +#define HCKSUM_INET_FULL_V6 0x08 /* Full 1's complement checksum */ + /* ability for IPv6 packets. 
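Within the strsubr.c hunks above, the hcksum_assoc()/hcksum_retrieve() rewrite swaps direct db_* field pokes for the DB_CKSUM*() accessor macros added to strsubr.h later in this patch, and retrieval only reports the offsets when HCK_PARTIALCKSUM is set. A compact model of that flag-gated retrieval (the flag value and struct layout are illustrative, not the kernel's dblk_t):

    #include <stdio.h>
    #include <stdint.h>

    #define HCK_PARTIALCKSUM 0x1    /* illustrative value */

    typedef struct {
        uint32_t cksumstart, cksumstuff, cksumend;
        uint16_t cksum16;
        uint16_t flags;
    } cksum_info_t;

    /* Model of hcksum_retrieve() for the M_DATA case. */
    static void
    cksum_retrieve(const cksum_info_t *db, uint32_t *flags,
        uint32_t *start, uint32_t *stuff, uint32_t *end, uint32_t *value)
    {
        if (flags != NULL) {
            *flags = db->flags;
            if (*flags & HCK_PARTIALCKSUM) {
                if (start != NULL)
                    *start = db->cksumstart;
                if (stuff != NULL)
                    *stuff = db->cksumstuff;
                if (end != NULL)
                    *end = db->cksumend;
                if (value != NULL)
                    *value = db->cksum16;
            }
        }
    }

    int
    main(void)
    {
        cksum_info_t db = { 14, 34, 54, 0xbeef, HCK_PARTIALCKSUM };
        uint32_t f, s, st, e, v;

        cksum_retrieve(&db, &f, &s, &st, &e, &v);
        printf("flags %x start %u stuff %u end %u val %x\n", f, s, st, e, v);
        return (0);
    }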
*/ #define HCKSUM_IPHDRCKSUM 0x10 /* IPv4 Header checksum offload */ /* capability */ #ifdef _KERNEL diff --git a/usr/src/uts/common/sys/gld.h b/usr/src/uts/common/sys/gld.h index ed24a8deae..e42bb62f28 100644 --- a/usr/src/uts/common/sys/gld.h +++ b/usr/src/uts/common/sys/gld.h @@ -20,7 +20,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -240,9 +240,12 @@ typedef struct gld_mac_info { #define GLD_CAP_LINKSTATE 0x00000001 /* will call gld_linkstate() */ #define GLD_CAP_CKSUM_IPHDR 0x00000008 /* IP checksum offload */ #define GLD_CAP_CKSUM_PARTIAL 0x00000010 /* TCP/UDP partial */ -#define GLD_CAP_CKSUM_FULL_V4 0x00000020 /* TCP/UDP full */ -#define GLD_CAP_CKSUM_ANY 0x00000038 /* any or all of the above */ +#define GLD_CAP_CKSUM_FULL_V4 0x00000020 /* TCP/UDP full for IPv4 */ #define GLD_CAP_ZEROCOPY 0x00000040 /* zerocopy */ +#define GLD_CAP_CKSUM_FULL_V6 0x00000080 /* TCP/UDP full for IPv6 */ +#define GLD_CAP_CKSUM_ANY \ + (GLD_CAP_CKSUM_IPHDR|GLD_CAP_CKSUM_PARTIAL| \ + GLD_CAP_CKSUM_FULL_V4|GLD_CAP_CKSUM_FULL_V6) /* values of gldm_linkstate, as passed to gld_linkstate() */ #define GLD_LINKSTATE_DOWN -1 diff --git a/usr/src/uts/common/sys/multidata.h b/usr/src/uts/common/sys/multidata.h index 60ce570fbf..f649b187bc 100644 --- a/usr/src/uts/common/sys/multidata.h +++ b/usr/src/uts/common/sys/multidata.h @@ -20,7 +20,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -68,19 +68,24 @@ typedef struct mbufinfo_s { /* * Multidata packet descriptor information. */ -typedef struct pdescinfo_s { - uint_t flags; /* misc. flags */ - uchar_t *hdr_base; /* start address of header area */ - uchar_t *hdr_rptr; /* start address of header data */ - uchar_t *hdr_wptr; /* end address of header data */ - uchar_t *hdr_lim; /* end address of header area */ - uint_t pld_cnt; /* number of payload area */ - struct pld_ary_s { - int pld_pbuf_idx; /* payload buffer index */ - uchar_t *pld_rptr; /* start address of payload data */ - uchar_t *pld_wptr; /* pointer to end of payload data */ - } pld_ary[MULTIDATA_MAX_PBUFS]; -} pdescinfo_t; +struct pld_ary_s { + int pld_pbuf_idx; /* payload buffer index */ + uchar_t *pld_rptr; /* start address of payload data */ + uchar_t *pld_wptr; /* pointer to end of payload data */ +}; + +#define PDESCINFO_STRUCT(elems) \ +{ \ + uint_t flags; /* misc. flags */ \ + uchar_t *hdr_base; /* start address of header area */ \ + uchar_t *hdr_rptr; /* start address of header data */ \ + uchar_t *hdr_wptr; /* end address of header data */ \ + uchar_t *hdr_lim; /* end address of header area */ \ + uint_t pld_cnt; /* number of payload area */ \ + struct pld_ary_s pld_ary[(elems)]; \ +} + +typedef struct pdescinfo_s PDESCINFO_STRUCT(MULTIDATA_MAX_PBUFS) pdescinfo_t; /* * Possible values for flags diff --git a/usr/src/uts/common/sys/multidata_impl.h b/usr/src/uts/common/sys/multidata_impl.h index 92df853beb..05589c6f03 100644 --- a/usr/src/uts/common/sys/multidata_impl.h +++ b/usr/src/uts/common/sys/multidata_impl.h @@ -20,7 +20,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -191,21 +191,6 @@ struct multidata_s { uint_t mmd_pbuf_ref; /* descriptors referring to payload buffer(s) */ }; -/* - * Smaller and private version of pdescinfo_t used specifically for tcp, - * which allows for only two payload spans per packet. Any changes made - * to the pdescinfo_t structure must be reflected here as well. - */ -typedef struct tcp_pdescinfo_s { - uint_t flags; /* misc. flags */ - uchar_t *hdr_base; /* start address of header area */ - uchar_t *hdr_rptr; /* start address of header data */ - uchar_t *hdr_wptr; /* end address of header data */ - uchar_t *hdr_lim; /* end address of header area */ - uint_t pld_cnt; /* number of payload area */ - struct pld_ary_s pld_ary[2]; -} tcp_pdescinfo_t; - #ifdef _KERNEL extern void mmd_init(void); diff --git a/usr/src/uts/common/sys/socketvar.h b/usr/src/uts/common/sys/socketvar.h index fc39185768..7bcb924d7d 100644 --- a/usr/src/uts/common/sys/socketvar.h +++ b/usr/src/uts/common/sys/socketvar.h @@ -100,6 +100,7 @@ struct sockaddr_ux { }; typedef struct sonodeops sonodeops_t; +typedef struct sonode sonode_t; /* * The sonode represents a socket. A sonode never exist in the file system @@ -364,7 +365,7 @@ struct sonode { #define SS_DONEREAD 0x00080000 /* NCAfs: all data read */ #define SS_MOREDATA 0x00100000 /* NCAfs: NCA has more data */ -#define SS_TCP_FAST_ACCEPT 0x00200000 /* Use TCP's accept fast-path */ +#define SS_DIRECT 0x00200000 /* transport is directly below */ #define SS_LADDR_VALID 0x01000000 /* so_laddr valid for user */ #define SS_FADDR_VALID 0x02000000 /* so_faddr valid for user */ @@ -769,8 +770,10 @@ extern void so_drain_discon_ind(struct sonode *); extern void so_flush_discon_ind(struct sonode *); extern int sowaitconnected(struct sonode *, int, int); +extern int sostream_direct(struct sonode *, struct uio *, + mblk_t *, cred_t *); extern int sosend_dgram(struct sonode *, struct sockaddr *, - socklen_t, struct uio *, int); + socklen_t, struct uio *, int); extern int sosend_svc(struct sonode *, struct uio *, t_scalar_t, int, int); extern void so_installhooks(struct sonode *); extern int so_strinit(struct sonode *, struct sonode *); diff --git a/usr/src/uts/common/sys/sockio.h b/usr/src/uts/common/sys/sockio.h index 0c15e6deee..c9e57359ef 100644 --- a/usr/src/uts/common/sys/sockio.h +++ b/usr/src/uts/common/sys/sockio.h @@ -265,9 +265,9 @@ extern "C" { #define SIOCDXARP _IOW('i', 168, struct xarpreq) /* delete ARP entry */ /* - * IOCTL to indicate to the transport that the sockmod is being popped + * IOCTL private to sockfs. */ -#define SIOCPOPSOCKFS _IOW('i', 169, 0) +#define _SIOCSOCKFALLBACK _IOW('i', 169, 0) /* * IOCTLs for getting and setting zone associated with an interface, and diff --git a/usr/src/uts/common/sys/stream.h b/usr/src/uts/common/sys/stream.h index e17ce9388f..3c7b9e685c 100644 --- a/usr/src/uts/common/sys/stream.h +++ b/usr/src/uts/common/sys/stream.h @@ -171,6 +171,8 @@ typedef struct queue { #define _QINSERTING 0x04000000 /* Private, module is being inserted */ #define _QREMOVING 0x08000000 /* Private, module is being removed */ #define _QASSOCIATED 0x10000000 /* queue is associated with a device */ +#define _QDIRECT 0x20000000 /* Private; transport module uses */ + /* direct interface to/from sockfs */ /* queue sqflags (protected by SQLOCK). 
*/ #define Q_SQQUEUED 0x01 /* Queue is in the syncq list */ diff --git a/usr/src/uts/common/sys/strsubr.h b/usr/src/uts/common/sys/strsubr.h index 4151204cd3..f907db2c06 100644 --- a/usr/src/uts/common/sys/strsubr.h +++ b/usr/src/uts/common/sys/strsubr.h @@ -1096,6 +1096,8 @@ extern int strpoll(register struct stdata *, short, int, short *, extern void strclean(struct vnode *); extern void str_cn_clean(); /* XXX hook for consoles signal cleanup */ extern int strwrite(struct vnode *, struct uio *, cred_t *); +extern int strwrite_common(struct vnode *, struct uio *, cred_t *, int); +extern int kstrwritemp(struct vnode *, mblk_t *, ushort_t); extern int strread(struct vnode *, struct uio *, cred_t *); extern int strioctl(struct vnode *, int, intptr_t, int, int, cred_t *, int *); extern int strrput(queue_t *, mblk_t *); @@ -1180,6 +1182,7 @@ extern mblk_t *allocb_wait(size_t, uint_t, uint_t, int *); extern mblk_t *allocb_cred(size_t, cred_t *); extern mblk_t *allocb_cred_wait(size_t, uint_t, int *, cred_t *); extern mblk_t *allocb_tmpl(size_t, const mblk_t *); +extern mblk_t *allocb_tryhard(size_t); extern void mblk_setcred(mblk_t *, cred_t *); extern void strpollwakeup(vnode_t *, short); extern int putnextctl_wait(queue_t *, int); @@ -1188,7 +1191,6 @@ extern int kstrputmsg(struct vnode *, mblk_t *, struct uio *, ssize_t, unsigned char, int, int); extern int kstrgetmsg(struct vnode *, mblk_t **, struct uio *, unsigned char *, int *, clock_t, rval_t *); -extern int kstrwritemp(struct vnode *, mblk_t *, ushort_t); extern void strsetrerror(vnode_t *, int, int, errfunc_t); extern void strsetwerror(vnode_t *, int, int, errfunc_t); @@ -1217,6 +1219,8 @@ extern void fmodsw_rele(fmodsw_impl_t *); extern void freemsgchain(mblk_t *); extern mblk_t *copymsgchain(mblk_t *); +extern mblk_t *mcopyinuio(struct stdata *, uio_t *, ssize_t, ssize_t, int *); + /* * shared or externally configured data structures */ @@ -1263,6 +1267,19 @@ extern struct queue *RD(queue_t *); extern struct queue *WR(queue_t *); extern int SAMESTR(queue_t *); +/* + * The following hardware checksum related macros are private + * interfaces that are subject to change without notice. + */ +#ifdef _KERNEL +#define DB_CKSUMSTART(mp) ((mp)->b_datap->db_cksumstart) +#define DB_CKSUMEND(mp) ((mp)->b_datap->db_cksumend) +#define DB_CKSUMSTUFF(mp) ((mp)->b_datap->db_cksumstuff) +#define DB_CKSUMFLAGS(mp) ((mp)->b_datap->db_struioun.cksum.flags) +#define DB_CKSUM16(mp) ((mp)->b_datap->db_cksum16) +#define DB_CKSUM32(mp) ((mp)->b_datap->db_cksum32) +#endif /* _KERNEL */ + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/syscall/sendfile.c b/usr/src/uts/common/syscall/sendfile.c index 2a3e29a859..04bbd99f65 100644 --- a/usr/src/uts/common/syscall/sendfile.c +++ b/usr/src/uts/common/syscall/sendfile.c @@ -73,6 +73,89 @@ extern int sosendfile64(file_t *, file_t *, const struct ksendfilevec64 *, extern void nl7c_sendfilev(struct sonode *, u_offset_t, struct sendfilevec *, int); +/* + * kstrwritemp() has very similar semantics as that of strwrite(). + * The main difference is it obtains mblks from the caller and also + * does not do any copy as done in strwrite() from user buffers to + * kernel buffers. + * + * Currently, this routine is used by sendfile to send data allocated + * within the kernel without any copying. This interface does not use the + * synchronous stream interface as synch. stream interface implies + * copying. 
+ */ +int +kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode) +{ + struct stdata *stp; + struct queue *wqp; + char waitflag; + int tempmode; + int error = 0; + int done = 0; + struct sonode *so; + boolean_t direct; + + ASSERT(vp->v_stream); + stp = vp->v_stream; + + so = VTOSO(vp); + direct = (so->so_state & SS_DIRECT); + + /* + * This is the sockfs direct fast path. canputnext() need + * not be accurate so we don't grab the sd_lock here. If + * we get flow-controlled, we grab sd_lock just before the + * do..while loop below to emulate what strwrite() does. + */ + wqp = stp->sd_wrq; + if (canputnext(wqp) && direct && + !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) { + return (sostream_direct(so, NULL, mp, CRED())); + } else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) { + /* Fast check of flags before acquiring the lock */ + mutex_enter(&stp->sd_lock); + error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0); + mutex_exit(&stp->sd_lock); + if (error != 0) { + if (!(stp->sd_flag & STPLEX) && + (stp->sd_wput_opt & SW_SIGPIPE)) { + tsignal(curthread, SIGPIPE); + error = EPIPE; + } + return (error); + } + } + + waitflag = WRITEWAIT; + if (stp->sd_flag & OLDNDELAY) + tempmode = fmode & ~FNDELAY; + else + tempmode = fmode; + + mutex_enter(&stp->sd_lock); + do { + if (canputnext(wqp)) { + mutex_exit(&stp->sd_lock); + putnext(wqp, mp); + return (0); + } + error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1, + &done); + } while (error == 0 && !done); + + mutex_exit(&stp->sd_lock); + /* + * EAGAIN tells the application to try again. ENOMEM + * is returned only if the memory allocation size + * exceeds the physical limits of the system. ENOMEM + * can't be true here. + */ + if (error == ENOMEM) + error = EAGAIN; + return (error); +} + #define SEND_MAX_CHUNK 16 #if defined(_SYSCALL32_IMPL) || defined(_ILP32) @@ -1045,7 +1128,7 @@ sendfilev(int opcode, int fildes, const struct sendfilevec *vec, int sfvcnt, goto err; } - if ((so->so_state & SS_TCP_FAST_ACCEPT) && + if ((so->so_state & SS_DIRECT) && (so->so_priv != NULL)) { maxblk = ((tcp_t *)so->so_priv)->tcp_mss; } else { diff --git a/usr/src/uts/intel/ia32/ml/modstubs.s b/usr/src/uts/intel/ia32/ml/modstubs.s index e0b0a92ee9..658dc6dd89 100644 --- a/usr/src/uts/intel/ia32/ml/modstubs.s +++ b/usr/src/uts/intel/ia32/ml/modstubs.s @@ -482,6 +482,7 @@ fcnname/**/_info: \ NO_UNLOAD_STUB(sockfs, sosendfile64, nomod_zero); NO_UNLOAD_STUB(sockfs, sock_getfasync, nomod_zero); NO_UNLOAD_STUB(sockfs, nl7c_sendfilev, nomod_zero); + NO_UNLOAD_STUB(sockfs, sostream_direct, nomod_zero); END_MODULE(sockfs); #endif @@ -529,12 +530,6 @@ fcnname/**/_info: \ END_MODULE(spdsock); #endif -#ifndef UDP_MODULE - MODULE(udp,drv); - WSTUB(udp, udp_compute_checksum, nomod_zero); - END_MODULE(udp); -#endif - #ifndef NATTYMOD_MODULE MODULE(nattymod, strmod); WSTUB(nattymod, nattymod_clean_ipif, nomod_zero); diff --git a/usr/src/uts/sparc/ml/modstubs.s b/usr/src/uts/sparc/ml/modstubs.s index 9594335f33..599658a635 100644 --- a/usr/src/uts/sparc/ml/modstubs.s +++ b/usr/src/uts/sparc/ml/modstubs.s @@ -368,6 +368,7 @@ stubs_base: NO_UNLOAD_STUB(sockfs, sosendfile64, nomod_zero); NO_UNLOAD_STUB(sockfs, sock_getfasync, nomod_zero); NO_UNLOAD_STUB(sockfs, nl7c_sendfilev, nomod_zero); + NO_UNLOAD_STUB(sockfs, sostream_direct, nomod_zero); END_MODULE(sockfs); #endif @@ -415,12 +416,6 @@ stubs_base: END_MODULE(spdsock); #endif -#ifndef UDP_MODULE - MODULE(udp,drv); - WSTUB(udp, udp_compute_checksum, nomod_zero); - END_MODULE(udp); -#endif - #ifndef NATTYMOD_MODULE MODULE(nattymod, 
strmod); WSTUB(nattymod, nattymod_clean_ipif, nomod_zero);
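A closing note on the multidata.h change earlier in this patch: PDESCINFO_STRUCT(elems) stamps out layout-compatible descriptor types that differ only in the length of the trailing pld_ary[], which is what allowed the hand-copied tcp_pdescinfo_t in multidata_impl.h to be deleted. A stand-alone illustration of the pattern (field list abbreviated):

    #include <stdio.h>

    struct pld_ary_s {
        int pld_pbuf_idx;
        unsigned char *pld_rptr, *pld_wptr;
    };

    /* Same trick as PDESCINFO_STRUCT: one macro, many array lengths. */
    #define PDESCINFO_STRUCT(elems) \
    { \
        unsigned int flags; \
        unsigned int pld_cnt; \
        struct pld_ary_s pld_ary[(elems)]; \
    }

    typedef struct pdescinfo_s PDESCINFO_STRUCT(16) pdescinfo_t;
    typedef struct tcp_pdescinfo_s PDESCINFO_STRUCT(2) tcp_pdescinfo_t;

    int
    main(void)
    {
        printf("full: %zu bytes, tcp: %zu bytes\n",
            sizeof (pdescinfo_t), sizeof (tcp_pdescinfo_t));
        return (0);
    }

Because the leading members are identical, code that touches only flags, pld_cnt and the first pld_ary entries can treat either variant uniformly; the deleted comment in multidata_impl.h warned about exactly the structure drift this macro now rules out.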
